diff --git a/.dockerignore b/.dockerignore
index 1c6bc1e124015..2c6db205037d4 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -17,6 +17,7 @@
 .git
 docker_cache
+docs/_build

 # IDE
 .vscode
@@ -49,7 +50,6 @@ python/dist
 python/*.egg-info
 python/*.egg
 python/*.pyc
-python/doc/_build
 __pycache__/
 */__pycache__/
 */*/__pycache__/
diff --git a/.gitignore b/.gitignore
index 79a2a8e13d424..5817efdcac091 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,9 @@
 # specific language governing permissions and limitations
 # under the License.

+apache-rat-*.jar
+arrow-src.tar
+
 # Compiled source
 *.a
 *.dll
@@ -34,7 +37,9 @@ MANIFEST
 *.iml
 cpp/.idea/
+cpp/apidoc/xml/
 python/.eggs/
+python/doc/
 .vscode
 .idea/
 .pytest_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3066c5ed4e92b..4e0c7b265311a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,6 +21,14 @@
 # To run all hooks on all files use `pre-commit run -a`

 repos:
+  - repo: local
+    hooks:
+      - id: rat
+        name: rat
+        language: system
+        entry: bash -c "git archive HEAD --prefix=apache-arrow/ --output=arrow-src.tar && ./dev/release/run-rat.sh arrow-src.tar"
+        always_run: true
+        pass_filenames: false
   - repo: git://github.com/pre-commit/pre-commit-hooks
     sha: v1.2.3
     hooks:
diff --git a/c_glib/.gitignore b/c_glib/.gitignore
index cc7a19348af0c..18f952e0b3727 100644
--- a/c_glib/.gitignore
+++ b/c_glib/.gitignore
@@ -51,12 +51,12 @@ Makefile.in
 /libtool
 /m4/
 /stamp-h1
+/arrow-cuda-glib/*.pc
 /arrow-glib/enums.c
 /arrow-glib/enums.h
 /arrow-glib/stamp-*
 /arrow-glib/version.h
 /arrow-glib/*.pc
-/arrow-gpu-glib/*.pc
 /gandiva-glib/*.pc
 /parquet-glib/*.pc
 /plasma-glib/*.pc
diff --git a/c_glib/Makefile.am b/c_glib/Makefile.am
index d21555e12bb2f..149894c8241c2 100644
--- a/c_glib/Makefile.am
+++ b/c_glib/Makefile.am
@@ -19,7 +19,7 @@ ACLOCAL_AMFLAGS = -I m4 ${ACLOCAL_FLAGS}

 SUBDIRS = \
 	arrow-glib \
-	arrow-gpu-glib \
+	arrow-cuda-glib \
 	gandiva-glib \
 	parquet-glib \
 	plasma-glib \
diff --git a/c_glib/arrow-gpu-glib/Makefile.am b/c_glib/arrow-cuda-glib/Makefile.am
similarity index 64%
rename from c_glib/arrow-gpu-glib/Makefile.am
rename to c_glib/arrow-cuda-glib/Makefile.am
index a1249035a5a70..2e3848d2a0e2c 100644
--- a/c_glib/arrow-gpu-glib/Makefile.am
+++ b/c_glib/arrow-cuda-glib/Makefile.am
@@ -24,51 +24,51 @@ AM_CPPFLAGS = \
 	-I$(top_builddir) \
 	-I$(top_srcdir)

-if HAVE_ARROW_GPU
+if HAVE_ARROW_CUDA
 lib_LTLIBRARIES = \
-	libarrow-gpu-glib.la
+	libarrow-cuda-glib.la

-libarrow_gpu_glib_la_CXXFLAGS = \
+libarrow_cuda_glib_la_CXXFLAGS = \
 	$(GLIB_CFLAGS) \
 	$(ARROW_CFLAGS) \
-	$(ARROW_GPU_CFLAGS) \
+	$(ARROW_CUDA_CFLAGS) \
 	$(GARROW_CXXFLAGS)

-libarrow_gpu_glib_la_LDFLAGS = \
+libarrow_cuda_glib_la_LDFLAGS = \
 	-version-info $(LT_VERSION_INFO) \
 	-no-undefined

-libarrow_gpu_glib_la_LIBADD = \
+libarrow_cuda_glib_la_LIBADD = \
 	$(GLIB_LIBS) \
 	$(ARROW_LIBS) \
-	$(ARROW_GPU_LIBS) \
+	$(ARROW_CUDA_LIBS) \
 	../arrow-glib/libarrow-glib.la

-libarrow_gpu_glib_la_headers = \
-	arrow-gpu-glib.h \
+libarrow_cuda_glib_la_headers = \
+	arrow-cuda-glib.h \
 	cuda.h

-libarrow_gpu_glib_la_sources = \
+libarrow_cuda_glib_la_sources = \
 	cuda.cpp \
-	$(libarrow_gpu_glib_la_headers)
+	$(libarrow_cuda_glib_la_headers)

-libarrow_gpu_glib_la_cpp_headers = \
-	arrow-gpu-glib.hpp \
+libarrow_cuda_glib_la_cpp_headers = \
+	arrow-cuda-glib.hpp \
 	cuda.hpp

-libarrow_gpu_glib_la_SOURCES = \
-	$(libarrow_gpu_glib_la_sources) \
-	$(libarrow_gpu_glib_la_cpp_headers)
+libarrow_cuda_glib_la_SOURCES = \
+	$(libarrow_cuda_glib_la_sources) \
+	$(libarrow_cuda_glib_la_cpp_headers)

-arrow_gpu_glib_includedir = \
-	$(includedir)/arrow-gpu-glib
-arrow_gpu_glib_include_HEADERS = \
-	$(libarrow_gpu_glib_la_headers) \
-	$(libarrow_gpu_glib_la_cpp_headers)
+arrow_cuda_glib_includedir = \
+	$(includedir)/arrow-cuda-glib
+arrow_cuda_glib_include_HEADERS = \
+	$(libarrow_cuda_glib_la_headers) \
+	$(libarrow_cuda_glib_la_cpp_headers)

 pkgconfigdir = $(libdir)/pkgconfig
 pkgconfig_DATA = \
-	arrow-gpu-glib.pc
+	arrow-cuda-glib.pc

 if HAVE_INTROSPECTION
 -include $(INTROSPECTION_MAKEFILE)
@@ -85,39 +85,39 @@ endif
 INTROSPECTION_COMPILER_ARGS = \
 	--includedir=$(abs_builddir)/../arrow-glib

-ArrowGPU-1.0.gir: libarrow-gpu-glib.la
-ArrowGPU_1_0_gir_PACKAGES = \
+ArrowCUDA-1.0.gir: libarrow-cuda-glib.la
+ArrowCUDA_1_0_gir_PACKAGES = \
 	arrow-glib
-ArrowGPU_1_0_gir_EXPORT_PACKAGES = \
-	arrow-gpu-glib
-ArrowGPU_1_0_gir_INCLUDES = \
+ArrowCUDA_1_0_gir_EXPORT_PACKAGES = \
+	arrow-cuda-glib
+ArrowCUDA_1_0_gir_INCLUDES = \
 	Arrow-1.0
-ArrowGPU_1_0_gir_CFLAGS = \
+ArrowCUDA_1_0_gir_CFLAGS = \
 	$(AM_CPPFLAGS)
-ArrowGPU_1_0_gir_LIBS =
-ArrowGPU_1_0_gir_FILES = \
-	$(libarrow_gpu_glib_la_sources)
-ArrowGPU_1_0_gir_SCANNERFLAGS = \
+ArrowCUDA_1_0_gir_LIBS =
+ArrowCUDA_1_0_gir_FILES = \
+	$(libarrow_cuda_glib_la_sources)
+ArrowCUDA_1_0_gir_SCANNERFLAGS = \
 	--library-path=$(ARROW_LIB_DIR) \
 	--warn-all \
 	--add-include-path=$(abs_builddir)/../arrow-glib \
-	--identifier-prefix=GArrowGPU \
-	--symbol-prefix=garrow_gpu
+	--identifier-prefix=GArrowCUDA \
+	--symbol-prefix=garrow_cuda

 if OS_MACOS
-ArrowGPU_1_0_gir_LIBS += \
+ArrowCUDA_1_0_gir_LIBS += \
 	arrow-glib \
-	arrow-gpu-glib
-ArrowGPU_1_0_gir_SCANNERFLAGS += \
+	arrow-cuda-glib
+ArrowCUDA_1_0_gir_SCANNERFLAGS += \
 	--no-libtool \
 	--library-path=$(abs_builddir)/../arrow-glib/.libs \
 	--library-path=$(abs_builddir)/.libs
 else
-ArrowGPU_1_0_gir_LIBS += \
+ArrowCUDA_1_0_gir_LIBS += \
 	$(abs_builddir)/../arrow-glib/libarrow-glib.la \
-	libarrow-gpu-glib.la
+	libarrow-cuda-glib.la
 endif

-INTROSPECTION_GIRS += ArrowGPU-1.0.gir
+INTROSPECTION_GIRS += ArrowCUDA-1.0.gir

 girdir = $(datadir)/gir-1.0
 gir_DATA = $(INTROSPECTION_GIRS)
diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.h b/c_glib/arrow-cuda-glib/arrow-cuda-glib.h
similarity index 96%
rename from c_glib/arrow-gpu-glib/arrow-gpu-glib.h
rename to c_glib/arrow-cuda-glib/arrow-cuda-glib.h
index 1538c9a1865ac..b3c7f21087669 100644
--- a/c_glib/arrow-gpu-glib/arrow-gpu-glib.h
+++ b/c_glib/arrow-cuda-glib/arrow-cuda-glib.h
@@ -21,4 +21,4 @@

 #include <arrow-glib/arrow-glib.h>

-#include <arrow-gpu-glib/cuda.h>
+#include <arrow-cuda-glib/cuda.h>
diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp b/c_glib/arrow-cuda-glib/arrow-cuda-glib.hpp
similarity index 95%
rename from c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp
rename to c_glib/arrow-cuda-glib/arrow-cuda-glib.hpp
index 92017d8b67aab..e79b43ae07d15 100644
--- a/c_glib/arrow-gpu-glib/arrow-gpu-glib.hpp
+++ b/c_glib/arrow-cuda-glib/arrow-cuda-glib.hpp
@@ -21,4 +21,4 @@

 #include <arrow-glib/arrow-glib.hpp>

-#include <arrow-gpu-glib/cuda.hpp>
+#include <arrow-cuda-glib/cuda.hpp>
diff --git a/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in b/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in
similarity index 85%
rename from c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in
rename to c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in
index 38a6bae1a1298..de0ce974c7a13 100644
--- a/c_glib/arrow-gpu-glib/arrow-gpu-glib.pc.in
+++ b/c_glib/arrow-cuda-glib/arrow-cuda-glib.pc.in
@@ -20,9 +20,9 @@ exec_prefix=@exec_prefix@
 libdir=@libdir@
 includedir=@includedir@

-Name: Apache Arrow GPU GLib
-Description: C API for Apache Arrow GPU based on GLib
+Name: Apache Arrow CUDA GLib
+Description: C API for Apache Arrow CUDA based on GLib
 Version: @VERSION@
-Libs: -L${libdir} -larrow-gpu-glib
+Libs: -L${libdir} -larrow-cuda-glib
 Cflags: -I${includedir}
-Requires: arrow-glib
+Requires: arrow-glib arrow-cuda
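With the pkg-config name changed from arrow-gpu-glib to arrow-cuda-glib, downstream C code now builds against the new name. A minimal sketch of a consumer, assuming the library and headers are installed where pkg-config can find them (the file name and build command are illustrative):

/* cuda_check.c -- minimal consumer of the renamed library.
 * Build (hypothetical):
 *   cc cuda_check.c $(pkg-config --cflags --libs arrow-cuda-glib)
 */
#include <arrow-cuda-glib/arrow-cuda-glib.h>

int
main(void)
{
  GError *error = NULL;
  GArrowCUDADeviceManager *manager = garrow_cuda_device_manager_new(&error);
  if (!manager) {
    g_print("CUDA unavailable: %s\n", error->message);
    g_error_free(error);
    return 1;
  }
  g_print("GPU devices: %" G_GSIZE_FORMAT "\n",
          garrow_cuda_device_manager_get_n_devices(manager));
  g_object_unref(manager);
  return 0;
}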
diff --git a/c_glib/arrow-cuda-glib/cuda.cpp b/c_glib/arrow-cuda-glib/cuda.cpp
new file mode 100644
index 0000000000000..3f82f8fa806cb
--- /dev/null
+++ b/c_glib/arrow-cuda-glib/cuda.cpp
@@ -0,0 +1,942 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include <config.h>
+#endif
+
+#include <arrow-glib/buffer.hpp>
+#include <arrow-glib/error.hpp>
+#include <arrow-glib/input-stream.hpp>
+#include <arrow-glib/output-stream.hpp>
+#include <arrow-glib/readable.hpp>
+#include <arrow-glib/record-batch.hpp>
+#include <arrow-glib/schema.hpp>
+
+#include <arrow-cuda-glib/cuda.hpp>
+
+G_BEGIN_DECLS
+
+/**
+ * SECTION: cuda
+ * @section_id: cuda-classes
+ * @title: CUDA related classes
+ * @include: arrow-cuda-glib/arrow-cuda-glib.h
+ *
+ * The following classes provide CUDA support for Apache Arrow data.
+ *
+ * #GArrowCUDADeviceManager is the starting point. You need at
+ * least one #GArrowCUDAContext to process Apache Arrow data on
+ * NVIDIA GPU.
+ *
+ * #GArrowCUDAContext is a class to keep context for one GPU. You
+ * need to create #GArrowCUDAContext for each GPU that you want to
+ * use. You can create #GArrowCUDAContext by
+ * garrow_cuda_device_manager_get_context().
+ *
+ * #GArrowCUDABuffer is a class for data on GPU. You can copy data
+ * on GPU to/from CPU by garrow_cuda_buffer_copy_to_host() and
+ * garrow_cuda_buffer_copy_from_host(). You can share data on GPU
+ * with other processes by garrow_cuda_buffer_export() and
+ * garrow_cuda_buffer_new_ipc().
+ *
+ * #GArrowCUDAHostBuffer is a class for data on CPU that is
+ * directly accessible from GPU.
+ *
+ * #GArrowCUDAIPCMemoryHandle is a class to share data on GPU with
+ * other processes. You can export your data on GPU to other processes
+ * by garrow_cuda_buffer_export() and
+ * garrow_cuda_ipc_memory_handle_serialize(). You can import another
+ * process's data on GPU by garrow_cuda_ipc_memory_handle_new() and
+ * garrow_cuda_buffer_new_ipc().
+ *
+ * #GArrowCUDABufferInputStream is a class to read data in
+ * #GArrowCUDABuffer.
+ *
+ * #GArrowCUDABufferOutputStream is a class to write data into
+ * #GArrowCUDABuffer.
+ */
+
+G_DEFINE_TYPE(GArrowCUDADeviceManager,
+              garrow_cuda_device_manager,
+              G_TYPE_OBJECT)
+
+static void
+garrow_cuda_device_manager_init(GArrowCUDADeviceManager *object)
+{
+}
+
+static void
+garrow_cuda_device_manager_class_init(GArrowCUDADeviceManagerClass *klass)
+{
+}
+
+/**
+ * garrow_cuda_device_manager_new:
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: A newly created #GArrowCUDADeviceManager on success,
+ * %NULL on error.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDADeviceManager *
+garrow_cuda_device_manager_new(GError **error)
+{
+  arrow::cuda::CudaDeviceManager *manager;
+  auto status = arrow::cuda::CudaDeviceManager::GetInstance(&manager);
+  if (garrow_error_check(error, status, "[cuda][device-manager][new]")) {
+    auto manager = g_object_new(GARROW_CUDA_TYPE_DEVICE_MANAGER,
+                                NULL);
+    return GARROW_CUDA_DEVICE_MANAGER(manager);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_cuda_device_manager_get_context:
+ * @manager: A #GArrowCUDADeviceManager.
+ * @gpu_number: A GPU device number for the target context.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): A newly created #GArrowCUDAContext on
+ * success, %NULL on error. Contexts for the same GPU device number
+ * share the same data internally.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDAContext *
+garrow_cuda_device_manager_get_context(GArrowCUDADeviceManager *manager,
+                                       gint gpu_number,
+                                       GError **error)
+{
+  arrow::cuda::CudaDeviceManager *arrow_manager;
+  arrow::cuda::CudaDeviceManager::GetInstance(&arrow_manager);
+  std::shared_ptr<arrow::cuda::CudaContext> context;
+  auto status = arrow_manager->GetContext(gpu_number, &context);
+  if (garrow_error_check(error, status,
+                         "[cuda][device-manager][get-context]")) {
+    return garrow_cuda_context_new_raw(&context);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_cuda_device_manager_get_n_devices:
+ * @manager: A #GArrowCUDADeviceManager.
+ *
+ * Returns: The number of GPU devices.
+ *
+ * Since: 0.8.0
+ */
+gsize
+garrow_cuda_device_manager_get_n_devices(GArrowCUDADeviceManager *manager)
+{
+  arrow::cuda::CudaDeviceManager *arrow_manager;
+  arrow::cuda::CudaDeviceManager::GetInstance(&arrow_manager);
+  return arrow_manager->num_devices();
+}
+
+
+typedef struct GArrowCUDAContextPrivate_ {
+  std::shared_ptr<arrow::cuda::CudaContext> context;
+} GArrowCUDAContextPrivate;
+
+enum {
+  PROP_CONTEXT = 1
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GArrowCUDAContext,
+                           garrow_cuda_context,
+                           G_TYPE_OBJECT)
+
+#define GARROW_CUDA_CONTEXT_GET_PRIVATE(object)        \
+  static_cast<GArrowCUDAContextPrivate *>(             \
+    garrow_cuda_context_get_instance_private(          \
+      GARROW_CUDA_CONTEXT(object)))
+
+static void
+garrow_cuda_context_finalize(GObject *object)
+{
+  auto priv = GARROW_CUDA_CONTEXT_GET_PRIVATE(object);
+
+  priv->context = nullptr;
+
+  G_OBJECT_CLASS(garrow_cuda_context_parent_class)->finalize(object);
+}
+
+static void
+garrow_cuda_context_set_property(GObject *object,
+                                 guint prop_id,
+                                 const GValue *value,
+                                 GParamSpec *pspec)
+{
+  auto priv = GARROW_CUDA_CONTEXT_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_CONTEXT:
+    priv->context =
+      *static_cast<std::shared_ptr<arrow::cuda::CudaContext> *>(g_value_get_pointer(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_cuda_context_get_property(GObject *object,
+                                 guint prop_id,
+                                 GValue *value,
+                                 GParamSpec *pspec)
+{
+  switch (prop_id) {
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_cuda_context_init(GArrowCUDAContext *object)
+{
+}
+
+static void
+garrow_cuda_context_class_init(GArrowCUDAContextClass *klass)
+{
+  GParamSpec *spec;
+
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize     = garrow_cuda_context_finalize;
+  gobject_class->set_property = garrow_cuda_context_set_property;
+  gobject_class->get_property = garrow_cuda_context_get_property;
+
+  /**
+   * GArrowCUDAContext:context:
+   *
+   * Since: 0.8.0
+   */
+  spec = g_param_spec_pointer("context",
+                              "Context",
+                              "The raw std::shared_ptr<arrow::cuda::CudaContext>",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_CONTEXT, spec);
+}
std::shared_ptr", + static_cast(G_PARAM_WRITABLE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CONTEXT, spec); +} + +/** + * garrow_cuda_context_get_allocated_size: + * @context: A #GArrowCUDAContext. + * + * Returns: The allocated memory by this context in bytes. + * + * Since: 0.8.0 + */ +gint64 +garrow_cuda_context_get_allocated_size(GArrowCUDAContext *context) +{ + auto arrow_context = garrow_cuda_context_get_raw(context); + return arrow_context->bytes_allocated(); +} + + +G_DEFINE_TYPE(GArrowCUDABuffer, + garrow_cuda_buffer, + GARROW_TYPE_BUFFER) + +static void +garrow_cuda_buffer_init(GArrowCUDABuffer *object) +{ +} + +static void +garrow_cuda_buffer_class_init(GArrowCUDABufferClass *klass) +{ +} + +/** + * garrow_cuda_buffer_new: + * @context: A #GArrowCUDAContext. + * @size: The number of bytes to be allocated on GPU device for this context. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowCUDABuffer on + * success, %NULL on error. + * + * Since: 0.8.0 + */ +GArrowCUDABuffer * +garrow_cuda_buffer_new(GArrowCUDAContext *context, + gint64 size, + GError **error) +{ + auto arrow_context = garrow_cuda_context_get_raw(context); + std::shared_ptr arrow_buffer; + auto status = arrow_context->Allocate(size, &arrow_buffer); + if (garrow_error_check(error, status, "[cuda][buffer][new]")) { + return garrow_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_cuda_buffer_new_ipc: + * @context: A #GArrowCUDAContext. + * @handle: A #GArrowCUDAIPCMemoryHandle to be communicated. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowCUDABuffer on + * success, %NULL on error. The buffer has data from the IPC target. + * + * Since: 0.8.0 + */ +GArrowCUDABuffer * +garrow_cuda_buffer_new_ipc(GArrowCUDAContext *context, + GArrowCUDAIPCMemoryHandle *handle, + GError **error) +{ + auto arrow_context = garrow_cuda_context_get_raw(context); + auto arrow_handle = garrow_cuda_ipc_memory_handle_get_raw(handle); + std::shared_ptr arrow_buffer; + auto status = arrow_context->OpenIpcBuffer(*arrow_handle, &arrow_buffer); + if (garrow_error_check(error, status, + "[cuda][buffer][new-ipc]")) { + return garrow_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_cuda_buffer_new_record_batch: + * @context: A #GArrowCUDAContext. + * @record_batch: A #GArrowRecordBatch to be serialized. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (transfer full): A newly created #GArrowCUDABuffer on + * success, %NULL on error. The buffer has serialized record batch + * data. + * + * Since: 0.8.0 + */ +GArrowCUDABuffer * +garrow_cuda_buffer_new_record_batch(GArrowCUDAContext *context, + GArrowRecordBatch *record_batch, + GError **error) +{ + auto arrow_context = garrow_cuda_context_get_raw(context); + auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); + std::shared_ptr arrow_buffer; + auto status = arrow::cuda::SerializeRecordBatch(*arrow_record_batch, + arrow_context.get(), + &arrow_buffer); + if (garrow_error_check(error, status, + "[cuda][buffer][new-record-batch]")) { + return garrow_cuda_buffer_new_raw(&arrow_buffer); + } else { + return NULL; + } +} + +/** + * garrow_cuda_buffer_copy_to_host: + * @buffer: A #GArrowCUDABuffer. + * @position: The offset of memory on GPU device to be copied. 
+
+/**
+ * garrow_cuda_buffer_copy_to_host:
+ * @buffer: A #GArrowCUDABuffer.
+ * @position: The offset of memory on GPU device to be copied.
+ * @size: The size of memory on GPU device to be copied in bytes.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): A #GBytes that has the copied memory on
+ * the CPU host on success, %NULL on error.
+ *
+ * Since: 0.8.0
+ */
+GBytes *
+garrow_cuda_buffer_copy_to_host(GArrowCUDABuffer *buffer,
+                                gint64 position,
+                                gint64 size,
+                                GError **error)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  auto data = static_cast<guint8 *>(g_malloc(size));
+  auto status = arrow_buffer->CopyToHost(position, size, data);
+  if (garrow_error_check(error, status, "[cuda][buffer][copy-to-host]")) {
+    return g_bytes_new_take(data, size);
+  } else {
+    g_free(data);
+    return NULL;
+  }
+}
+
+/**
+ * garrow_cuda_buffer_copy_from_host:
+ * @buffer: A #GArrowCUDABuffer.
+ * @data: (array length=size): Data on CPU host to be copied.
+ * @size: The size of data on CPU host to be copied in bytes.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ *
+ * Since: 0.8.0
+ */
+gboolean
+garrow_cuda_buffer_copy_from_host(GArrowCUDABuffer *buffer,
+                                  const guint8 *data,
+                                  gint64 size,
+                                  GError **error)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  auto status = arrow_buffer->CopyFromHost(0, data, size);
+  return garrow_error_check(error,
+                            status,
+                            "[cuda][buffer][copy-from-host]");
+}
+
+/**
+ * garrow_cuda_buffer_export:
+ * @buffer: A #GArrowCUDABuffer.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): A newly created
+ * #GArrowCUDAIPCMemoryHandle to handle the exported buffer on
+ * success, %NULL on error.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDAIPCMemoryHandle *
+garrow_cuda_buffer_export(GArrowCUDABuffer *buffer, GError **error)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  std::shared_ptr<arrow::cuda::CudaIpcMemHandle> arrow_handle;
+  auto status = arrow_buffer->ExportForIpc(&arrow_handle);
+  if (garrow_error_check(error, status, "[cuda][buffer][export-for-ipc]")) {
+    return garrow_cuda_ipc_memory_handle_new_raw(&arrow_handle);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_cuda_buffer_get_context:
+ * @buffer: A #GArrowCUDABuffer.
+ *
+ * Returns: (transfer full): A newly created #GArrowCUDAContext for the
+ * buffer. Contexts for the same buffer share the same data internally.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDAContext *
+garrow_cuda_buffer_get_context(GArrowCUDABuffer *buffer)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  auto arrow_context = arrow_buffer->context();
+  return garrow_cuda_context_new_raw(&arrow_context);
+}
+
+/**
+ * garrow_cuda_buffer_read_record_batch:
+ * @buffer: A #GArrowCUDABuffer.
+ * @schema: A #GArrowSchema for record batch.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): A newly created #GArrowRecordBatch on
+ * success, %NULL on error. The record batch data is located on GPU.
+ *
+ * Since: 0.8.0
+ */
+GArrowRecordBatch *
+garrow_cuda_buffer_read_record_batch(GArrowCUDABuffer *buffer,
+                                     GArrowSchema *schema,
+                                     GError **error)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  auto arrow_schema = garrow_schema_get_raw(schema);
+  auto pool = arrow::default_memory_pool();
+  std::shared_ptr<arrow::RecordBatch> arrow_record_batch;
+  auto status = arrow::cuda::ReadRecordBatch(arrow_schema,
+                                             arrow_buffer,
+                                             pool,
+                                             &arrow_record_batch);
+  if (garrow_error_check(error, status,
+                         "[cuda][buffer][read-record-batch]")) {
+    return garrow_record_batch_new_raw(&arrow_record_batch);
+  } else {
+    return NULL;
+  }
+}
+
+
+G_DEFINE_TYPE(GArrowCUDAHostBuffer,
+              garrow_cuda_host_buffer,
+              GARROW_TYPE_MUTABLE_BUFFER)
+
+static void
+garrow_cuda_host_buffer_init(GArrowCUDAHostBuffer *object)
+{
+}
+
+static void
+garrow_cuda_host_buffer_class_init(GArrowCUDAHostBufferClass *klass)
+{
+}
+
+/**
+ * garrow_cuda_host_buffer_new:
+ * @gpu_number: A GPU device number for the target context.
+ * @size: The number of bytes to be allocated on CPU host.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: A newly created #GArrowCUDAHostBuffer on success,
+ * %NULL on error. The allocated memory is accessible from the GPU
+ * device for @gpu_number.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDAHostBuffer *
+garrow_cuda_host_buffer_new(gint gpu_number, gint64 size, GError **error)
+{
+  arrow::cuda::CudaDeviceManager *manager;
+  auto status = arrow::cuda::CudaDeviceManager::GetInstance(&manager);
+  std::shared_ptr<arrow::cuda::CudaHostBuffer> arrow_buffer;
+  status = manager->AllocateHost(gpu_number, size, &arrow_buffer);
+  if (garrow_error_check(error, status, "[cuda][host-buffer][new]")) {
+    return garrow_cuda_host_buffer_new_raw(&arrow_buffer);
+  } else {
+    return NULL;
+  }
+}
+
+
+typedef struct GArrowCUDAIPCMemoryHandlePrivate_ {
+  std::shared_ptr<arrow::cuda::CudaIpcMemHandle> ipc_memory_handle;
+} GArrowCUDAIPCMemoryHandlePrivate;
+
+enum {
+  PROP_IPC_MEMORY_HANDLE = 1
+};
+
+G_DEFINE_TYPE_WITH_PRIVATE(GArrowCUDAIPCMemoryHandle,
+                           garrow_cuda_ipc_memory_handle,
+                           G_TYPE_OBJECT)
+
+#define GARROW_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object)       \
+  static_cast<GArrowCUDAIPCMemoryHandlePrivate *>(              \
+    garrow_cuda_ipc_memory_handle_get_instance_private(         \
+      GARROW_CUDA_IPC_MEMORY_HANDLE(object)))
+
+static void
+garrow_cuda_ipc_memory_handle_finalize(GObject *object)
+{
+  auto priv = GARROW_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object);
+
+  priv->ipc_memory_handle = nullptr;
+
+  G_OBJECT_CLASS(garrow_cuda_ipc_memory_handle_parent_class)->finalize(object);
+}
+
+static void
+garrow_cuda_ipc_memory_handle_set_property(GObject *object,
+                                           guint prop_id,
+                                           const GValue *value,
+                                           GParamSpec *pspec)
+{
+  auto priv = GARROW_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object);
+
+  switch (prop_id) {
+  case PROP_IPC_MEMORY_HANDLE:
+    priv->ipc_memory_handle =
+      *static_cast<std::shared_ptr<arrow::cuda::CudaIpcMemHandle> *>(g_value_get_pointer(value));
+    break;
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_cuda_ipc_memory_handle_get_property(GObject *object,
+                                           guint prop_id,
+                                           GValue *value,
+                                           GParamSpec *pspec)
+{
+  switch (prop_id) {
+  default:
+    G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec);
+    break;
+  }
+}
+
+static void
+garrow_cuda_ipc_memory_handle_init(GArrowCUDAIPCMemoryHandle *object)
+{
+}
+
+static void
+garrow_cuda_ipc_memory_handle_class_init(GArrowCUDAIPCMemoryHandleClass *klass)
+{
+  GParamSpec *spec;
+
+  auto gobject_class = G_OBJECT_CLASS(klass);
+
+  gobject_class->finalize     = garrow_cuda_ipc_memory_handle_finalize;
+  gobject_class->set_property = garrow_cuda_ipc_memory_handle_set_property;
+  gobject_class->get_property = garrow_cuda_ipc_memory_handle_get_property;
+
+  /**
+   * GArrowCUDAIPCMemoryHandle:ipc-memory-handle:
+   *
+   * Since: 0.8.0
+   */
+  spec = g_param_spec_pointer("ipc-memory-handle",
+                              "IPC Memory Handle",
+                              "The raw std::shared_ptr<arrow::cuda::CudaIpcMemHandle>",
+                              static_cast<GParamFlags>(G_PARAM_WRITABLE |
+                                                       G_PARAM_CONSTRUCT_ONLY));
+  g_object_class_install_property(gobject_class, PROP_IPC_MEMORY_HANDLE, spec);
+}
+
+/**
+ * garrow_cuda_ipc_memory_handle_new:
+ * @data: (array length=size): A serialized #GArrowCUDAIPCMemoryHandle.
+ * @size: The size of data.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): A newly created #GArrowCUDAIPCMemoryHandle
+ * on success, %NULL on error.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDAIPCMemoryHandle *
+garrow_cuda_ipc_memory_handle_new(const guint8 *data,
+                                  gsize size,
+                                  GError **error)
+{
+  std::shared_ptr<arrow::cuda::CudaIpcMemHandle> arrow_handle;
+  auto status = arrow::cuda::CudaIpcMemHandle::FromBuffer(data, &arrow_handle);
+  if (garrow_error_check(error, status,
+                         "[cuda][ipc-memory-handle][new]")) {
+    return garrow_cuda_ipc_memory_handle_new_raw(&arrow_handle);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_cuda_ipc_memory_handle_serialize:
+ * @handle: A #GArrowCUDAIPCMemoryHandle.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (transfer full): A newly created #GArrowBuffer on success,
+ * %NULL on error. The buffer has serialized @handle. The serialized
+ * @handle can be deserialized by garrow_cuda_ipc_memory_handle_new()
+ * in another process.
+ *
+ * Since: 0.8.0
+ */
+GArrowBuffer *
+garrow_cuda_ipc_memory_handle_serialize(GArrowCUDAIPCMemoryHandle *handle,
+                                        GError **error)
+{
+  auto arrow_handle = garrow_cuda_ipc_memory_handle_get_raw(handle);
+  std::shared_ptr<arrow::Buffer> arrow_buffer;
+  auto status = arrow_handle->Serialize(arrow::default_memory_pool(),
+                                        &arrow_buffer);
+  if (garrow_error_check(error, status,
+                         "[cuda][ipc-memory-handle][serialize]")) {
+    return garrow_buffer_new_raw(&arrow_buffer);
+  } else {
+    return NULL;
+  }
+}
+
+GArrowBuffer *
+garrow_cuda_buffer_input_stream_new_raw_readable_interface(std::shared_ptr<arrow::Buffer> *arrow_buffer)
+{
+  auto buffer = GARROW_BUFFER(g_object_new(GARROW_CUDA_TYPE_BUFFER,
+                                           "buffer", arrow_buffer,
+                                           NULL));
+  return buffer;
+}
+
+static std::shared_ptr<arrow::io::Readable>
+garrow_cuda_buffer_input_stream_get_raw_readable_interface(GArrowReadable *readable)
+{
+  auto input_stream = GARROW_INPUT_STREAM(readable);
+  auto arrow_input_stream = garrow_input_stream_get_raw(input_stream);
+  return arrow_input_stream;
+}
+
+static void
+garrow_cuda_buffer_input_stream_readable_interface_init(GArrowReadableInterface *iface)
+{
+  iface->new_raw =
+    garrow_cuda_buffer_input_stream_new_raw_readable_interface;
+  iface->get_raw =
+    garrow_cuda_buffer_input_stream_get_raw_readable_interface;
+}
+
+G_DEFINE_TYPE_WITH_CODE(
+  GArrowCUDABufferInputStream,
+  garrow_cuda_buffer_input_stream,
+  GARROW_TYPE_BUFFER_INPUT_STREAM,
+  G_IMPLEMENT_INTERFACE(
+    GARROW_TYPE_READABLE,
+    garrow_cuda_buffer_input_stream_readable_interface_init))
+
+static void
+garrow_cuda_buffer_input_stream_init(GArrowCUDABufferInputStream *object)
+{
+}
+
+static void
+garrow_cuda_buffer_input_stream_class_init(GArrowCUDABufferInputStreamClass *klass)
+{
+}
+
+/**
+ * garrow_cuda_buffer_input_stream_new:
+ * @buffer: A #GArrowCUDABuffer.
+ *
+ * Returns: (transfer full): A newly created
+ * #GArrowCUDABufferInputStream.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDABufferInputStream *
+garrow_cuda_buffer_input_stream_new(GArrowCUDABuffer *buffer)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  auto arrow_reader =
+    std::make_shared<arrow::cuda::CudaBufferReader>(arrow_buffer);
+  return garrow_cuda_buffer_input_stream_new_raw(&arrow_reader);
+}
+
+
+G_DEFINE_TYPE(GArrowCUDABufferOutputStream,
+              garrow_cuda_buffer_output_stream,
+              GARROW_TYPE_OUTPUT_STREAM)
+
+static void
+garrow_cuda_buffer_output_stream_init(GArrowCUDABufferOutputStream *object)
+{
+}
+
+static void
+garrow_cuda_buffer_output_stream_class_init(GArrowCUDABufferOutputStreamClass *klass)
+{
+}
+
+/**
+ * garrow_cuda_buffer_output_stream_new:
+ * @buffer: A #GArrowCUDABuffer.
+ *
+ * Returns: (transfer full): A newly created
+ * #GArrowCUDABufferOutputStream.
+ *
+ * Since: 0.8.0
+ */
+GArrowCUDABufferOutputStream *
+garrow_cuda_buffer_output_stream_new(GArrowCUDABuffer *buffer)
+{
+  auto arrow_buffer = garrow_cuda_buffer_get_raw(buffer);
+  auto arrow_writer =
+    std::make_shared<arrow::cuda::CudaBufferWriter>(arrow_buffer);
+  return garrow_cuda_buffer_output_stream_new_raw(&arrow_writer);
+}
+
+/**
+ * garrow_cuda_buffer_output_stream_set_buffer_size:
+ * @stream: A #GArrowCUDABufferOutputStream.
+ * @size: A size of CPU buffer in bytes.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: %TRUE on success, %FALSE if there was an error.
+ *
+ * Sets the CPU buffer size to limit `cudaMemcpy()` calls. If the CPU
+ * buffer size is `0`, buffering is disabled.
+ *
+ * The default is `0`.
+ *
+ * Since: 0.8.0
+ */
+gboolean
+garrow_cuda_buffer_output_stream_set_buffer_size(GArrowCUDABufferOutputStream *stream,
+                                                 gint64 size,
+                                                 GError **error)
+{
+  auto arrow_stream = garrow_cuda_buffer_output_stream_get_raw(stream);
+  auto status = arrow_stream->SetBufferSize(size);
+  return garrow_error_check(error,
+                            status,
+                            "[cuda][buffer-output-stream][set-buffer-size]");
+}
+
+/**
+ * garrow_cuda_buffer_output_stream_get_buffer_size:
+ * @stream: A #GArrowCUDABufferOutputStream.
+ *
+ * Returns: The CPU buffer size in bytes.
+ *
+ * See garrow_cuda_buffer_output_stream_set_buffer_size() for CPU
+ * buffer size details.
+ *
+ * Since: 0.8.0
+ */
+gint64
+garrow_cuda_buffer_output_stream_get_buffer_size(GArrowCUDABufferOutputStream *stream)
+{
+  auto arrow_stream = garrow_cuda_buffer_output_stream_get_raw(stream);
+  return arrow_stream->buffer_size();
+}
+
+/**
+ * garrow_cuda_buffer_output_stream_get_buffered_size:
+ * @stream: A #GArrowCUDABufferOutputStream.
+ *
+ * Returns: The size of buffered data in bytes.
+ *
+ * Since: 0.8.0
+ */
+gint64
+garrow_cuda_buffer_output_stream_get_buffered_size(GArrowCUDABufferOutputStream *stream)
+{
+  auto arrow_stream = garrow_cuda_buffer_output_stream_get_raw(stream);
+  return arrow_stream->num_bytes_buffered();
+}
+
+
+G_END_DECLS
+
+GArrowCUDAContext *
+garrow_cuda_context_new_raw(std::shared_ptr<arrow::cuda::CudaContext> *arrow_context)
+{
+  return GARROW_CUDA_CONTEXT(g_object_new(GARROW_CUDA_TYPE_CONTEXT,
+                                          "context", arrow_context,
+                                          NULL));
+}
+
+std::shared_ptr<arrow::cuda::CudaContext>
+garrow_cuda_context_get_raw(GArrowCUDAContext *context)
+{
+  if (!context)
+    return nullptr;
+
+  auto priv = GARROW_CUDA_CONTEXT_GET_PRIVATE(context);
+  return priv->context;
+}
+
+GArrowCUDAIPCMemoryHandle *
+garrow_cuda_ipc_memory_handle_new_raw(std::shared_ptr<arrow::cuda::CudaIpcMemHandle> *arrow_handle)
+{
+  auto handle = g_object_new(GARROW_CUDA_TYPE_IPC_MEMORY_HANDLE,
+                             "ipc-memory-handle", arrow_handle,
+                             NULL);
+  return GARROW_CUDA_IPC_MEMORY_HANDLE(handle);
+}
+
+std::shared_ptr<arrow::cuda::CudaIpcMemHandle>
+garrow_cuda_ipc_memory_handle_get_raw(GArrowCUDAIPCMemoryHandle *handle)
+{
+  if (!handle)
+    return nullptr;
+
+  auto priv = GARROW_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(handle);
+  return priv->ipc_memory_handle;
+}
+
+GArrowCUDABuffer *
+garrow_cuda_buffer_new_raw(std::shared_ptr<arrow::cuda::CudaBuffer> *arrow_buffer)
+{
+  return GARROW_CUDA_BUFFER(g_object_new(GARROW_CUDA_TYPE_BUFFER,
+                                         "buffer", arrow_buffer,
+                                         NULL));
+}
+
+std::shared_ptr<arrow::cuda::CudaBuffer>
+garrow_cuda_buffer_get_raw(GArrowCUDABuffer *buffer)
+{
+  if (!buffer)
+    return nullptr;
+
+  auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer));
+  return std::static_pointer_cast<arrow::cuda::CudaBuffer>(arrow_buffer);
+}
+
+GArrowCUDAHostBuffer *
+garrow_cuda_host_buffer_new_raw(std::shared_ptr<arrow::cuda::CudaHostBuffer> *arrow_buffer)
+{
+  auto buffer = g_object_new(GARROW_CUDA_TYPE_HOST_BUFFER,
+                             "buffer", arrow_buffer,
+                             NULL);
+  return GARROW_CUDA_HOST_BUFFER(buffer);
+}
+
+std::shared_ptr<arrow::cuda::CudaHostBuffer>
+garrow_cuda_host_buffer_get_raw(GArrowCUDAHostBuffer *buffer)
+{
+  if (!buffer)
+    return nullptr;
+
+  auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer));
+  return std::static_pointer_cast<arrow::cuda::CudaHostBuffer>(arrow_buffer);
+}
+
+GArrowCUDABufferInputStream *
+garrow_cuda_buffer_input_stream_new_raw(std::shared_ptr<arrow::cuda::CudaBufferReader> *arrow_reader)
+{
+  auto input_stream = g_object_new(GARROW_CUDA_TYPE_BUFFER_INPUT_STREAM,
+                                   "input-stream", arrow_reader,
+                                   NULL);
+  return GARROW_CUDA_BUFFER_INPUT_STREAM(input_stream);
+}
+
+std::shared_ptr<arrow::cuda::CudaBufferReader>
+garrow_cuda_buffer_input_stream_get_raw(GArrowCUDABufferInputStream *input_stream)
+{
+  if (!input_stream)
+    return nullptr;
+
+  auto arrow_reader =
+    garrow_input_stream_get_raw(GARROW_INPUT_STREAM(input_stream));
+  return std::static_pointer_cast<arrow::cuda::CudaBufferReader>(arrow_reader);
+}
+
+GArrowCUDABufferOutputStream *
+garrow_cuda_buffer_output_stream_new_raw(std::shared_ptr<arrow::cuda::CudaBufferWriter> *arrow_writer)
+{
+  auto output_stream = g_object_new(GARROW_CUDA_TYPE_BUFFER_OUTPUT_STREAM,
+                                    "output-stream", arrow_writer,
+                                    NULL);
+  return GARROW_CUDA_BUFFER_OUTPUT_STREAM(output_stream);
+}
+
+std::shared_ptr<arrow::cuda::CudaBufferWriter>
+garrow_cuda_buffer_output_stream_get_raw(GArrowCUDABufferOutputStream *output_stream)
+{
+  if (!output_stream)
+    return nullptr;
+
+  auto arrow_writer =
+    garrow_output_stream_get_raw(GARROW_OUTPUT_STREAM(output_stream));
+  return std::static_pointer_cast<arrow::cuda::CudaBufferWriter>(arrow_writer);
+}
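Taken together, the device manager, context, and buffer APIs in cuda.cpp support a simple host-to-GPU round trip. A minimal sketch (error checks after the first call are elided for brevity; assumes at least one CUDA device):

static GBytes *
roundtrip_through_gpu(GError **error)
{
  GArrowCUDADeviceManager *manager = garrow_cuda_device_manager_new(error);
  if (!manager)
    return NULL;
  /* Contexts for the same device number share data internally. */
  GArrowCUDAContext *context =
    garrow_cuda_device_manager_get_context(manager, 0, error);
  GArrowCUDABuffer *buffer = garrow_cuda_buffer_new(context, 64, error);

  const guint8 data[] = "hello";
  garrow_cuda_buffer_copy_from_host(buffer, data, sizeof(data), error);
  GBytes *bytes =
    garrow_cuda_buffer_copy_to_host(buffer, 0, sizeof(data), error);

  g_object_unref(buffer);
  g_object_unref(context);
  g_object_unref(manager);
  return bytes;  /* owned by the caller */
}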
diff --git a/c_glib/arrow-cuda-glib/cuda.h b/c_glib/arrow-cuda-glib/cuda.h
new file mode 100644
index 0000000000000..6cdef99221fe2
--- /dev/null
+++ b/c_glib/arrow-cuda-glib/cuda.h
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow-glib/arrow-glib.h>
+
+G_BEGIN_DECLS
+
+#define GARROW_CUDA_TYPE_DEVICE_MANAGER (garrow_cuda_device_manager_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDADeviceManager,
+                         garrow_cuda_device_manager,
+                         GARROW_CUDA,
+                         DEVICE_MANAGER,
+                         GObject)
+struct _GArrowCUDADeviceManagerClass
+{
+  GObjectClass parent_class;
+};
+
+#define GARROW_CUDA_TYPE_CONTEXT (garrow_cuda_context_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDAContext,
+                         garrow_cuda_context,
+                         GARROW_CUDA,
+                         CONTEXT,
+                         GObject)
+struct _GArrowCUDAContextClass
+{
+  GObjectClass parent_class;
+};
+
+#define GARROW_CUDA_TYPE_BUFFER (garrow_cuda_buffer_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDABuffer,
+                         garrow_cuda_buffer,
+                         GARROW_CUDA,
+                         BUFFER,
+                         GArrowBuffer)
+struct _GArrowCUDABufferClass
+{
+  GArrowBufferClass parent_class;
+};
+
+#define GARROW_CUDA_TYPE_HOST_BUFFER (garrow_cuda_host_buffer_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDAHostBuffer,
+                         garrow_cuda_host_buffer,
+                         GARROW_CUDA,
+                         HOST_BUFFER,
+                         GArrowMutableBuffer)
+struct _GArrowCUDAHostBufferClass
+{
+  GArrowMutableBufferClass parent_class;
+};
+
+#define GARROW_CUDA_TYPE_IPC_MEMORY_HANDLE      \
+  (garrow_cuda_ipc_memory_handle_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDAIPCMemoryHandle,
+                         garrow_cuda_ipc_memory_handle,
+                         GARROW_CUDA,
+                         IPC_MEMORY_HANDLE,
+                         GObject)
+struct _GArrowCUDAIPCMemoryHandleClass
+{
+  GObjectClass parent_class;
+};
+
+#define GARROW_CUDA_TYPE_BUFFER_INPUT_STREAM    \
+  (garrow_cuda_buffer_input_stream_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDABufferInputStream,
+                         garrow_cuda_buffer_input_stream,
+                         GARROW_CUDA,
+                         BUFFER_INPUT_STREAM,
+                         GArrowBufferInputStream)
+struct _GArrowCUDABufferInputStreamClass
+{
+  GArrowBufferInputStreamClass parent_class;
+};
+
+#define GARROW_CUDA_TYPE_BUFFER_OUTPUT_STREAM   \
+  (garrow_cuda_buffer_output_stream_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowCUDABufferOutputStream,
+                         garrow_cuda_buffer_output_stream,
+                         GARROW_CUDA,
+                         BUFFER_OUTPUT_STREAM,
+                         GArrowOutputStream)
+struct _GArrowCUDABufferOutputStreamClass
+{
+  GArrowOutputStreamClass parent_class;
+};
+
+GArrowCUDADeviceManager *
+garrow_cuda_device_manager_new(GError **error);
+
+GArrowCUDAContext *
+garrow_cuda_device_manager_get_context(GArrowCUDADeviceManager *manager,
+                                       gint gpu_number,
+                                       GError **error);
+gsize
+garrow_cuda_device_manager_get_n_devices(GArrowCUDADeviceManager *manager);
+
+gint64
+garrow_cuda_context_get_allocated_size(GArrowCUDAContext *context);
+
+
+GArrowCUDABuffer *
+garrow_cuda_buffer_new(GArrowCUDAContext *context,
+                       gint64 size,
+                       GError **error);
+GArrowCUDABuffer *
+garrow_cuda_buffer_new_ipc(GArrowCUDAContext *context,
+                           GArrowCUDAIPCMemoryHandle *handle,
+                           GError **error);
+GArrowCUDABuffer *
+garrow_cuda_buffer_new_record_batch(GArrowCUDAContext *context,
+                                    GArrowRecordBatch *record_batch,
+                                    GError **error);
+GBytes *
+garrow_cuda_buffer_copy_to_host(GArrowCUDABuffer *buffer,
+                                gint64 position,
+                                gint64 size,
+                                GError **error);
+gboolean
+garrow_cuda_buffer_copy_from_host(GArrowCUDABuffer *buffer,
+                                  const guint8 *data,
+                                  gint64 size,
+                                  GError **error);
+GArrowCUDAIPCMemoryHandle *
+garrow_cuda_buffer_export(GArrowCUDABuffer *buffer,
+                          GError **error);
+GArrowCUDAContext *
+garrow_cuda_buffer_get_context(GArrowCUDABuffer *buffer);
+GArrowRecordBatch *
+garrow_cuda_buffer_read_record_batch(GArrowCUDABuffer *buffer,
+                                     GArrowSchema *schema,
+                                     GError **error);
+
+
+GArrowCUDAHostBuffer *
+garrow_cuda_host_buffer_new(gint gpu_number,
+                            gint64 size,
+                            GError **error);
+
+GArrowCUDAIPCMemoryHandle *
+garrow_cuda_ipc_memory_handle_new(const guint8 *data,
+                                  gsize size,
+                                  GError **error);
+
+GArrowBuffer *
+garrow_cuda_ipc_memory_handle_serialize(GArrowCUDAIPCMemoryHandle *handle,
+                                        GError **error);
+
+GArrowCUDABufferInputStream *
+garrow_cuda_buffer_input_stream_new(GArrowCUDABuffer *buffer);
+
+GArrowCUDABufferOutputStream *
+garrow_cuda_buffer_output_stream_new(GArrowCUDABuffer *buffer);
+
+gboolean
+garrow_cuda_buffer_output_stream_set_buffer_size(GArrowCUDABufferOutputStream *stream,
+                                                 gint64 size,
+                                                 GError **error);
+gint64
+garrow_cuda_buffer_output_stream_get_buffer_size(GArrowCUDABufferOutputStream *stream);
+gint64
+garrow_cuda_buffer_output_stream_get_buffered_size(GArrowCUDABufferOutputStream *stream);
+
+G_END_DECLS
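The IPC surface declared here splits into an exporting and an importing side. A sketch of both halves, assuming the serialized handle bytes travel between processes over some external channel:

/* Exporting process: serialize an IPC handle for a GPU buffer. */
static GArrowBuffer *
export_gpu_buffer(GArrowCUDABuffer *buffer, GError **error)
{
  GArrowCUDAIPCMemoryHandle *handle = garrow_cuda_buffer_export(buffer, error);
  if (!handle)
    return NULL;
  GArrowBuffer *serialized =
    garrow_cuda_ipc_memory_handle_serialize(handle, error);
  g_object_unref(handle);
  return serialized;  /* send these bytes over any IPC channel */
}

/* Importing process: open the shared GPU buffer from received bytes. */
static GArrowCUDABuffer *
import_gpu_buffer(GArrowCUDAContext *context,
                  const guint8 *data, gsize size, GError **error)
{
  GArrowCUDAIPCMemoryHandle *handle =
    garrow_cuda_ipc_memory_handle_new(data, size, error);
  if (!handle)
    return NULL;
  GArrowCUDABuffer *shared =
    garrow_cuda_buffer_new_ipc(context, handle, error);
  g_object_unref(handle);
  return shared;
}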
diff --git a/c_glib/arrow-cuda-glib/cuda.hpp b/c_glib/arrow-cuda-glib/cuda.hpp
new file mode 100644
index 0000000000000..0f8985a9de4f5
--- /dev/null
+++ b/c_glib/arrow-cuda-glib/cuda.hpp
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#pragma once
+
+#include <arrow/gpu/cuda_api.h>
+
+#include <arrow-cuda-glib/cuda.h>
+
+GArrowCUDAContext *
+garrow_cuda_context_new_raw(std::shared_ptr<arrow::cuda::CudaContext> *arrow_context);
+std::shared_ptr<arrow::cuda::CudaContext>
+garrow_cuda_context_get_raw(GArrowCUDAContext *context);
+
+GArrowCUDAIPCMemoryHandle *
+garrow_cuda_ipc_memory_handle_new_raw(std::shared_ptr<arrow::cuda::CudaIpcMemHandle> *arrow_handle);
+std::shared_ptr<arrow::cuda::CudaIpcMemHandle>
+garrow_cuda_ipc_memory_handle_get_raw(GArrowCUDAIPCMemoryHandle *handle);
+
+GArrowCUDABuffer *
+garrow_cuda_buffer_new_raw(std::shared_ptr<arrow::cuda::CudaBuffer> *arrow_buffer);
+std::shared_ptr<arrow::cuda::CudaBuffer>
+garrow_cuda_buffer_get_raw(GArrowCUDABuffer *buffer);
+
+GArrowCUDAHostBuffer *
+garrow_cuda_host_buffer_new_raw(std::shared_ptr<arrow::cuda::CudaHostBuffer> *arrow_buffer);
+std::shared_ptr<arrow::cuda::CudaHostBuffer>
+garrow_cuda_host_buffer_get_raw(GArrowCUDAHostBuffer *buffer);
+
+GArrowCUDABufferInputStream *
+garrow_cuda_buffer_input_stream_new_raw(std::shared_ptr<arrow::cuda::CudaBufferReader> *arrow_reader);
+std::shared_ptr<arrow::cuda::CudaBufferReader>
+garrow_cuda_buffer_input_stream_get_raw(GArrowCUDABufferInputStream *input_stream);
+
+GArrowCUDABufferOutputStream *
+garrow_cuda_buffer_output_stream_new_raw(std::shared_ptr<arrow::cuda::CudaBufferWriter> *arrow_writer);
+std::shared_ptr<arrow::cuda::CudaBufferWriter>
+garrow_cuda_buffer_output_stream_get_raw(GArrowCUDABufferOutputStream *output_stream);
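cuda.hpp completes the GLib/C++ bridge, so a record batch can be serialized into GPU memory and read back entirely through the C API. A sketch, using garrow_record_batch_get_schema() from arrow-glib:

static GArrowRecordBatch *
roundtrip_record_batch_via_gpu(GArrowCUDAContext *context,
                               GArrowRecordBatch *record_batch,
                               GError **error)
{
  /* Serialize the batch into a GPU-resident buffer... */
  GArrowCUDABuffer *gpu_batch =
    garrow_cuda_buffer_new_record_batch(context, record_batch, error);
  if (!gpu_batch)
    return NULL;
  /* ...then reconstruct a record batch whose data stays on the GPU. */
  GArrowSchema *schema = garrow_record_batch_get_schema(record_batch);
  GArrowRecordBatch *reread =
    garrow_cuda_buffer_read_record_batch(gpu_batch, schema, error);
  g_object_unref(schema);
  g_object_unref(gpu_batch);
  return reread;
}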
diff --git a/c_glib/arrow-cuda-glib/meson.build b/c_glib/arrow-cuda-glib/meson.build
new file mode 100644
index 0000000000000..e5b9f477fc142
--- /dev/null
+++ b/c_glib/arrow-cuda-glib/meson.build
@@ -0,0 +1,79 @@
+# -*- indent-tabs-mode: nil -*-
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+sources = files(
+  'cuda.cpp',
+)
+
+c_headers = files(
+  'arrow-cuda-glib.h',
+  'cuda.h',
+)
+
+cpp_headers = files(
+  'arrow-cuda-glib.hpp',
+  'cuda.hpp',
+)
+
+headers = c_headers + cpp_headers
+install_headers(headers, subdir: 'arrow-cuda-glib')
+
+
+dependencies = [
+  arrow_cuda,
+  arrow_glib,
+]
+libarrow_cuda_glib = library('arrow-cuda-glib',
+                             sources: sources,
+                             install: true,
+                             dependencies: dependencies,
+                             include_directories: base_include_directories,
+                             soversion: so_version,
+                             version: library_version)
+arrow_cuda_glib = declare_dependency(link_with: libarrow_cuda_glib,
+                                     include_directories: base_include_directories,
+                                     dependencies: dependencies)
+
+pkgconfig.generate(filebase: 'arrow-cuda-glib',
+                   name: 'Apache Arrow CUDA GLib',
+                   description: 'C API for Apache Arrow CUDA based on GLib',
+                   version: version,
+                   requires: ['arrow-glib', 'arrow-cuda'],
+                   libraries: [libarrow_cuda_glib])
+
+gir_dependencies = [
+  declare_dependency(sources: arrow_glib_gir),
+]
+gir_extra_args = [
+  '--warn-all',
+  '--include-uninstalled=./arrow-glib/Arrow-1.0.gir',
+]
+arrow_cuda_glib_gir = gnome.generate_gir(libarrow_cuda_glib,
+                                         dependencies: gir_dependencies,
+                                         sources: sources + c_headers,
+                                         namespace: 'ArrowCUDA',
+                                         nsversion: api_version,
+                                         identifier_prefix: 'GArrowCUDA',
+                                         symbol_prefix: 'garrow_cuda',
+                                         export_packages: 'arrow-cuda-glib',
+                                         includes: [
+                                           'Arrow-1.0',
+                                         ],
+                                         install: true,
+                                         extra_args: gir_extra_args)
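As a usage note for the stream classes built above: they behave like regular arrow-glib streams, with one CUDA-specific knob, the CPU-side write buffer. A sketch, assuming an existing GArrowCUDABuffer and using garrow_writable_write() from arrow-glib:

static void
write_buffered(GArrowCUDABuffer *buffer, GError **error)
{
  GArrowCUDABufferOutputStream *stream =
    garrow_cuda_buffer_output_stream_new(buffer);
  /* Accumulate small writes in a 4 KiB CPU buffer to limit cudaMemcpy()
   * calls; a size of 0 (the default) writes through unbuffered. */
  garrow_cuda_buffer_output_stream_set_buffer_size(stream, 4096, error);
  const guint8 chunk[] = {1, 2, 3, 4};
  garrow_writable_write(GARROW_WRITABLE(stream), chunk, sizeof(chunk), error);
  g_print("still buffered on CPU: %" G_GINT64_FORMAT " bytes\n",
          garrow_cuda_buffer_output_stream_get_buffered_size(stream));
  g_object_unref(stream);
}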
diff --git a/c_glib/arrow-glib/basic-array.cpp b/c_glib/arrow-glib/basic-array.cpp
index 77f64fc0a89fc..fef43a0285e25 100644
--- a/c_glib/arrow-glib/basic-array.cpp
+++ b/c_glib/arrow-glib/basic-array.cpp
@@ -209,7 +209,9 @@ enum {
   PROP_ARRAY
 };

-G_DEFINE_TYPE_WITH_PRIVATE(GArrowArray, garrow_array, G_TYPE_OBJECT)
+G_DEFINE_ABSTRACT_TYPE_WITH_PRIVATE(GArrowArray,
+                                    garrow_array,
+                                    G_TYPE_OBJECT)

 #define GARROW_ARRAY_GET_PRIVATE(obj)         \
   static_cast<GArrowArrayPrivate *>(          \
@@ -494,7 +496,8 @@ garrow_array_slice(GArrowArray *array,
  * @array: A #GArrowArray.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
- * Returns: (nullable): The formatted array content or %NULL on error.
+ * Returns: (nullable) (transfer full):
+ *   The formatted array content or %NULL on error.
  *
  * The returned string should be freed with g_free() when no
  * longer needed.
@@ -764,7 +767,8 @@ garrow_boolean_array_get_value(GArrowBooleanArray *array,
  * @array: A #GArrowBooleanArray.
  * @length: (out): The number of values.
  *
- * Returns: (array length=length): The raw boolean values.
+ * Returns: (array length=length) (transfer full):
+ *   The raw boolean values.
  *
  * It should be freed with g_free() when no longer needed.
  */
@@ -2144,10 +2148,10 @@ garrow_decimal128_array_class_init(GArrowDecimal128ArrayClass *klass)
  * @array: A #GArrowDecimal128Array.
  * @i: The index of the target value.
  *
- * Returns: The formatted i-th value.
+ * Returns: (transfer full): The formatted i-th value.
  *
- * The returned string should be freed with g_free() when no longer
- * needed.
+ * The returned string should be freed with g_free() when no longer
+ * needed.
  *
  * Since: 0.10.0
  */
@@ -2255,6 +2259,17 @@ garrow_array_new_raw(std::shared_ptr<arrow::Array> *arrow_array)
   case arrow::Type::type::STRUCT:
     type = GARROW_TYPE_STRUCT_ARRAY;
     break;
+  case arrow::Type::type::UNION:
+    {
+      auto arrow_union_array =
+        std::static_pointer_cast<arrow::UnionArray>(*arrow_array);
+      if (arrow_union_array->mode() == arrow::UnionMode::SPARSE) {
+        type = GARROW_TYPE_SPARSE_UNION_ARRAY;
+      } else {
+        type = GARROW_TYPE_DENSE_UNION_ARRAY;
+      }
+    }
+    break;
   case arrow::Type::type::DICTIONARY:
     type = GARROW_TYPE_DICTIONARY_ARRAY;
     break;
diff --git a/c_glib/arrow-glib/basic-data-type.cpp b/c_glib/arrow-glib/basic-data-type.cpp
index 24133c99f46de..cd3aa97679b5d 100644
--- a/c_glib/arrow-glib/basic-data-type.cpp
+++ b/c_glib/arrow-glib/basic-data-type.cpp
@@ -198,8 +198,8 @@ garrow_data_type_equal(GArrowDataType *data_type,
  * garrow_data_type_to_string:
  * @data_type: A #GArrowDataType.
  *
- * Returns: The string representation of the data type. The caller
- * must free it by g_free() when the caller doesn't need it anymore.
+ * Returns: (transfer full): The string representation of the data type.
+ * The caller must free it with g_free() when it is no longer needed.
  */
 gchar *
 garrow_data_type_to_string(GArrowDataType *data_type)
@@ -1184,6 +1184,17 @@ garrow_data_type_new_raw(std::shared_ptr<arrow::DataType> *arrow_data_type)
   case arrow::Type::type::STRUCT:
     type = GARROW_TYPE_STRUCT_DATA_TYPE;
     break;
+  case arrow::Type::type::UNION:
+    {
+      auto arrow_union_data_type =
+        std::static_pointer_cast<arrow::UnionType>(*arrow_data_type);
+      if (arrow_union_data_type->mode() == arrow::UnionMode::SPARSE) {
+        type = GARROW_TYPE_SPARSE_UNION_DATA_TYPE;
+      } else {
+        type = GARROW_TYPE_DENSE_UNION_DATA_TYPE;
+      }
+    }
+    break;
   case arrow::Type::type::DICTIONARY:
     type = GARROW_TYPE_DICTIONARY_DATA_TYPE;
     break;
diff --git a/c_glib/arrow-glib/chunked-array.cpp b/c_glib/arrow-glib/chunked-array.cpp
index e046b0d547ea9..6d9598bc10618 100644
--- a/c_glib/arrow-glib/chunked-array.cpp
+++ b/c_glib/arrow-glib/chunked-array.cpp
@@ -302,7 +302,8 @@ garrow_chunked_array_slice(GArrowChunkedArray *chunked_array,
  * @chunked_array: A #GArrowChunkedArray.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
- * Returns: (nullable): The formatted chunked array content or %NULL on error.
+ * Returns: (nullable) (transfer full):
+ *   The formatted chunked array content or %NULL on error.
  *
  * The returned string should be freed with g_free() when no
  * longer needed.
diff --git a/c_glib/arrow-glib/column.cpp b/c_glib/arrow-glib/column.cpp
index 06ab0b70de407..e3e964f557659 100644
--- a/c_glib/arrow-glib/column.cpp
+++ b/c_glib/arrow-glib/column.cpp
@@ -372,7 +372,8 @@ garrow_column_get_data(GArrowColumn *column)
  * @column: A #GArrowColumn.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
- * Returns: (nullable): The formatted column content or %NULL on error.
+ * Returns: (nullable) (transfer full):
+ *   The formatted column content or %NULL on error.
  *
  * The returned string should be freed with g_free() when no
  * longer needed.
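The (transfer full) annotations added in these hunks make ownership explicit for language bindings; in C the practical consequence is unchanged: the caller frees the returned string. For example:

static void
print_type(GArrowDataType *data_type)
{
  /* (transfer full): the string is owned by the caller. */
  gchar *description = garrow_data_type_to_string(data_type);
  g_print("%s\n", description);
  g_free(description);
}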
If you don't * have Arrow format data, you need to use #GArrowStructArrayBuilder * to create a new array. * + * #GArrowUnionArray is a base class for union array. It can store + * zero or more unions. One union has one or more fields but one union + * can store only one field value. + * + * #GArrowDenseUnionArray is a class for dense union array. + * + * #GArrowSparseUnionArray is a class for sparse union array. + * * #GArrowDictionaryArray is a class for dictionary array. It can * store data with dictionary and indices. It's space effective than * normal array when the array has many same values. You can convert a @@ -159,7 +167,7 @@ garrow_struct_array_class_init(GArrowStructArrayClass *klass) * garrow_struct_array_new: * @data_type: The data type of the struct. * @length: The number of elements. - * @children: (element-type GArrowArray): The arrays for each field + * @fields: (element-type GArrowArray): The arrays for each field * as #GList of #GArrowArray. * @null_bitmap: (nullable): The bitmap that shows null elements. The * N-th element is null when the N-th bit is 0, not null otherwise. @@ -175,21 +183,21 @@ garrow_struct_array_class_init(GArrowStructArrayClass *klass) GArrowStructArray * garrow_struct_array_new(GArrowDataType *data_type, gint64 length, - GList *children, + GList *fields, GArrowBuffer *null_bitmap, gint64 n_nulls) { const auto arrow_data_type = garrow_data_type_get_raw(data_type); - std::vector> arrow_children; - for (GList *node = children; node; node = node->next) { - GArrowArray *child = GARROW_ARRAY(node->data); - arrow_children.push_back(garrow_array_get_raw(child)); + std::vector> arrow_fields; + for (auto node = fields; node; node = node->next) { + auto child = GARROW_ARRAY(node->data); + arrow_fields.push_back(garrow_array_get_raw(child)); } const auto arrow_bitmap = garrow_buffer_get_raw(null_bitmap); auto arrow_struct_array = std::make_shared(arrow_data_type, length, - arrow_children, + arrow_fields, arrow_bitmap, n_nulls); auto arrow_array = @@ -264,6 +272,153 @@ garrow_struct_array_flatten(GArrowStructArray *array, GError **error) } +G_DEFINE_TYPE(GArrowUnionArray, + garrow_union_array, + GARROW_TYPE_ARRAY) + +static void +garrow_union_array_init(GArrowUnionArray *object) +{ +} + +static void +garrow_union_array_class_init(GArrowUnionArrayClass *klass) +{ +} + +/** + * garrow_union_array_get_field + * @array: A #GArrowUnionArray. + * @i: The index of the field in the union. + * + * Returns: (nullable) (transfer full): The i-th field values as a + * #GArrowArray or %NULL on out of range. + */ +GArrowArray * +garrow_union_array_get_field(GArrowUnionArray *array, + gint i) +{ + auto arrow_array = garrow_array_get_raw(GARROW_ARRAY(array)); + auto arrow_union_array = + std::static_pointer_cast(arrow_array); + auto n_fields = arrow_array->num_fields(); + if (i < 0) { + i += n_fields; + } + if (i < 0) { + return NULL; + } + if (i >= n_fields) { + return NULL; + } + auto arrow_field_array = arrow_union_array->child(i); + return garrow_array_new_raw(&arrow_field_array); +} + + +G_DEFINE_TYPE(GArrowSparseUnionArray, + garrow_sparse_union_array, + GARROW_TYPE_UNION_ARRAY) + +static void +garrow_sparse_union_array_init(GArrowSparseUnionArray *object) +{ +} + +static void +garrow_sparse_union_array_class_init(GArrowSparseUnionArrayClass *klass) +{ +} + +/** + * garrow_sparse_union_array_new: + * @type_ids: The field type IDs for each value as #GArrowInt8Array. + * @fields: (element-type GArrowArray): The arrays for each field + * as #GList of #GArrowArray. 
+
+
+G_DEFINE_TYPE(GArrowSparseUnionArray,
+              garrow_sparse_union_array,
+              GARROW_TYPE_UNION_ARRAY)
+
+static void
+garrow_sparse_union_array_init(GArrowSparseUnionArray *object)
+{
+}
+
+static void
+garrow_sparse_union_array_class_init(GArrowSparseUnionArrayClass *klass)
+{
+}
+
+/**
+ * garrow_sparse_union_array_new:
+ * @type_ids: The field type IDs for each value as #GArrowInt8Array.
+ * @fields: (element-type GArrowArray): The arrays for each field
+ *   as #GList of #GArrowArray.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowSparseUnionArray
+ *   or %NULL on error.
+ *
+ * Since: 0.12.0
+ */
+GArrowSparseUnionArray *
+garrow_sparse_union_array_new(GArrowInt8Array *type_ids,
+                              GList *fields,
+                              GError **error)
+{
+  auto arrow_type_ids = garrow_array_get_raw(GARROW_ARRAY(type_ids));
+  std::vector<std::shared_ptr<arrow::Array>> arrow_fields;
+  for (auto node = fields; node; node = node->next) {
+    auto *field = GARROW_ARRAY(node->data);
+    arrow_fields.push_back(garrow_array_get_raw(field));
+  }
+  std::shared_ptr<arrow::Array> arrow_union_array;
+  auto status = arrow::UnionArray::MakeSparse(*arrow_type_ids,
+                                              arrow_fields,
+                                              &arrow_union_array);
+  if (garrow_error_check(error, status, "[sparse-union-array][new]")) {
+    return GARROW_SPARSE_UNION_ARRAY(garrow_array_new_raw(&arrow_union_array));
+  } else {
+    return NULL;
+  }
+}
+
+
+G_DEFINE_TYPE(GArrowDenseUnionArray,
+              garrow_dense_union_array,
+              GARROW_TYPE_UNION_ARRAY)
+
+static void
+garrow_dense_union_array_init(GArrowDenseUnionArray *object)
+{
+}
+
+static void
+garrow_dense_union_array_class_init(GArrowDenseUnionArrayClass *klass)
+{
+}
+
+/**
+ * garrow_dense_union_array_new:
+ * @type_ids: The field type IDs for each value as #GArrowInt8Array.
+ * @value_offsets: The value offsets for each value as #GArrowInt32Array.
+ *   Each offset is counted for each type.
+ * @fields: (element-type GArrowArray): The arrays for each field
+ *   as #GList of #GArrowArray.
+ * @error: (nullable): Return location for a #GError or %NULL.
+ *
+ * Returns: (nullable): A newly created #GArrowDenseUnionArray
+ *   or %NULL on error.
+ *
+ * Since: 0.12.0
+ */
+GArrowDenseUnionArray *
+garrow_dense_union_array_new(GArrowInt8Array *type_ids,
+                             GArrowInt32Array *value_offsets,
+                             GList *fields,
+                             GError **error)
+{
+  auto arrow_type_ids = garrow_array_get_raw(GARROW_ARRAY(type_ids));
+  auto arrow_value_offsets = garrow_array_get_raw(GARROW_ARRAY(value_offsets));
+  std::vector<std::shared_ptr<arrow::Array>> arrow_fields;
+  for (auto node = fields; node; node = node->next) {
+    auto *field = GARROW_ARRAY(node->data);
+    arrow_fields.push_back(garrow_array_get_raw(field));
+  }
+  std::shared_ptr<arrow::Array> arrow_union_array;
+  auto status = arrow::UnionArray::MakeDense(*arrow_type_ids,
+                                             *arrow_value_offsets,
+                                             arrow_fields,
+                                             &arrow_union_array);
+  if (garrow_error_check(error, status, "[dense-union-array][new]")) {
+    return GARROW_DENSE_UNION_ARRAY(garrow_array_new_raw(&arrow_union_array));
+  } else {
+    return NULL;
+  }
+}
+
+
 G_DEFINE_TYPE(GArrowDictionaryArray,
               garrow_dictionary_array,
               GARROW_TYPE_ARRAY)
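A sketch of building a sparse union with garrow_sparse_union_array_new(); in the sparse layout every child array spans the full length, and the two field arrays here are stand-ins for any children matching the type IDs:

static GArrowSparseUnionArray *
build_sparse_union(GArrowInt8Array *type_ids,
                   GArrowArray *field0,
                   GArrowArray *field1,
                   GError **error)
{
  /* field0 and field1 must each have the same length as type_ids;
   * type_ids picks which child is valid for each slot. */
  GList *fields = g_list_append(NULL, field0);
  fields = g_list_append(fields, field1);
  GArrowSparseUnionArray *array =
    garrow_sparse_union_array_new(type_ids, fields, error);
  g_list_free(fields);
  return array;
}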
diff --git a/c_glib/arrow-glib/composite-array.h b/c_glib/arrow-glib/composite-array.h
index ad6ad53ff9fc8..c634dbfc3b006 100644
--- a/c_glib/arrow-glib/composite-array.h
+++ b/c_glib/arrow-glib/composite-array.h
@@ -123,7 +123,7 @@ GType garrow_struct_array_get_type(void) G_GNUC_CONST;

 GArrowStructArray *garrow_struct_array_new(GArrowDataType *data_type,
                                            gint64 length,
-                                           GList *children,
+                                           GList *fields,
                                            GArrowBuffer *null_bitmap,
                                            gint64 n_nulls);

@@ -137,6 +137,56 @@ GARROW_AVAILABLE_IN_0_10
 GList *garrow_struct_array_flatten(GArrowStructArray *array,
                                    GError **error);

+#define GARROW_TYPE_UNION_ARRAY (garrow_union_array_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowUnionArray,
+                         garrow_union_array,
+                         GARROW,
+                         UNION_ARRAY,
+                         GArrowArray)
+struct _GArrowUnionArrayClass
+{
+  GArrowArrayClass parent_class;
+};
+
+GArrowArray *
+garrow_union_array_get_field(GArrowUnionArray *array,
+                             gint i);
+
+#define GARROW_TYPE_SPARSE_UNION_ARRAY (garrow_sparse_union_array_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionArray,
+                         garrow_sparse_union_array,
+                         GARROW,
+                         SPARSE_UNION_ARRAY,
+                         GArrowUnionArray)
+struct _GArrowSparseUnionArrayClass
+{
+  GArrowUnionArrayClass parent_class;
+};
+
+GArrowSparseUnionArray *
+garrow_sparse_union_array_new(GArrowInt8Array *type_ids,
+                              GList *fields,
+                              GError **error);
+
+
+#define GARROW_TYPE_DENSE_UNION_ARRAY (garrow_dense_union_array_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionArray,
+                         garrow_dense_union_array,
+                         GARROW,
+                         DENSE_UNION_ARRAY,
+                         GArrowUnionArray)
+struct _GArrowDenseUnionArrayClass
+{
+  GArrowUnionArrayClass parent_class;
+};
+
+GArrowDenseUnionArray *
+garrow_dense_union_array_new(GArrowInt8Array *type_ids,
+                             GArrowInt32Array *value_offsets,
+                             GList *fields,
+                             GError **error);
+
+
 #define GARROW_TYPE_DICTIONARY_ARRAY (garrow_dictionary_array_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowDictionaryArray,
                          garrow_dictionary_array,
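The dense variant declared above additionally takes per-value offsets into the selected child. A sketch, also showing the negative-index behavior of garrow_union_array_get_field():

static GArrowArray *
last_union_field(GArrowInt8Array *type_ids,
                 GArrowInt32Array *value_offsets,
                 GList *fields,
                 GError **error)
{
  /* Each value offset indexes into the child selected by the matching
   * type ID; offsets are counted per child, as documented above. */
  GArrowDenseUnionArray *array =
    garrow_dense_union_array_new(type_ids, value_offsets, fields, error);
  if (!array)
    return NULL;
  /* Negative indexes count from the end of the field list. */
  return garrow_union_array_get_field(GARROW_UNION_ARRAY(array), -1);
}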
+ *
+ * Returns: (transfer full) (element-type GArrowField):
+ *   The fields of the union data type.
+ *
+ * Since: 0.12.0
+ */
+GList *
+garrow_union_data_type_get_fields(GArrowUnionDataType *data_type)
+{
+  auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
+  auto arrow_fields = arrow_data_type->children();
+
+  GList *fields = NULL;
+  for (auto arrow_field : arrow_fields) {
+    fields = g_list_prepend(fields, garrow_field_new_raw(&arrow_field));
+  }
+  return g_list_reverse(fields);
+}
+
+/**
+ * garrow_union_data_type_get_field:
+ * @data_type: A #GArrowUnionDataType.
+ * @i: The index of the target field.
+ *
+ * Returns: (transfer full) (nullable):
+ *   The field at the index in the union data type or %NULL on not found.
+ *
+ * Since: 0.12.0
+ */
+GArrowField *
+garrow_union_data_type_get_field(GArrowUnionDataType *data_type,
+                                 gint i)
+{
+  auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
+
+  if (i < 0) {
+    i += arrow_data_type->num_children();
+  }
+  if (i < 0) {
+    return NULL;
+  }
+  if (i >= arrow_data_type->num_children()) {
+    return NULL;
+  }
+
+  auto arrow_field = arrow_data_type->child(i);
+  if (arrow_field) {
+    return garrow_field_new_raw(&arrow_field);
+  } else {
+    return NULL;
+  }
+}
+
+/**
+ * garrow_union_data_type_get_type_codes:
+ * @data_type: A #GArrowUnionDataType.
+ * @n_type_codes: (out): The number of type codes.
+ *
+ * Returns: (transfer full) (array length=n_type_codes):
+ *   The codes for each field.
+ *
+ * It should be freed with g_free() when no longer needed.
+ *
+ * Since: 0.12.0
+ */
+guint8 *
+garrow_union_data_type_get_type_codes(GArrowUnionDataType *data_type,
+                                      gsize *n_type_codes)
+{
+  auto arrow_data_type = garrow_data_type_get_raw(GARROW_DATA_TYPE(data_type));
+  auto arrow_union_data_type =
+    std::static_pointer_cast<arrow::UnionType>(arrow_data_type);
+
+  const auto arrow_type_codes = arrow_union_data_type->type_codes();
+  const auto n = arrow_type_codes.size();
+  auto type_codes = static_cast<guint8 *>(g_new(guint8, n));
+  for (size_t i = 0; i < n; ++i) {
+    type_codes[i] = arrow_type_codes[i];
+  }
+  *n_type_codes = n;
+  return type_codes;
+}
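A short C sketch of the accessors just added. The union `data_type` is assumed to exist; note that a negative index counts from the end, mirroring the wrap-around logic above, and that the returned type code array is (transfer full).

    /* Sketch (not part of the patch): inspect a union data type. */
    GArrowField *last_field =
      garrow_union_data_type_get_field(data_type, -1);  /* last field */
    gsize n_type_codes;
    guint8 *codes =
      garrow_union_data_type_get_type_codes(data_type, &n_type_codes);
    for (gsize i = 0; i < n_type_codes; ++i) {
      g_print("field %" G_GSIZE_FORMAT " has type code %u\n", i, codes[i]);
    }
    g_free(codes);                 /* (transfer full) array */
    if (last_field) {
      g_object_unref(last_field);  /* (transfer full) object */
    }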
+
+
+G_DEFINE_TYPE(GArrowSparseUnionDataType,
+              garrow_sparse_union_data_type,
+              GARROW_TYPE_UNION_DATA_TYPE)
+
+static void
+garrow_sparse_union_data_type_init(GArrowSparseUnionDataType *object)
+{
+}
+
+static void
+garrow_sparse_union_data_type_class_init(GArrowSparseUnionDataTypeClass *klass)
+{
+}
+
+/**
+ * garrow_sparse_union_data_type_new:
+ * @fields: (element-type GArrowField): The fields of the union.
+ * @type_codes: (array length=n_type_codes): The codes to specify each field.
+ * @n_type_codes: The number of type codes.
+ *
+ * Returns: The newly created sparse union data type.
+ */
+GArrowSparseUnionDataType *
+garrow_sparse_union_data_type_new(GList *fields,
+                                  guint8 *type_codes,
+                                  gsize n_type_codes)
+{
+  std::vector<std::shared_ptr<arrow::Field>> arrow_fields;
+  for (auto node = fields; node; node = g_list_next(node)) {
+    auto field = GARROW_FIELD(node->data);
+    auto arrow_field = garrow_field_get_raw(field);
+    arrow_fields.push_back(arrow_field);
+  }
+
+  std::vector<uint8_t> arrow_type_codes;
+  for (gsize i = 0; i < n_type_codes; ++i) {
+    arrow_type_codes.push_back(type_codes[i]);
+  }
+
+  auto arrow_data_type =
+    std::make_shared<arrow::UnionType>(arrow_fields,
+                                       arrow_type_codes,
+                                       arrow::UnionMode::SPARSE);
+  auto data_type = g_object_new(GARROW_TYPE_SPARSE_UNION_DATA_TYPE,
+                                "data-type", &arrow_data_type,
+                                NULL);
+  return GARROW_SPARSE_UNION_DATA_TYPE(data_type);
+}
+
+
+G_DEFINE_TYPE(GArrowDenseUnionDataType,
+              garrow_dense_union_data_type,
+              GARROW_TYPE_UNION_DATA_TYPE)
+
+static void
+garrow_dense_union_data_type_init(GArrowDenseUnionDataType *object)
+{
+}
+
+static void
+garrow_dense_union_data_type_class_init(GArrowDenseUnionDataTypeClass *klass)
+{
+}
+
+/**
+ * garrow_dense_union_data_type_new:
+ * @fields: (element-type GArrowField): The fields of the union.
+ * @type_codes: (array length=n_type_codes): The codes to specify each field.
+ * @n_type_codes: The number of type codes.
+ *
+ * Returns: The newly created dense union data type.
+ */
+GArrowDenseUnionDataType *
+garrow_dense_union_data_type_new(GList *fields,
+                                 guint8 *type_codes,
+                                 gsize n_type_codes)
+{
+  std::vector<std::shared_ptr<arrow::Field>> arrow_fields;
+  for (auto node = fields; node; node = g_list_next(node)) {
+    auto field = GARROW_FIELD(node->data);
+    auto arrow_field = garrow_field_get_raw(field);
+    arrow_fields.push_back(arrow_field);
+  }
+
+  std::vector<uint8_t> arrow_type_codes;
+  for (gsize i = 0; i < n_type_codes; ++i) {
+    arrow_type_codes.push_back(type_codes[i]);
+  }
+
+  auto arrow_data_type =
+    std::make_shared<arrow::UnionType>(arrow_fields,
+                                       arrow_type_codes,
+                                       arrow::UnionMode::DENSE);
+  auto data_type = g_object_new(GARROW_TYPE_DENSE_UNION_DATA_TYPE,
+                                "data-type", &arrow_data_type,
+                                NULL);
+  return GARROW_DENSE_UNION_DATA_TYPE(data_type);
+}
+
+
 G_DEFINE_TYPE(GArrowDictionaryDataType,
               garrow_dictionary_data_type,
               GARROW_TYPE_FIXED_WIDTH_DATA_TYPE)
diff --git a/c_glib/arrow-glib/composite-data-type.h b/c_glib/arrow-glib/composite-data-type.h
index 7d6a02b1c77d9..25e1ac3d94929 100644
--- a/c_glib/arrow-glib/composite-data-type.h
+++ b/c_glib/arrow-glib/composite-data-type.h
@@ -96,6 +96,66 @@ gint
 garrow_struct_data_type_get_field_index(GArrowStructDataType *data_type,
                                         const gchar *name);
 
+
+#define GARROW_TYPE_UNION_DATA_TYPE (garrow_union_data_type_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowUnionDataType,
+                         garrow_union_data_type,
+                         GARROW,
+                         UNION_DATA_TYPE,
+                         GArrowDataType)
+struct _GArrowUnionDataTypeClass
+{
+  GArrowDataTypeClass parent_class;
+};
+
+gint
+garrow_union_data_type_get_n_fields(GArrowUnionDataType *data_type);
+GList *
+garrow_union_data_type_get_fields(GArrowUnionDataType *data_type);
+GArrowField *
+garrow_union_data_type_get_field(GArrowUnionDataType *data_type,
+                                 gint i);
+guint8 *
+garrow_union_data_type_get_type_codes(GArrowUnionDataType *data_type,
+                                      gsize *n_type_codes);
+
+
+#define GARROW_TYPE_SPARSE_UNION_DATA_TYPE \
+  (garrow_sparse_union_data_type_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowSparseUnionDataType,
+                         garrow_sparse_union_data_type,
+                         GARROW,
+                         SPARSE_UNION_DATA_TYPE,
+                         GArrowUnionDataType)
+struct _GArrowSparseUnionDataTypeClass
+{
+  GArrowUnionDataTypeClass parent_class;
+};
+
+GArrowSparseUnionDataType *
+garrow_sparse_union_data_type_new(GList *fields,
+                                  guint8 *type_codes,
+                                  gsize n_type_codes);
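For reference, a minimal C sketch of building a sparse union data type with the constructor declared above. The field names and the type codes 4 and 8 are arbitrary caller-chosen values, not anything mandated by the patch.

    /* Sketch (not part of the patch): a sparse union over an int32
     * field and a string field. */
    GArrowDataType *int32_type =
      GARROW_DATA_TYPE(garrow_int32_data_type_new());
    GArrowDataType *string_type =
      GARROW_DATA_TYPE(garrow_string_data_type_new());
    GList *fields = NULL;
    fields = g_list_append(fields, garrow_field_new("number", int32_type));
    fields = g_list_append(fields, garrow_field_new("text", string_type));
    guint8 type_codes[] = {4, 8};
    GArrowSparseUnionDataType *data_type =
      garrow_sparse_union_data_type_new(fields, type_codes, 2);
    g_list_free_full(fields, g_object_unref);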
+
+
+#define GARROW_TYPE_DENSE_UNION_DATA_TYPE \
+  (garrow_dense_union_data_type_get_type())
+G_DECLARE_DERIVABLE_TYPE(GArrowDenseUnionDataType,
+                         garrow_dense_union_data_type,
+                         GARROW,
+                         DENSE_UNION_DATA_TYPE,
+                         GArrowUnionDataType)
+struct _GArrowDenseUnionDataTypeClass
+{
+  GArrowUnionDataTypeClass parent_class;
+};
+
+GArrowDenseUnionDataType *
+garrow_dense_union_data_type_new(GList *fields,
+                                 guint8 *type_codes,
+                                 gsize n_type_codes);
+
+
 #define GARROW_TYPE_DICTIONARY_DATA_TYPE (garrow_dictionary_data_type_get_type())
 G_DECLARE_DERIVABLE_TYPE(GArrowDictionaryDataType,
                          garrow_dictionary_data_type,
diff --git a/c_glib/arrow-glib/decimal.cpp b/c_glib/arrow-glib/decimal.cpp
index 67b2d43b1018a..34eb417a96105 100644
--- a/c_glib/arrow-glib/decimal.cpp
+++ b/c_glib/arrow-glib/decimal.cpp
@@ -141,9 +141,9 @@ garrow_decimal128_new_integer(const gint64 data)
  * @decimal: A #GArrowDecimal128.
  * @scale: The scale of the decimal.
  *
- * Returns: The string representation of the decimal.
+ * Returns: (transfer full): The string representation of the decimal.
  *
- * It should be freed with g_free() when no longer needed.
+ * It should be freed with g_free() when no longer needed.
  *
  * Since: 0.10.0
  */
@@ -159,9 +159,9 @@ garrow_decimal128_to_string_scale(GArrowDecimal128 *decimal, gint32 scale)
  * garrow_decimal128_to_string:
  * @decimal: A #GArrowDecimal128.
  *
- * Returns: The string representation of the decimal.
+ * Returns: (transfer full): The string representation of the decimal.
  *
- * It should be freed with g_free() when no longer needed.
+ * It should be freed with g_free() when no longer needed.
  *
  * Since: 0.10.0
  */
diff --git a/c_glib/arrow-glib/reader.cpp b/c_glib/arrow-glib/reader.cpp
index c6c96670ba4b6..b4afde31406d3 100644
--- a/c_glib/arrow-glib/reader.cpp
+++ b/c_glib/arrow-glib/reader.cpp
@@ -645,9 +645,11 @@ garrow_feather_file_reader_new(GArrowSeekableInputStream *file,
  * garrow_feather_file_reader_get_description:
  * @reader: A #GArrowFeatherFileReader.
  *
- * Returns: (nullable): The description of the file if it exists,
+ * Returns: (nullable) (transfer full):
+ *   The description of the file if it exists,
  *   %NULL otherwise. You can confirm whether description exists or not by
  *   garrow_feather_file_reader_has_description().
+ *
  * It should be freed with g_free() when no longer needed.
  *
  * Since: 0.4.0
@@ -730,7 +732,8 @@ garrow_feather_file_reader_get_n_columns(GArrowFeatherFileReader *reader)
  * @reader: A #GArrowFeatherFileReader.
  * @i: The index of the target column.
  *
- * Returns: The i-th column name in the file.
+ * Returns: (transfer full): The i-th column name in the file.
+ *
  * It should be freed with g_free() when no longer needed.
  *
  * Since: 0.4.0
diff --git a/c_glib/arrow-glib/record-batch.cpp b/c_glib/arrow-glib/record-batch.cpp
index f905b065de6e3..04d442b409a8c 100644
--- a/c_glib/arrow-glib/record-batch.cpp
+++ b/c_glib/arrow-glib/record-batch.cpp
@@ -331,7 +331,8 @@ garrow_record_batch_slice(GArrowRecordBatch *record_batch,
  * @record_batch: A #GArrowRecordBatch.
  * @error: (nullable): Return location for a #GError or %NULL.
  *
- * Returns: (nullable): The formatted record batch content or %NULL on error.
+ * Returns: (nullable) (transfer full):
+ *   The formatted record batch content or %NULL on error.
  *
  * The returned string should be freed when with g_free() when no
  * longer needed.
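The (transfer full) annotations added in the hunks above all document the same ownership convention: the returned string belongs to the caller. A minimal C sketch of that pattern, assuming a `record_batch` built elsewhere:

    /* Sketch (not part of the patch): the caller owns the result. */
    GError *error = NULL;
    gchar *formatted = garrow_record_batch_to_string(record_batch, &error);
    if (formatted) {
      g_print("%s\n", formatted);
      g_free(formatted);
    } else {
      g_print("failed to format record batch: %s\n", error->message);
      g_error_free(error);
    }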
diff --git a/c_glib/arrow-glib/table.cpp b/c_glib/arrow-glib/table.cpp index b4d0d2c6d862f..f9e1b951a3658 100644 --- a/c_glib/arrow-glib/table.cpp +++ b/c_glib/arrow-glib/table.cpp @@ -313,7 +313,8 @@ garrow_table_replace_column(GArrowTable *table, * @table: A #GArrowTable. * @error: (nullable): Return location for a #GError or %NULL. * - * Returns: (nullable): The formatted table content or %NULL on error. + * Returns: (nullable) (transfer full): + * The formatted table content or %NULL on error. * * The returned string should be freed when with g_free() when no * longer needed. diff --git a/c_glib/arrow-glib/tensor.cpp b/c_glib/arrow-glib/tensor.cpp index ff2683de4ed09..46ae7beec2675 100644 --- a/c_glib/arrow-glib/tensor.cpp +++ b/c_glib/arrow-glib/tensor.cpp @@ -281,7 +281,9 @@ garrow_tensor_get_buffer(GArrowTensor *tensor) * @tensor: A #GArrowTensor. * @n_dimensions: (out): The number of dimensions. * - * Returns: (array length=n_dimensions): The shape of the tensor. + * Returns: (array length=n_dimensions) (transfer full): + * The shape of the tensor. + * * It should be freed with g_free() when no longer needed. * * Since: 0.3.0 @@ -306,7 +308,9 @@ garrow_tensor_get_shape(GArrowTensor *tensor, gint *n_dimensions) * @tensor: A #GArrowTensor. * @n_strides: (out): The number of strides. * - * Returns: (array length=n_strides): The strides of the tensor. + * Returns: (array length=n_strides) (transfer full): + * The strides of the tensor. + * * It should be freed with g_free() when no longer needed. * * Since: 0.3.0 diff --git a/c_glib/arrow-gpu-glib/cuda.cpp b/c_glib/arrow-gpu-glib/cuda.cpp deleted file mode 100644 index 6d2e48f351e95..0000000000000 --- a/c_glib/arrow-gpu-glib/cuda.cpp +++ /dev/null @@ -1,942 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifdef HAVE_CONFIG_H -# include -#endif - -#include -#include -#include -#include -#include -#include -#include - -#include - -G_BEGIN_DECLS - -/** - * SECTION: cuda - * @section_id: cuda-classes - * @title: CUDA related classes - * @include: arrow-gpu-glib/arrow-gpu-glib.h - * - * The following classes provide CUDA support for Apache Arrow data. - * - * #GArrowGPUCUDADeviceManager is the starting point. You need at - * least one #GArrowGPUCUDAContext to process Apache Arrow data on - * NVIDIA GPU. - * - * #GArrowGPUCUDAContext is a class to keep context for one GPU. You - * need to create #GArrowGPUCUDAContext for each GPU that you want to - * use. You can create #GArrowGPUCUDAContext by - * garrow_gpu_cuda_device_manager_get_context(). - * - * #GArrowGPUCUDABuffer is a class for data on GPU. You can copy data - * on GPU to/from CPU by garrow_gpu_cuda_buffer_copy_to_host() and - * garrow_gpu_cuda_buffer_copy_from_host(). 
You can share data on GPU - * with other processes by garrow_gpu_cuda_buffer_export() and - * garrow_gpu_cuda_buffer_new_ipc(). - * - * #GArrowGPUCUDAHostBuffer is a class for data on CPU that is - * directly accessible from GPU. - * - * #GArrowGPUCUDAIPCMemoryHandle is a class to share data on GPU with - * other processes. You can export your data on GPU to other processes - * by garrow_gpu_cuda_buffer_export() and - * garrow_gpu_cuda_ipc_memory_handle_new(). You can import other - * process data on GPU by garrow_gpu_cuda_ipc_memory_handle_new() and - * garrow_gpu_cuda_buffer_new_ipc(). - * - * #GArrowGPUCUDABufferInputStream is a class to read data in - * #GArrowGPUCUDABuffer. - * - * #GArrowGPUCUDABufferOutputStream is a class to write data into - * #GArrowGPUCUDABuffer. - */ - -G_DEFINE_TYPE(GArrowGPUCUDADeviceManager, - garrow_gpu_cuda_device_manager, - G_TYPE_OBJECT) - -static void -garrow_gpu_cuda_device_manager_init(GArrowGPUCUDADeviceManager *object) -{ -} - -static void -garrow_gpu_cuda_device_manager_class_init(GArrowGPUCUDADeviceManagerClass *klass) -{ -} - -/** - * garrow_gpu_cuda_device_manager_new: - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: A newly created #GArrowGPUCUDADeviceManager on success, - * %NULL on error. - * - * Since: 0.8.0 - */ -GArrowGPUCUDADeviceManager * -garrow_gpu_cuda_device_manager_new(GError **error) -{ - arrow::gpu::CudaDeviceManager *manager; - auto status = arrow::gpu::CudaDeviceManager::GetInstance(&manager); - if (garrow_error_check(error, status, "[gpu][cuda][device-manager][new]")) { - auto manager = g_object_new(GARROW_GPU_TYPE_CUDA_DEVICE_MANAGER, - NULL); - return GARROW_GPU_CUDA_DEVICE_MANAGER(manager); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_device_manager_get_context: - * @manager: A #GArrowGPUCUDADeviceManager. - * @gpu_number: A GPU device number for the target context. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowGPUCUDAContext on - * success, %NULL on error. Contexts for the same GPU device number - * share the same data internally. - * - * Since: 0.8.0 - */ -GArrowGPUCUDAContext * -garrow_gpu_cuda_device_manager_get_context(GArrowGPUCUDADeviceManager *manager, - gint gpu_number, - GError **error) -{ - arrow::gpu::CudaDeviceManager *arrow_manager; - arrow::gpu::CudaDeviceManager::GetInstance(&arrow_manager); - std::shared_ptr context; - auto status = arrow_manager->GetContext(gpu_number, &context); - if (garrow_error_check(error, status, - "[gpu][cuda][device-manager][get-context]]")) { - return garrow_gpu_cuda_context_new_raw(&context); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_device_manager_get_n_devices: - * @manager: A #GArrowGPUCUDADeviceManager. - * - * Returns: The number of GPU devices. 
- * - * Since: 0.8.0 - */ -gsize -garrow_gpu_cuda_device_manager_get_n_devices(GArrowGPUCUDADeviceManager *manager) -{ - arrow::gpu::CudaDeviceManager *arrow_manager; - arrow::gpu::CudaDeviceManager::GetInstance(&arrow_manager); - return arrow_manager->num_devices(); -} - - -typedef struct GArrowGPUCUDAContextPrivate_ { - std::shared_ptr context; -} GArrowGPUCUDAContextPrivate; - -enum { - PROP_CONTEXT = 1 -}; - -G_DEFINE_TYPE_WITH_PRIVATE(GArrowGPUCUDAContext, - garrow_gpu_cuda_context, - G_TYPE_OBJECT) - -#define GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object) \ - static_cast( \ - garrow_gpu_cuda_context_get_instance_private( \ - GARROW_GPU_CUDA_CONTEXT(object))) - -static void -garrow_gpu_cuda_context_finalize(GObject *object) -{ - auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object); - - priv->context = nullptr; - - G_OBJECT_CLASS(garrow_gpu_cuda_context_parent_class)->finalize(object); -} - -static void -garrow_gpu_cuda_context_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_CONTEXT: - priv->context = - *static_cast *>(g_value_get_pointer(value)); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_gpu_cuda_context_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - switch (prop_id) { - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_gpu_cuda_context_init(GArrowGPUCUDAContext *object) -{ -} - -static void -garrow_gpu_cuda_context_class_init(GArrowGPUCUDAContextClass *klass) -{ - GParamSpec *spec; - - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = garrow_gpu_cuda_context_finalize; - gobject_class->set_property = garrow_gpu_cuda_context_set_property; - gobject_class->get_property = garrow_gpu_cuda_context_get_property; - - /** - * GArrowGPUCUDAContext:context: - * - * Since: 0.8.0 - */ - spec = g_param_spec_pointer("context", - "Context", - "The raw std::shared_ptr", - static_cast(G_PARAM_WRITABLE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_CONTEXT, spec); -} - -/** - * garrow_gpu_cuda_context_get_allocated_size: - * @context: A #GArrowGPUCUDAContext. - * - * Returns: The allocated memory by this context in bytes. - * - * Since: 0.8.0 - */ -gint64 -garrow_gpu_cuda_context_get_allocated_size(GArrowGPUCUDAContext *context) -{ - auto arrow_context = garrow_gpu_cuda_context_get_raw(context); - return arrow_context->bytes_allocated(); -} - - -G_DEFINE_TYPE(GArrowGPUCUDABuffer, - garrow_gpu_cuda_buffer, - GARROW_TYPE_BUFFER) - -static void -garrow_gpu_cuda_buffer_init(GArrowGPUCUDABuffer *object) -{ -} - -static void -garrow_gpu_cuda_buffer_class_init(GArrowGPUCUDABufferClass *klass) -{ -} - -/** - * garrow_gpu_cuda_buffer_new: - * @context: A #GArrowGPUCUDAContext. - * @size: The number of bytes to be allocated on GPU device for this context. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on - * success, %NULL on error. 
- * - * Since: 0.8.0 - */ -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new(GArrowGPUCUDAContext *context, - gint64 size, - GError **error) -{ - auto arrow_context = garrow_gpu_cuda_context_get_raw(context); - std::shared_ptr arrow_buffer; - auto status = arrow_context->Allocate(size, &arrow_buffer); - if (garrow_error_check(error, status, "[gpu][cuda][buffer][new]")) { - return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_buffer_new_ipc: - * @context: A #GArrowGPUCUDAContext. - * @handle: A #GArrowGPUCUDAIPCMemoryHandle to be communicated. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on - * success, %NULL on error. The buffer has data from the IPC target. - * - * Since: 0.8.0 - */ -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new_ipc(GArrowGPUCUDAContext *context, - GArrowGPUCUDAIPCMemoryHandle *handle, - GError **error) -{ - auto arrow_context = garrow_gpu_cuda_context_get_raw(context); - auto arrow_handle = garrow_gpu_cuda_ipc_memory_handle_get_raw(handle); - std::shared_ptr arrow_buffer; - auto status = arrow_context->OpenIpcBuffer(*arrow_handle, &arrow_buffer); - if (garrow_error_check(error, status, - "[gpu][cuda][buffer][new-ipc]")) { - return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_buffer_new_record_batch: - * @context: A #GArrowGPUCUDAContext. - * @record_batch: A #GArrowRecordBatch to be serialized. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowGPUCUDABuffer on - * success, %NULL on error. The buffer has serialized record batch - * data. - * - * Since: 0.8.0 - */ -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new_record_batch(GArrowGPUCUDAContext *context, - GArrowRecordBatch *record_batch, - GError **error) -{ - auto arrow_context = garrow_gpu_cuda_context_get_raw(context); - auto arrow_record_batch = garrow_record_batch_get_raw(record_batch); - std::shared_ptr arrow_buffer; - auto status = arrow::gpu::SerializeRecordBatch(*arrow_record_batch, - arrow_context.get(), - &arrow_buffer); - if (garrow_error_check(error, status, - "[gpu][cuda][buffer][new-record-batch]")) { - return garrow_gpu_cuda_buffer_new_raw(&arrow_buffer); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_buffer_copy_to_host: - * @buffer: A #GArrowGPUCUDABuffer. - * @position: The offset of memory on GPU device to be copied. - * @size: The size of memory on GPU device to be copied in bytes. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A #GBytes that have copied memory on CPU - * host on success, %NULL on error. - * - * Since: 0.8.0 - */ -GBytes * -garrow_gpu_cuda_buffer_copy_to_host(GArrowGPUCUDABuffer *buffer, - gint64 position, - gint64 size, - GError **error) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - auto data = static_cast(g_malloc(size)); - auto status = arrow_buffer->CopyToHost(position, size, data); - if (garrow_error_check(error, status, "[gpu][cuda][buffer][copy-to-host]")) { - return g_bytes_new_take(data, size); - } else { - g_free(data); - return NULL; - } -} - -/** - * garrow_gpu_cuda_buffer_copy_from_host: - * @buffer: A #GArrowGPUCUDABuffer. - * @data: (array length=size): Data on CPU host to be copied. - * @size: The size of data on CPU host to be copied in bytes. 
- * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: %TRUE on success, %FALSE if there was an error. - * - * Since: 0.8.0 - */ -gboolean -garrow_gpu_cuda_buffer_copy_from_host(GArrowGPUCUDABuffer *buffer, - const guint8 *data, - gint64 size, - GError **error) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - auto status = arrow_buffer->CopyFromHost(0, data, size); - return garrow_error_check(error, - status, - "[gpu][cuda][buffer][copy-from-host]"); -} - -/** - * garrow_gpu_cuda_buffer_export: - * @buffer: A #GArrowGPUCUDABuffer. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created - * #GArrowGPUCUDAIPCMemoryHandle to handle the exported buffer on - * success, %NULL on error - * - * Since: 0.8.0 - */ -GArrowGPUCUDAIPCMemoryHandle * -garrow_gpu_cuda_buffer_export(GArrowGPUCUDABuffer *buffer, GError **error) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - std::shared_ptr arrow_handle; - auto status = arrow_buffer->ExportForIpc(&arrow_handle); - if (garrow_error_check(error, status, "[gpu][cuda][buffer][export-for-ipc]")) { - return garrow_gpu_cuda_ipc_memory_handle_new_raw(&arrow_handle); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_buffer_get_context: - * @buffer: A #GArrowGPUCUDABuffer. - * - * Returns: (transfer full): A newly created #GArrowGPUCUDAContext for the - * buffer. Contexts for the same buffer share the same data internally. - * - * Since: 0.8.0 - */ -GArrowGPUCUDAContext * -garrow_gpu_cuda_buffer_get_context(GArrowGPUCUDABuffer *buffer) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - auto arrow_context = arrow_buffer->context(); - return garrow_gpu_cuda_context_new_raw(&arrow_context); -} - -/** - * garrow_gpu_cuda_buffer_read_record_batch: - * @buffer: A #GArrowGPUCUDABuffer. - * @schema: A #GArrowSchema for record batch. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowRecordBatch on - * success, %NULL on error. The record batch data is located on GPU. - * - * Since: 0.8.0 - */ -GArrowRecordBatch * -garrow_gpu_cuda_buffer_read_record_batch(GArrowGPUCUDABuffer *buffer, - GArrowSchema *schema, - GError **error) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - auto arrow_schema = garrow_schema_get_raw(schema); - auto pool = arrow::default_memory_pool(); - std::shared_ptr arrow_record_batch; - auto status = arrow::gpu::ReadRecordBatch(arrow_schema, - arrow_buffer, - pool, - &arrow_record_batch); - if (garrow_error_check(error, status, - "[gpu][cuda][buffer][read-record-batch]")) { - return garrow_record_batch_new_raw(&arrow_record_batch); - } else { - return NULL; - } -} - - -G_DEFINE_TYPE(GArrowGPUCUDAHostBuffer, - garrow_gpu_cuda_host_buffer, - GARROW_TYPE_MUTABLE_BUFFER) - -static void -garrow_gpu_cuda_host_buffer_init(GArrowGPUCUDAHostBuffer *object) -{ -} - -static void -garrow_gpu_cuda_host_buffer_class_init(GArrowGPUCUDAHostBufferClass *klass) -{ -} - -/** - * garrow_gpu_cuda_host_buffer_new: - * @gpu_number: A GPU device number for the target context. - * @size: The number of bytes to be allocated on CPU host. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: A newly created #GArrowGPUCUDAHostBuffer on success, - * %NULL on error. The allocated memory is accessible from GPU - * device for the @context. 
- * - * Since: 0.8.0 - */ -GArrowGPUCUDAHostBuffer * -garrow_gpu_cuda_host_buffer_new(gint gpu_number, gint64 size, GError **error) -{ - arrow::gpu::CudaDeviceManager *manager; - auto status = arrow::gpu::CudaDeviceManager::GetInstance(&manager); - std::shared_ptr arrow_buffer; - status = manager->AllocateHost(gpu_number, size, &arrow_buffer); - if (garrow_error_check(error, status, "[gpu][cuda][host-buffer][new]")) { - return garrow_gpu_cuda_host_buffer_new_raw(&arrow_buffer); - } else { - return NULL; - } -} - - -typedef struct GArrowGPUCUDAIPCMemoryHandlePrivate_ { - std::shared_ptr ipc_memory_handle; -} GArrowGPUCUDAIPCMemoryHandlePrivate; - -enum { - PROP_IPC_MEMORY_HANDLE = 1 -}; - -G_DEFINE_TYPE_WITH_PRIVATE(GArrowGPUCUDAIPCMemoryHandle, - garrow_gpu_cuda_ipc_memory_handle, - G_TYPE_OBJECT) - -#define GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object) \ - static_cast( \ - garrow_gpu_cuda_ipc_memory_handle_get_instance_private( \ - GARROW_GPU_CUDA_IPC_MEMORY_HANDLE(object))) - -static void -garrow_gpu_cuda_ipc_memory_handle_finalize(GObject *object) -{ - auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object); - - priv->ipc_memory_handle = nullptr; - - G_OBJECT_CLASS(garrow_gpu_cuda_ipc_memory_handle_parent_class)->finalize(object); -} - -static void -garrow_gpu_cuda_ipc_memory_handle_set_property(GObject *object, - guint prop_id, - const GValue *value, - GParamSpec *pspec) -{ - auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(object); - - switch (prop_id) { - case PROP_IPC_MEMORY_HANDLE: - priv->ipc_memory_handle = - *static_cast *>(g_value_get_pointer(value)); - break; - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_gpu_cuda_ipc_memory_handle_get_property(GObject *object, - guint prop_id, - GValue *value, - GParamSpec *pspec) -{ - switch (prop_id) { - default: - G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); - break; - } -} - -static void -garrow_gpu_cuda_ipc_memory_handle_init(GArrowGPUCUDAIPCMemoryHandle *object) -{ -} - -static void -garrow_gpu_cuda_ipc_memory_handle_class_init(GArrowGPUCUDAIPCMemoryHandleClass *klass) -{ - GParamSpec *spec; - - auto gobject_class = G_OBJECT_CLASS(klass); - - gobject_class->finalize = garrow_gpu_cuda_ipc_memory_handle_finalize; - gobject_class->set_property = garrow_gpu_cuda_ipc_memory_handle_set_property; - gobject_class->get_property = garrow_gpu_cuda_ipc_memory_handle_get_property; - - /** - * GArrowGPUCUDAIPCMemoryHandle:ipc-memory-handle: - * - * Since: 0.8.0 - */ - spec = g_param_spec_pointer("ipc-memory-handle", - "IPC Memory Handle", - "The raw std::shared_ptr", - static_cast(G_PARAM_WRITABLE | - G_PARAM_CONSTRUCT_ONLY)); - g_object_class_install_property(gobject_class, PROP_IPC_MEMORY_HANDLE, spec); -} - -/** - * garrow_gpu_cuda_ipc_memory_handle_new: - * @data: (array length=size): A serialized #GArrowGPUCUDAIPCMemoryHandle. - * @size: The size of data. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowGPUCUDAIPCMemoryHandle - * on success, %NULL on error. 
- * - * Since: 0.8.0 - */ -GArrowGPUCUDAIPCMemoryHandle * -garrow_gpu_cuda_ipc_memory_handle_new(const guint8 *data, - gsize size, - GError **error) -{ - std::shared_ptr arrow_handle; - auto status = arrow::gpu::CudaIpcMemHandle::FromBuffer(data, &arrow_handle); - if (garrow_error_check(error, status, - "[gpu][cuda][ipc-memory-handle][new]")) { - return garrow_gpu_cuda_ipc_memory_handle_new_raw(&arrow_handle); - } else { - return NULL; - } -} - -/** - * garrow_gpu_cuda_ipc_memory_handle_serialize: - * @handle: A #GArrowGPUCUDAIPCMemoryHandle. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: (transfer full): A newly created #GArrowBuffer on success, - * %NULL on error. The buffer has serialized @handle. The serialized - * @handle can be deserialized by garrow_gpu_cuda_ipc_memory_handle_new() - * in other process. - * - * Since: 0.8.0 - */ -GArrowBuffer * -garrow_gpu_cuda_ipc_memory_handle_serialize(GArrowGPUCUDAIPCMemoryHandle *handle, - GError **error) -{ - auto arrow_handle = garrow_gpu_cuda_ipc_memory_handle_get_raw(handle); - std::shared_ptr arrow_buffer; - auto status = arrow_handle->Serialize(arrow::default_memory_pool(), - &arrow_buffer); - if (garrow_error_check(error, status, - "[gpu][cuda][ipc-memory-handle][serialize]")) { - return garrow_buffer_new_raw(&arrow_buffer); - } else { - return NULL; - } -} - -GArrowBuffer * -garrow_gpu_cuda_buffer_input_stream_new_raw_readable_interface(std::shared_ptr *arrow_buffer) -{ - auto buffer = GARROW_BUFFER(g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER, - "buffer", arrow_buffer, - NULL)); - return buffer; -} - -static std::shared_ptr -garrow_gpu_cuda_buffer_input_stream_get_raw_readable_interface(GArrowReadable *readable) -{ - auto input_stream = GARROW_INPUT_STREAM(readable); - auto arrow_input_stream = garrow_input_stream_get_raw(input_stream); - return arrow_input_stream; -} - -static void -garrow_gpu_cuda_buffer_input_stream_readable_interface_init(GArrowReadableInterface *iface) -{ - iface->new_raw = - garrow_gpu_cuda_buffer_input_stream_new_raw_readable_interface; - iface->get_raw = - garrow_gpu_cuda_buffer_input_stream_get_raw_readable_interface; -} - -G_DEFINE_TYPE_WITH_CODE( - GArrowGPUCUDABufferInputStream, - garrow_gpu_cuda_buffer_input_stream, - GARROW_TYPE_BUFFER_INPUT_STREAM, - G_IMPLEMENT_INTERFACE( - GARROW_TYPE_READABLE, - garrow_gpu_cuda_buffer_input_stream_readable_interface_init)) - -static void -garrow_gpu_cuda_buffer_input_stream_init(GArrowGPUCUDABufferInputStream *object) -{ -} - -static void -garrow_gpu_cuda_buffer_input_stream_class_init(GArrowGPUCUDABufferInputStreamClass *klass) -{ -} - -/** - * garrow_gpu_cuda_buffer_input_stream_new: - * @buffer: A #GArrowGPUCUDABuffer. - * - * Returns: (transfer full): A newly created - * #GArrowGPUCUDABufferInputStream. 
- * - * Since: 0.8.0 - */ -GArrowGPUCUDABufferInputStream * -garrow_gpu_cuda_buffer_input_stream_new(GArrowGPUCUDABuffer *buffer) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - auto arrow_reader = - std::make_shared(arrow_buffer); - return garrow_gpu_cuda_buffer_input_stream_new_raw(&arrow_reader); -} - - -G_DEFINE_TYPE(GArrowGPUCUDABufferOutputStream, - garrow_gpu_cuda_buffer_output_stream, - GARROW_TYPE_OUTPUT_STREAM) - -static void -garrow_gpu_cuda_buffer_output_stream_init(GArrowGPUCUDABufferOutputStream *object) -{ -} - -static void -garrow_gpu_cuda_buffer_output_stream_class_init(GArrowGPUCUDABufferOutputStreamClass *klass) -{ -} - -/** - * garrow_gpu_cuda_buffer_output_stream_new: - * @buffer: A #GArrowGPUCUDABuffer. - * - * Returns: (transfer full): A newly created - * #GArrowGPUCUDABufferOutputStream. - * - * Since: 0.8.0 - */ -GArrowGPUCUDABufferOutputStream * -garrow_gpu_cuda_buffer_output_stream_new(GArrowGPUCUDABuffer *buffer) -{ - auto arrow_buffer = garrow_gpu_cuda_buffer_get_raw(buffer); - auto arrow_writer = - std::make_shared(arrow_buffer); - return garrow_gpu_cuda_buffer_output_stream_new_raw(&arrow_writer); -} - -/** - * garrow_gpu_cuda_buffer_output_stream_set_buffer_size: - * @stream: A #GArrowGPUCUDABufferOutputStream. - * @size: A size of CPU buffer in bytes. - * @error: (nullable): Return location for a #GError or %NULL. - * - * Returns: %TRUE on success, %FALSE if there was an error. - * - * Sets CPU buffer size. to limit `cudaMemcpy()` calls. If CPU buffer - * size is `0`, buffering is disabled. - * - * The default is `0`. - * - * Since: 0.8.0 - */ -gboolean -garrow_gpu_cuda_buffer_output_stream_set_buffer_size(GArrowGPUCUDABufferOutputStream *stream, - gint64 size, - GError **error) -{ - auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); - auto status = arrow_stream->SetBufferSize(size); - return garrow_error_check(error, - status, - "[gpu][cuda][buffer-output-stream][set-buffer-size]"); -} - -/** - * garrow_gpu_cuda_buffer_output_stream_get_buffer_size: - * @stream: A #GArrowGPUCUDABufferOutputStream. - * - * Returns: The CPU buffer size in bytes. - * - * See garrow_gpu_cuda_buffer_output_stream_set_buffer_size() for CPU - * buffer size details. - * - * Since: 0.8.0 - */ -gint64 -garrow_gpu_cuda_buffer_output_stream_get_buffer_size(GArrowGPUCUDABufferOutputStream *stream) -{ - auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); - return arrow_stream->buffer_size(); -} - -/** - * garrow_gpu_cuda_buffer_output_stream_get_buffered_size: - * @stream: A #GArrowGPUCUDABufferOutputStream. - * - * Returns: The size of buffered data in bytes. 
- * - * Since: 0.8.0 - */ -gint64 -garrow_gpu_cuda_buffer_output_stream_get_buffered_size(GArrowGPUCUDABufferOutputStream *stream) -{ - auto arrow_stream = garrow_gpu_cuda_buffer_output_stream_get_raw(stream); - return arrow_stream->num_bytes_buffered(); -} - - -G_END_DECLS - -GArrowGPUCUDAContext * -garrow_gpu_cuda_context_new_raw(std::shared_ptr *arrow_context) -{ - return GARROW_GPU_CUDA_CONTEXT(g_object_new(GARROW_GPU_TYPE_CUDA_CONTEXT, - "context", arrow_context, - NULL)); -} - -std::shared_ptr -garrow_gpu_cuda_context_get_raw(GArrowGPUCUDAContext *context) -{ - if (!context) - return nullptr; - - auto priv = GARROW_GPU_CUDA_CONTEXT_GET_PRIVATE(context); - return priv->context; -} - -GArrowGPUCUDAIPCMemoryHandle * -garrow_gpu_cuda_ipc_memory_handle_new_raw(std::shared_ptr *arrow_handle) -{ - auto handle = g_object_new(GARROW_GPU_TYPE_CUDA_IPC_MEMORY_HANDLE, - "ipc-memory-handle", arrow_handle, - NULL); - return GARROW_GPU_CUDA_IPC_MEMORY_HANDLE(handle); -} - -std::shared_ptr -garrow_gpu_cuda_ipc_memory_handle_get_raw(GArrowGPUCUDAIPCMemoryHandle *handle) -{ - if (!handle) - return nullptr; - - auto priv = GARROW_GPU_CUDA_IPC_MEMORY_HANDLE_GET_PRIVATE(handle); - return priv->ipc_memory_handle; -} - -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new_raw(std::shared_ptr *arrow_buffer) -{ - return GARROW_GPU_CUDA_BUFFER(g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER, - "buffer", arrow_buffer, - NULL)); -} - -std::shared_ptr -garrow_gpu_cuda_buffer_get_raw(GArrowGPUCUDABuffer *buffer) -{ - if (!buffer) - return nullptr; - - auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer)); - return std::static_pointer_cast(arrow_buffer); -} - -GArrowGPUCUDAHostBuffer * -garrow_gpu_cuda_host_buffer_new_raw(std::shared_ptr *arrow_buffer) -{ - auto buffer = g_object_new(GARROW_GPU_TYPE_CUDA_HOST_BUFFER, - "buffer", arrow_buffer, - NULL); - return GARROW_GPU_CUDA_HOST_BUFFER(buffer); -} - -std::shared_ptr -garrow_gpu_cuda_host_buffer_get_raw(GArrowGPUCUDAHostBuffer *buffer) -{ - if (!buffer) - return nullptr; - - auto arrow_buffer = garrow_buffer_get_raw(GARROW_BUFFER(buffer)); - return std::static_pointer_cast(arrow_buffer); -} - -GArrowGPUCUDABufferInputStream * -garrow_gpu_cuda_buffer_input_stream_new_raw(std::shared_ptr *arrow_reader) -{ - auto input_stream = g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER_INPUT_STREAM, - "input-stream", arrow_reader, - NULL); - return GARROW_GPU_CUDA_BUFFER_INPUT_STREAM(input_stream); -} - -std::shared_ptr -garrow_gpu_cuda_buffer_input_stream_get_raw(GArrowGPUCUDABufferInputStream *input_stream) -{ - if (!input_stream) - return nullptr; - - auto arrow_reader = - garrow_input_stream_get_raw(GARROW_INPUT_STREAM(input_stream)); - return std::static_pointer_cast(arrow_reader); -} - -GArrowGPUCUDABufferOutputStream * -garrow_gpu_cuda_buffer_output_stream_new_raw(std::shared_ptr *arrow_writer) -{ - auto output_stream = g_object_new(GARROW_GPU_TYPE_CUDA_BUFFER_OUTPUT_STREAM, - "output-stream", arrow_writer, - NULL); - return GARROW_GPU_CUDA_BUFFER_OUTPUT_STREAM(output_stream); -} - -std::shared_ptr -garrow_gpu_cuda_buffer_output_stream_get_raw(GArrowGPUCUDABufferOutputStream *output_stream) -{ - if (!output_stream) - return nullptr; - - auto arrow_writer = - garrow_output_stream_get_raw(GARROW_OUTPUT_STREAM(output_stream)); - return std::static_pointer_cast(arrow_writer); -} diff --git a/c_glib/arrow-gpu-glib/cuda.h b/c_glib/arrow-gpu-glib/cuda.h deleted file mode 100644 index f45a46a2def8e..0000000000000 --- a/c_glib/arrow-gpu-glib/cuda.h +++ /dev/null @@ -1,183 +0,0 @@ -/* 
- * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#pragma once - -#include - -G_BEGIN_DECLS - -#define GARROW_GPU_TYPE_CUDA_DEVICE_MANAGER \ - (garrow_gpu_cuda_device_manager_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDADeviceManager, - garrow_gpu_cuda_device_manager, - GARROW_GPU, - CUDA_DEVICE_MANAGER, - GObject) -struct _GArrowGPUCUDADeviceManagerClass -{ - GObjectClass parent_class; -}; - -#define GARROW_GPU_TYPE_CUDA_CONTEXT (garrow_gpu_cuda_context_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAContext, - garrow_gpu_cuda_context, - GARROW_GPU, - CUDA_CONTEXT, - GObject) -struct _GArrowGPUCUDAContextClass -{ - GObjectClass parent_class; -}; - -#define GARROW_GPU_TYPE_CUDA_BUFFER (garrow_gpu_cuda_buffer_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABuffer, - garrow_gpu_cuda_buffer, - GARROW_GPU, - CUDA_BUFFER, - GArrowBuffer) -struct _GArrowGPUCUDABufferClass -{ - GArrowBufferClass parent_class; -}; - -#define GARROW_GPU_TYPE_CUDA_HOST_BUFFER (garrow_gpu_cuda_host_buffer_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAHostBuffer, - garrow_gpu_cuda_host_buffer, - GARROW_GPU, - CUDA_HOST_BUFFER, - GArrowMutableBuffer) -struct _GArrowGPUCUDAHostBufferClass -{ - GArrowMutableBufferClass parent_class; -}; - -#define GARROW_GPU_TYPE_CUDA_IPC_MEMORY_HANDLE \ - (garrow_gpu_cuda_ipc_memory_handle_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDAIPCMemoryHandle, - garrow_gpu_cuda_ipc_memory_handle, - GARROW_GPU, - CUDA_IPC_MEMORY_HANDLE, - GObject) -struct _GArrowGPUCUDAIPCMemoryHandleClass -{ - GObjectClass parent_class; -}; - -#define GARROW_GPU_TYPE_CUDA_BUFFER_INPUT_STREAM \ - (garrow_gpu_cuda_buffer_input_stream_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABufferInputStream, - garrow_gpu_cuda_buffer_input_stream, - GARROW_GPU, - CUDA_BUFFER_INPUT_STREAM, - GArrowBufferInputStream) -struct _GArrowGPUCUDABufferInputStreamClass -{ - GArrowBufferInputStreamClass parent_class; -}; - -#define GARROW_GPU_TYPE_CUDA_BUFFER_OUTPUT_STREAM \ - (garrow_gpu_cuda_buffer_output_stream_get_type()) -G_DECLARE_DERIVABLE_TYPE(GArrowGPUCUDABufferOutputStream, - garrow_gpu_cuda_buffer_output_stream, - GARROW_GPU, - CUDA_BUFFER_OUTPUT_STREAM, - GArrowOutputStream) -struct _GArrowGPUCUDABufferOutputStreamClass -{ - GArrowOutputStreamClass parent_class; -}; - -GArrowGPUCUDADeviceManager * -garrow_gpu_cuda_device_manager_new(GError **error); - -GArrowGPUCUDAContext * -garrow_gpu_cuda_device_manager_get_context(GArrowGPUCUDADeviceManager *manager, - gint gpu_number, - GError **error); -gsize -garrow_gpu_cuda_device_manager_get_n_devices(GArrowGPUCUDADeviceManager *manager); - -gint64 -garrow_gpu_cuda_context_get_allocated_size(GArrowGPUCUDAContext *context); - - -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new(GArrowGPUCUDAContext 
*context, - gint64 size, - GError **error); -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new_ipc(GArrowGPUCUDAContext *context, - GArrowGPUCUDAIPCMemoryHandle *handle, - GError **error); -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new_record_batch(GArrowGPUCUDAContext *context, - GArrowRecordBatch *record_batch, - GError **error); -GBytes * -garrow_gpu_cuda_buffer_copy_to_host(GArrowGPUCUDABuffer *buffer, - gint64 position, - gint64 size, - GError **error); -gboolean -garrow_gpu_cuda_buffer_copy_from_host(GArrowGPUCUDABuffer *buffer, - const guint8 *data, - gint64 size, - GError **error); -GArrowGPUCUDAIPCMemoryHandle * -garrow_gpu_cuda_buffer_export(GArrowGPUCUDABuffer *buffer, - GError **error); -GArrowGPUCUDAContext * -garrow_gpu_cuda_buffer_get_context(GArrowGPUCUDABuffer *buffer); -GArrowRecordBatch * -garrow_gpu_cuda_buffer_read_record_batch(GArrowGPUCUDABuffer *buffer, - GArrowSchema *schema, - GError **error); - - -GArrowGPUCUDAHostBuffer * -garrow_gpu_cuda_host_buffer_new(gint gpu_number, - gint64 size, - GError **error); - -GArrowGPUCUDAIPCMemoryHandle * -garrow_gpu_cuda_ipc_memory_handle_new(const guint8 *data, - gsize size, - GError **error); - -GArrowBuffer * -garrow_gpu_cuda_ipc_memory_handle_serialize(GArrowGPUCUDAIPCMemoryHandle *handle, - GError **error); - -GArrowGPUCUDABufferInputStream * -garrow_gpu_cuda_buffer_input_stream_new(GArrowGPUCUDABuffer *buffer); - -GArrowGPUCUDABufferOutputStream * -garrow_gpu_cuda_buffer_output_stream_new(GArrowGPUCUDABuffer *buffer); - -gboolean -garrow_gpu_cuda_buffer_output_stream_set_buffer_size(GArrowGPUCUDABufferOutputStream *stream, - gint64 size, - GError **error); -gint64 -garrow_gpu_cuda_buffer_output_stream_get_buffer_size(GArrowGPUCUDABufferOutputStream *stream); -gint64 -garrow_gpu_cuda_buffer_output_stream_get_buffered_size(GArrowGPUCUDABufferOutputStream *stream); - -G_END_DECLS diff --git a/c_glib/arrow-gpu-glib/cuda.hpp b/c_glib/arrow-gpu-glib/cuda.hpp deleted file mode 100644 index 4b5b03c8b4608..0000000000000 --- a/c_glib/arrow-gpu-glib/cuda.hpp +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#pragma once - -#include - -#include - -GArrowGPUCUDAContext * -garrow_gpu_cuda_context_new_raw(std::shared_ptr *arrow_context); -std::shared_ptr -garrow_gpu_cuda_context_get_raw(GArrowGPUCUDAContext *context); - -GArrowGPUCUDAIPCMemoryHandle * -garrow_gpu_cuda_ipc_memory_handle_new_raw(std::shared_ptr *arrow_handle); -std::shared_ptr -garrow_gpu_cuda_ipc_memory_handle_get_raw(GArrowGPUCUDAIPCMemoryHandle *handle); - -GArrowGPUCUDABuffer * -garrow_gpu_cuda_buffer_new_raw(std::shared_ptr *arrow_buffer); -std::shared_ptr -garrow_gpu_cuda_buffer_get_raw(GArrowGPUCUDABuffer *buffer); - -GArrowGPUCUDAHostBuffer * -garrow_gpu_cuda_host_buffer_new_raw(std::shared_ptr *arrow_buffer); -std::shared_ptr -garrow_gpu_cuda_host_buffer_get_raw(GArrowGPUCUDAHostBuffer *buffer); - -GArrowGPUCUDABufferInputStream * -garrow_gpu_cuda_buffer_input_stream_new_raw(std::shared_ptr *arrow_reader); -std::shared_ptr -garrow_gpu_cuda_buffer_input_stream_get_raw(GArrowGPUCUDABufferInputStream *input_stream); - -GArrowGPUCUDABufferOutputStream * -garrow_gpu_cuda_buffer_output_stream_new_raw(std::shared_ptr *arrow_writer); -std::shared_ptr -garrow_gpu_cuda_buffer_output_stream_get_raw(GArrowGPUCUDABufferOutputStream *output_stream); diff --git a/c_glib/arrow-gpu-glib/meson.build b/c_glib/arrow-gpu-glib/meson.build deleted file mode 100644 index e6b170efc5941..0000000000000 --- a/c_glib/arrow-gpu-glib/meson.build +++ /dev/null @@ -1,75 +0,0 @@ -# -*- indent-tabs-mode: nil -*- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -sources = files( - 'cuda.cpp', -) - -c_headers = files( - 'arrow-gpu-glib.h', - 'cuda.h', -) - -cpp_headers = files( - 'arrow-gpu-glib.hpp', - 'cuda.hpp', -) - -headers = c_headers + cpp_headers -install_headers(headers, subdir: 'arrow-gpu-glib') - - -dependencies = [ - arrow_gpu, - arrow_glib, -] -libarrow_gpu_glib = library('arrow-gpu-glib', - sources: sources, - install: true, - dependencies: dependencies, - include_directories: base_include_directories, - soversion: so_version, - version: library_version) -arrow_gpu_glib = declare_dependency(link_with: libarrow_gpu_glib, - include_directories: base_include_directories, - dependencies: dependencies) - -pkgconfig.generate(filebase: 'arrow-gpu-glib', - name: 'Apache Arrow GPU GLib', - description: 'C API for Apache Arrow GPU based on GLib', - version: version, - requires: ['arrow-glib', 'arrow-gpu'], - libraries: [libarrow_gpu_glib]) - -gnome.generate_gir(libarrow_gpu_glib, - dependencies: declare_dependency(sources: arrow_glib_gir), - sources: sources + c_headers, - namespace: 'ArrowGPU', - nsversion: api_version, - identifier_prefix: 'GArrowGPU', - symbol_prefix: 'garrow_gpu', - export_packages: 'arrow-gpu-glib', - includes: [ - 'Arrow-1.0', - ], - install: true, - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ]) diff --git a/c_glib/configure.ac b/c_glib/configure.ac index badf9e98da4ea..a6d8ed8e1d185 100644 --- a/c_glib/configure.ac +++ b/c_glib/configure.ac @@ -115,6 +115,7 @@ AC_ARG_WITH(arrow-cpp-build-type, [GARROW_ARROW_CPP_BUILD_TYPE="$withval"], [GARROW_ARROW_CPP_BUILD_TYPE="release"]) +ARROW_CUDA_PKG_CONFIG_PATH="" AC_ARG_WITH(arrow-cpp-build-dir, [AS_HELP_STRING([--with-arrow-cpp-build-dir=PATH], [Use this option to build with not installed Arrow C++])], @@ -130,10 +131,10 @@ if test "x$GARROW_ARROW_CPP_BUILD_DIR" = "x"; then [arrow-orc], [HAVE_ARROW_ORC=yes], [HAVE_ARROW_ORC=no]) - PKG_CHECK_MODULES([ARROW_GPU], - [arrow-gpu], - [HAVE_ARROW_GPU=yes], - [HAVE_ARROW_GPU=no]) + PKG_CHECK_MODULES([ARROW_CUDA], + [arrow-cuda], + [HAVE_ARROW_CUDA=yes], + [HAVE_ARROW_CUDA=no]) PKG_CHECK_MODULES([GANDIVA], [gandiva], [HAVE_GANDIVA=yes], @@ -168,16 +169,19 @@ else HAVE_ARROW_ORC=no fi - ARROW_GPU_CFLAGS="" - if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-gpu.pc"; then - HAVE_ARROW_GPU=yes - ARROW_GPU_LIBS="-larrow_gpu" + ARROW_CUDA_CFLAGS="" + if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/arrow/gpu/arrow-cuda.pc"; then + HAVE_ARROW_CUDA=yes + ARROW_CUDA_LIBS="-larrow_cuda" + ARROW_CUDA_PKG_CONFIG_PATH="\$(ARROW_BUILD_DIR)/src/arrow/gpu" else - HAVE_ARROW_GPU=no - ARROW_GPU_LIBS="" + HAVE_ARROW_CUDA=no + ARROW_CUDA_LIBS="" + ARROW_CUDA_PKG_CONFIG_PATH="" fi - AC_SUBST(ARROW_GPU_CFLAGS) - AC_SUBST(ARROW_GPU_LIBS) + AC_SUBST(ARROW_CUDA_CFLAGS) + AC_SUBST(ARROW_CUDA_LIBS) + AC_SUBST(ARROW_CUDA_PKG_CONFIG_PATH) GANDIVA_CFLAGS="" if test -f "${GARROW_ARROW_CPP_BUILD_DIR}/src/gandiva/gandiva.pc"; then @@ -221,10 +225,20 @@ if test "$HAVE_ARROW_ORC" = "yes"; then AC_DEFINE(HAVE_ARROW_ORC, [1], [Define to 1 if Apache Arrow supports ORC.]) fi -AM_CONDITIONAL([HAVE_ARROW_GPU], [test "$HAVE_ARROW_GPU" = "yes"]) -if test "$HAVE_ARROW_GPU" = "yes"; then - AC_DEFINE(HAVE_ARROW_GPU, [1], [Define to 1 if Apache Arrow supports GPU.]) +AM_CONDITIONAL([HAVE_ARROW_CUDA], [test "$HAVE_ARROW_CUDA" = "yes"]) +if test "$HAVE_ARROW_CUDA" = "yes"; then + ARROW_CUDA_GLIB_PACKAGE="arrow-cuda-glib" + PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":\$(abs_top_builddir)/arrow-cuda-glib" + if test -n 
"${ARROW_CUDA_PKG_CONFIG_PATH}"; then + PLASMA_ARROW_CUDA_PKG_CONFIG_PATH=":${ARROW_CUDA_PKG_CONFIG_PATH}${PLASMA_ARROW_CUDA_PKG_CONFIG_PATH}" + fi + AC_DEFINE(HAVE_ARROW_CUDA, [1], [Define to 1 if Apache Arrow supports CUDA.]) +else + ARROW_CUDA_GLIB_PACKAGE="" + PLASMA_ARROW_CUDA_PKG_CONFIG_PATH="" fi +AC_SUBST(ARROW_CUDA_GLIB_PACKAGE) +AC_SUBST(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH) AM_CONDITIONAL([HAVE_GANDIVA], [test "$HAVE_GANDIVA" = "yes"]) if test "$HAVE_GANDIVA" = "yes"; then @@ -246,12 +260,12 @@ AC_SUBST(exampledir) AC_CONFIG_FILES([ Makefile + arrow-cuda-glib/Makefile + arrow-cuda-glib/arrow-cuda-glib.pc arrow-glib/Makefile arrow-glib/arrow-glib.pc arrow-glib/arrow-orc-glib.pc arrow-glib/version.h - arrow-gpu-glib/Makefile - arrow-gpu-glib/arrow-gpu-glib.pc gandiva-glib/Makefile gandiva-glib/gandiva-glib.pc parquet-glib/Makefile diff --git a/c_glib/doc/arrow-glib/Makefile.am b/c_glib/doc/arrow-glib/Makefile.am index ad0c9382194d9..db9f00f39f300 100644 --- a/c_glib/doc/arrow-glib/Makefile.am +++ b/c_glib/doc/arrow-glib/Makefile.am @@ -55,15 +55,15 @@ AM_CFLAGS = \ GTKDOC_LIBS = \ $(top_builddir)/arrow-glib/libarrow-glib.la -if HAVE_ARROW_GPU +if HAVE_ARROW_CUDA DOC_SOURCE_DIR += \ - $(top_srcdir)/arrow-gpu-glib + $(top_srcdir)/arrow-cuda-glib HFILE_GLOB += \ - $(top_srcdir)/arrow-gpu-glib/*.h + $(top_srcdir)/arrow-cuda-glib/*.h CFILE_GLOB += \ - $(top_srcdir)/arrow-gpu-glib/*.cpp + $(top_srcdir)/arrow-cuda-glib/*.cpp GTKDOC_LIBS += \ - $(top_builddir)/arrow-gpu-glib/libarrow-gpu-glib.la + $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la endif include $(top_srcdir)/gtk-doc.make diff --git a/c_glib/doc/arrow-glib/meson.build b/c_glib/doc/arrow-glib/meson.build index 68050aa8792f8..d61a9747de678 100644 --- a/c_glib/doc/arrow-glib/meson.build +++ b/c_glib/doc/arrow-glib/meson.build @@ -50,13 +50,13 @@ source_directories = [ dependencies = [ arrow_glib, ] -if arrow_gpu.found() +if arrow_cuda.found() source_directories += [ - join_paths(meson.source_root(), 'arrow-gpu-glib'), - join_paths(meson.build_root(), 'arrow-gpu-glib'), + join_paths(meson.source_root(), 'arrow-cuda-glib'), + join_paths(meson.build_root(), 'arrow-cuda-glib'), ] dependencies += [ - arrow_gpu_glib, + arrow_cuda_glib, ] endif ignore_headers = [] diff --git a/c_glib/doc/plasma-glib/Makefile.am b/c_glib/doc/plasma-glib/Makefile.am index 6a25bfb484eba..df872d6ca312c 100644 --- a/c_glib/doc/plasma-glib/Makefile.am +++ b/c_glib/doc/plasma-glib/Makefile.am @@ -15,6 +15,12 @@ # specific language governing permissions and limitations # under the License. 
+PLASMA_ARROW_CUDA_GTKDOC_LIBS = +if HAVE_ARROW_CUDA +PLASMA_ARROW_CUDA_GTKDOC_LIBS += \ + $(top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la +endif + if HAVE_PLASMA DOC_MODULE = plasma-glib @@ -50,6 +56,7 @@ AM_CFLAGS = \ GTKDOC_LIBS = \ $(top_builddir)/arrow-glib/libarrow-glib.la \ + $(PLASMA_ARROW_CUDA_GTKDOC_LIBS) \ $(top_builddir)/plasma-glib/libplasma-glib.la include $(top_srcdir)/gtk-doc.make diff --git a/c_glib/doc/plasma-glib/meson.build b/c_glib/doc/plasma-glib/meson.build index 2572f0f371cc4..9efc53b4b1b23 100644 --- a/c_glib/doc/plasma-glib/meson.build +++ b/c_glib/doc/plasma-glib/meson.build @@ -56,6 +56,9 @@ dependencies = [ arrow_glib, plasma_glib, ] +if arrow_cuda.found() + dependencies += [arrow_cuda_glib] +endif ignore_headers = [] gnome.gtkdoc(project_name, main_xml: project_name + '-docs.xml', diff --git a/c_glib/doc/plasma-glib/plasma-glib-docs.xml b/c_glib/doc/plasma-glib/plasma-glib-docs.xml index 86e3245043d32..83d3aea9b00f7 100644 --- a/c_glib/doc/plasma-glib/plasma-glib-docs.xml +++ b/c_glib/doc/plasma-glib/plasma-glib-docs.xml @@ -36,12 +36,16 @@ - - PlasmaClient + + Client side Client + + Object + + diff --git a/c_glib/gandiva-glib/expression.cpp b/c_glib/gandiva-glib/expression.cpp index 529d85164de2a..b4e7a96bdef10 100644 --- a/c_glib/gandiva-glib/expression.cpp +++ b/c_glib/gandiva-glib/expression.cpp @@ -201,7 +201,8 @@ ggandiva_expression_new(GGandivaNode *root_node, * ggandiva_expression_to_string: * @expression: A #GGandivaExpression. * - * Returns: The string representation of the node in the expression tree. + * Returns: (transfer full): The string representation of the node in the expression tree. + * * It should be freed with g_free() when no longer needed. * * Since: 0.12.0 diff --git a/c_glib/meson.build b/c_glib/meson.build index 14136056d408c..194421c13d316 100644 --- a/c_glib/meson.build +++ b/c_glib/meson.build @@ -64,7 +64,7 @@ endif if arrow_cpp_build_lib_dir == '' arrow = dependency('arrow') have_arrow_orc = dependency('arrow-orc', required: false).found() - arrow_gpu = dependency('arrow-gpu', required: false) + arrow_cuda = dependency('arrow-cuda', required: false) gandiva = dependency('gandiva', required: false) parquet = dependency('parquet', required: false) plasma = dependency('plasma', required: false) @@ -89,9 +89,9 @@ main(void) have_arrow_orc = cpp_compiler.links(arrow_orc_code, include_directories: base_include_directories, dependencies: [arrow]) - arrow_gpu = cpp_compiler.find_library('arrow_gpu', - dirs: [arrow_cpp_build_lib_dir], - required: false) + arrow_cuda = cpp_compiler.find_library('arrow_cuda', + dirs: [arrow_cpp_build_lib_dir], + required: false) gandiva = cpp_compiler.find_library('gandiva', dirs: [arrow_cpp_build_lib_dir], required: false) @@ -104,8 +104,8 @@ main(void) endif subdir('arrow-glib') -if arrow_gpu.found() - subdir('arrow-gpu-glib') +if arrow_cuda.found() + subdir('arrow-cuda-glib') endif if gandiva.found() subdir('gandiva-glib') @@ -136,7 +136,7 @@ test('unit test', run_test, env: [ 'ARROW_GLIB_TYPELIB_DIR=@0@/arrow-glib'.format(meson.build_root()), - 'ARROW_GPU_GLIB_TYPELIB_DIR=@0@/arrow-gpu-glib'.format(meson.build_root()), + 'ARROW_CUDA_GLIB_TYPELIB_DIR=@0@/arrow-cuda-glib'.format(meson.build_root()), 'GANDIVA_GLIB_TYPELIB_DIR=@0@/gandiva-glib'.format(meson.build_root()), 'PARQUET_GLIB_TYPELIB_DIR=@0@/parquet-glib'.format(meson.build_root()), 'PARQUET_GLIB_TYPELIB_DIR=@0@/plasma-glib'.format(meson.build_root()), diff --git a/c_glib/plasma-glib/Makefile.am b/c_glib/plasma-glib/Makefile.am index 
f797c97b094c4..d14638bc22764 100644 --- a/c_glib/plasma-glib/Makefile.am +++ b/c_glib/plasma-glib/Makefile.am @@ -23,13 +23,39 @@ EXTRA_DIST = \ AM_CPPFLAGS = \ -I$(top_builddir) \ - -I$(top_srcdir) + -I$(top_srcdir) \ + -DG_LOG_DOMAIN=\"Plasma\" AM_CFLAGS = \ $(GLIB_CFLAGS) \ $(GARROW_CFLAGS) \ $(GPLASMA_CFLAGS) +PLASMA_ARROW_CUDA_LIBS = +PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS = +PLASMA_GIR_ARROW_CUDA_PACKAGE = +PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH = +PLASMA_GIR_ARROW_CUDA_LIBS_MACOS = +PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS = +PLASMA_GIR_ARROW_CUDA_LIBS = +if HAVE_ARROW_CUDA +PLASMA_ARROW_CUDA_LIBS += \ + $(ARROW_CUDA_LIBS) \ + ../arrow-cuda-glib/libarrow-cuda-glib.la +PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_ARGS += \ + --includedir=$(abs_top_builddir)/arrow-cuda-glib +PLASMA_GIR_ARROW_CUDA_PACKAGE += \ + arrow-cuda-glib +PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH += \ + --add-include-path=$(abs_top_builddir)/arrow-cuda-glib +PLASMA_GIR_ARROW_CUDA_LIBS_MACOS += \ + arrow-cuda-glib +PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS += \ + --library-path=$(abs_top_builddir)/arrow-cuda-glib/.libs +PLASMA_GIR_ARROW_CUDA_LIBS += \ + $(abs_top_builddir)/arrow-cuda-glib/libarrow-cuda-glib.la +endif + if HAVE_PLASMA lib_LTLIBRARIES = \ libplasma-glib.la @@ -49,18 +75,22 @@ libplasma_glib_la_LIBADD = \ $(GLIB_LIBS) \ $(ARROW_LIBS) \ $(PLASMA_LIBS) \ - ../arrow-glib/libarrow-glib.la + ../arrow-glib/libarrow-glib.la \ + $(PLASMA_ARROW_CUDA_LIBS) libplasma_glib_la_headers = \ client.h \ + object.h \ plasma-glib.h libplasma_glib_la_sources = \ client.cpp \ + object.cpp \ $(libplasma_glib_la_headers) -libplasma_glib_la_cpp_headers = \ +libplasma_glib_la_cpp_headers = \ client.hpp \ + object.hpp \ plasma-glib.hpp libplasma_glib_la_SOURCES = \ @@ -68,7 +98,7 @@ libplasma_glib_la_SOURCES = \ $(libplasma_glib_la_cpp_headers) plasma_glib_includedir = $(includedir)/plasma-glib -plasma_glib_include_HEADERS = \ +plasma_glib_include_HEADERS = \ $(libplasma_glib_la_headers) \ $(libplasma_glib_la_cpp_headers) @@ -84,17 +114,19 @@ INTROSPECTION_SCANNER_ARGS = INTROSPECTION_SCANNER_ENV = if USE_ARROW_BUILD_DIR INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} + PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$(ARROW_BUILD_DIR)/src/arrow:$${PKG_CONFIG_PATH} else INTROSPECTION_SCANNER_ENV += \ - PKG_CONFIG_PATH=${abs_top_builddir}/arrow-glib:$${PKG_CONFIG_PATH} + PKG_CONFIG_PATH=$(abs_top_builddir)/arrow-glib$(PLASMA_ARROW_CUDA_PKG_CONFIG_PATH):$${PKG_CONFIG_PATH} endif -INTROSPECTION_COMPILER_ARGS = \ - --includedir=$(abs_top_builddir)/arrow-glib +INTROSPECTION_COMPILER_ARGS = \ + --includedir=$(abs_top_builddir)/arrow-glib \ + $(PLASMA_INTROSPECTION_COMPILER_ARROW_CUDA_INCLUDEDIR) Plasma-1.0.gir: libplasma-glib.la Plasma_1_0_gir_PACKAGES = \ - arrow-glib + arrow-glib \ + $(PLASMA_GIR_ARROW_CUDA_PACKAGE) Plasma_1_0_gir_EXPORT_PACKAGES = \ plasma-glib Plasma_1_0_gir_INCLUDES = \ @@ -103,8 +135,9 @@ Plasma_1_0_gir_CFLAGS = \ $(AM_CPPFLAGS) Plasma_1_0_gir_LIBS = Plasma_1_0_gir_FILES = $(libplasma_glib_la_sources) -Plasma_1_0_gir_SCANNERFLAGS = \ +Plasma_1_0_gir_SCANNERFLAGS = \ --add-include-path=$(abs_top_builddir)/arrow-glib \ + $(PLASMA_GIR_ARROW_CUDA_SCANNER_ADD_INCLUDE_PATH) \ --library-path=$(ARROW_LIB_DIR) \ --warn-all \ --identifier-prefix=GPlasma \ @@ -112,14 +145,17 @@ Plasma_1_0_gir_SCANNERFLAGS = \ if OS_MACOS Plasma_1_0_gir_LIBS += \ arrow-glib \ + 
$(PLASMA_GIR_ARROW_CUDA_LIBS_MACOS) \ plasma-glib Plasma_1_0_gir_SCANNERFLAGS += \ --no-libtool \ --library-path=$(abs_top_builddir)/arrow-glib/.libs \ + $(PLASMA_GIR_ARROW_CUDA_SCANNER_LIBRARY_PATH_MACOS) \ --library-path=$(abs_builddir)/.libs else Plasma_1_0_gir_LIBS += \ $(abs_top_builddir)/arrow-glib/libarrow-glib.la \ + $(PLASMA_GIR_ARROW_CUDA_LIBS) \ libplasma-glib.la endif INTROSPECTION_GIRS += Plasma-1.0.gir diff --git a/c_glib/plasma-glib/client.cpp b/c_glib/plasma-glib/client.cpp index f818c971dea91..e88cb13e83cd0 100644 --- a/c_glib/plasma-glib/client.cpp +++ b/c_glib/plasma-glib/client.cpp @@ -21,47 +21,196 @@ # include #endif +#include #include +#ifdef HAVE_ARROW_CUDA +# include +#endif + #include +#include G_BEGIN_DECLS /** * SECTION: client - * @title: Client classes + * @section_id: client-classes + * @title: Client related classes * @include: plasma-glib/plasma-glib.h * + * #GPlasmaClientCreateOptions is a class for customizing object creation. + * * #GPlasmaClient is a class for an interface with a plasma store * and a plasma manager. * * Since: 0.12.0 */ +typedef struct GPlasmaClientCreateOptionsPrivate_ { + guint8 *metadata; + gsize metadata_size; + gint gpu_device; +} GPlasmaClientCreateOptionsPrivate; + +enum { + PROP_GPU_DEVICE = 1 +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GPlasmaClientCreateOptions, + gplasma_client_create_options, + G_TYPE_OBJECT) + +#define GPLASMA_CLIENT_CREATE_OPTIONS_GET_PRIVATE(object) \ + static_cast( \ + gplasma_client_create_options_get_instance_private( \ + GPLASMA_CLIENT_CREATE_OPTIONS(object))) + +static void +gplasma_client_create_options_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GPLASMA_CLIENT_CREATE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_GPU_DEVICE: + priv->gpu_device = g_value_get_int(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gplasma_client_create_options_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GPLASMA_CLIENT_CREATE_OPTIONS_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_GPU_DEVICE: + g_value_set_int(value, priv->gpu_device); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gplasma_client_create_options_init(GPlasmaClientCreateOptions *object) +{ +} + +static void +gplasma_client_create_options_class_init(GPlasmaClientCreateOptionsClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->set_property = gplasma_client_create_options_set_property; + gobject_class->get_property = gplasma_client_create_options_get_property; + + GParamSpec *spec; + spec = g_param_spec_int("gpu-device", + "GPU device", + "The GPU device number. -1 means GPU isn't used.", + -1, + G_MAXINT, + -1, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT)); + g_object_class_install_property(gobject_class, PROP_GPU_DEVICE, spec); +} + +/** + * gplasma_client_create_options_new: + * + * Returns: A newly created #GPlasmaClientCreateOptions. + * + * Since: 0.12.0 + */ +GPlasmaClientCreateOptions * +gplasma_client_create_options_new(void) +{ + auto options = g_object_new(GPLASMA_TYPE_CLIENT_CREATE_OPTIONS, + NULL); + return GPLASMA_CLIENT_CREATE_OPTIONS(options); +} + +/** + * gplasma_client_create_options_set_metadata: + * @options: A #GPlasmaClientCreateOptions. + * @metadata: (nullable) (array length=size): The metadata of a created object. 
+ * @size: The number of bytes of the metadata. + * + * Since: 0.12.0 + */ +void +gplasma_client_create_options_set_metadata(GPlasmaClientCreateOptions *options, + const guint8 *metadata, + gsize size) +{ + auto priv = GPLASMA_CLIENT_CREATE_OPTIONS_GET_PRIVATE(options); + if (priv->metadata) { + g_free(priv->metadata); + } + priv->metadata = static_cast(g_memdup(metadata, size)); + priv->metadata_size = size; +} + +/** + * gplasma_client_create_options_get_metadata: + * @options: A #GPlasmaClientCreateOptions. + * @size: (nullable) (out): The number of bytes of the metadata. + * + * Returns: (nullable) (array length=size): The metadata of a created object. + * + * Since: 0.12.0 + */ +const guint8 * +gplasma_client_create_options_get_metadata(GPlasmaClientCreateOptions *options, + gsize *size) +{ + auto priv = GPLASMA_CLIENT_CREATE_OPTIONS_GET_PRIVATE(options); + if (size) { + *size = priv->metadata_size; + } + return priv->metadata; +} + typedef struct GPlasmaClientPrivate_ { - std::shared_ptr client; + plasma::PlasmaClient *client; } GPlasmaClientPrivate; enum { - PROP_0, - PROP_CLIENT + PROP_CLIENT = 1 }; G_DEFINE_TYPE_WITH_PRIVATE(GPlasmaClient, gplasma_client, G_TYPE_OBJECT) -#define GPLASMA_CLIENT_GET_PRIVATE(obj) \ - static_cast( \ - gplasma_client_get_instance_private( \ - GPLASMA_CLIENT(obj))) +#define GPLASMA_CLIENT_GET_PRIVATE(object) \ + static_cast( \ + gplasma_client_get_instance_private( \ + GPLASMA_CLIENT(object))) static void gplasma_client_finalize(GObject *object) { auto priv = GPLASMA_CLIENT_GET_PRIVATE(object); - priv->client = nullptr; + auto status = priv->client->Disconnect(); + if (!status.ok()) { + g_warning("[plasma][client][finalize] Failed to disconnect: %s", + status.ToString().c_str()); + } + delete priv->client; G_OBJECT_CLASS(gplasma_client_parent_class)->finalize(object); } @@ -77,7 +226,7 @@ gplasma_client_set_property(GObject *object, switch (prop_id) { case PROP_CLIENT: priv->client = - *static_cast *>(g_value_get_pointer(value)); + static_cast(g_value_get_pointer(value)); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -102,7 +251,7 @@ gplasma_client_class_init(GPlasmaClientClass *klass) spec = g_param_spec_pointer("client", "Client", - "The raw std::shared *", + "The raw plasma::PlasmaClient *", static_cast(G_PARAM_WRITABLE | G_PARAM_CONSTRUCT_ONLY)); g_object_class_install_property(gobject_class, PROP_CLIENT, spec); @@ -122,10 +271,161 @@ GPlasmaClient * gplasma_client_new(const gchar *store_socket_name, GError **error) { - auto plasma_client = std::make_shared(); + auto plasma_client = new plasma::PlasmaClient(); auto status = plasma_client->Connect(store_socket_name, ""); if (garrow_error_check(error, status, "[plasma][client][new]")) { - return gplasma_client_new_raw(&plasma_client); + return gplasma_client_new_raw(plasma_client); + } else { + return NULL; + } +} + +/** + * gplasma_client_create: + * @client: A #GPlasmaClient. + * @id: The ID for a newly created object. + * @data_size: The number of bytes of data for a newly created object. + * @options: (nullable): The option for creating an object. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): A newly created #GPlasmaCreatedObject + * on success, %NULL on error. 
+ * + * Since: 0.12.0 + */ +GPlasmaCreatedObject * +gplasma_client_create(GPlasmaClient *client, + GPlasmaObjectID *id, + gsize data_size, + GPlasmaClientCreateOptions *options, + GError **error) +{ + const auto context = "[plasma][client][create]"; + auto plasma_client = gplasma_client_get_raw(client); + auto plasma_id = gplasma_object_id_get_raw(id); + const uint8_t *raw_metadata = nullptr; + int64_t raw_metadata_size = 0; + int device_number = 0; + if (options) { + auto options_priv = GPLASMA_CLIENT_CREATE_OPTIONS_GET_PRIVATE(options); + raw_metadata = options_priv->metadata; + raw_metadata_size = options_priv->metadata_size; + if (options_priv->gpu_device >= 0) { +#ifndef HAVE_ARROW_CUDA + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s Arrow CUDA GLib is needed to use GPU", + context); + return NULL; +#endif + device_number = options_priv->gpu_device + 1; + } + } + std::shared_ptr plasma_data; + auto status = plasma_client->Create(plasma_id, + data_size, + raw_metadata, + raw_metadata_size, + &plasma_data, + device_number); + if (garrow_error_check(error, status, context)) { + GArrowBuffer *data = nullptr; + if (device_number == 0) { + auto plasma_mutable_data = + std::static_pointer_cast(plasma_data); + data = GARROW_BUFFER(garrow_mutable_buffer_new_raw(&plasma_mutable_data)); +#ifdef HAVE_ARROW_CUDA + } else { + auto plasma_cuda_data = + std::static_pointer_cast(plasma_data); + data = GARROW_BUFFER(garrow_cuda_buffer_new_raw(&plasma_cuda_data)); +#endif + } + GArrowBuffer *metadata = nullptr; + if (raw_metadata_size > 0) { + auto plasma_metadata = + std::make_shared(raw_metadata, raw_metadata_size); + metadata = garrow_buffer_new_raw(&plasma_metadata); + } + return gplasma_created_object_new_raw(client, + id, + data, + metadata, + device_number - 1); + } else { + return NULL; + } +} + +/** + * gplasma_client_refer_object: + * @client: A #GPlasmaClient. + * @id: The ID of the target object. + * @timeout_ms: The timeout in milliseconds. -1 means no timeout. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable) (transfer full): A found #GPlasmaReferredObject + * on success, %NULL on error. 
+ * + * Since: 0.12.0 + */ +GPlasmaReferredObject * +gplasma_client_refer_object(GPlasmaClient *client, + GPlasmaObjectID *id, + gint64 timeout_ms, + GError **error) +{ + const auto context = "[plasma][client][refer-object]"; + auto plasma_client = gplasma_client_get_raw(client); + auto plasma_id = gplasma_object_id_get_raw(id); + std::vector plasma_ids; + plasma_ids.push_back(plasma_id); + std::vector plasma_object_buffers; + auto status = plasma_client->Get(plasma_ids, + timeout_ms, + &plasma_object_buffers); + if (garrow_error_check(error, status, context)) { + auto plasma_object_buffer = plasma_object_buffers[0]; + auto plasma_data = plasma_object_buffer.data; + auto plasma_metadata = plasma_object_buffer.metadata; + GArrowBuffer *data = nullptr; + GArrowBuffer *metadata = nullptr; + if (plasma_object_buffer.device_num > 0) { +#ifdef HAVE_ARROW_CUDA + std::shared_ptr plasma_cuda_data; + status = arrow::cuda::CudaBuffer::FromBuffer(plasma_data, + &plasma_cuda_data); + if (!garrow_error_check(error, status, context)) { + return NULL; + } + std::shared_ptr plasma_cuda_metadata; + status = arrow::cuda::CudaBuffer::FromBuffer(plasma_metadata, + &plasma_cuda_metadata); + if (!garrow_error_check(error, status, context)) { + return NULL; + } + + data = GARROW_BUFFER(garrow_cuda_buffer_new_raw(&plasma_cuda_data)); + metadata = + GARROW_BUFFER(garrow_cuda_buffer_new_raw(&plasma_cuda_metadata)); +#else + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s Arrow CUDA GLib is needed to use GPU", + context); + return NULL; +#endif + } else { + data = garrow_buffer_new_raw(&plasma_data); + metadata = garrow_buffer_new_raw(&plasma_metadata); + } + return gplasma_referred_object_new_raw(client, + id, + data, + metadata, + plasma_object_buffer.device_num - 1); } else { return NULL; } @@ -134,7 +434,7 @@ gplasma_client_new(const gchar *store_socket_name, G_END_DECLS GPlasmaClient * -gplasma_client_new_raw(std::shared_ptr *plasma_client) +gplasma_client_new_raw(plasma::PlasmaClient *plasma_client) { auto client = g_object_new(GPLASMA_TYPE_CLIENT, "client", plasma_client, @@ -142,7 +442,7 @@ gplasma_client_new_raw(std::shared_ptr *plasma_client) return GPLASMA_CLIENT(client); } -std::shared_ptr +plasma::PlasmaClient * gplasma_client_get_raw(GPlasmaClient *client) { auto priv = GPLASMA_CLIENT_GET_PRIVATE(client); diff --git a/c_glib/plasma-glib/client.h b/c_glib/plasma-glib/client.h index 30c8a81aff7bb..6f99f467c83a7 100644 --- a/c_glib/plasma-glib/client.h +++ b/c_glib/plasma-glib/client.h @@ -19,10 +19,33 @@ #pragma once -#include +#include G_BEGIN_DECLS +#define GPLASMA_TYPE_CLIENT_CREATE_OPTIONS \ + (gplasma_client_create_options_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaClientCreateOptions, + gplasma_client_create_options, + GPLASMA, + CLIENT_CREATE_OPTIONS, + GObject) + +struct _GPlasmaClientCreateOptionsClass +{ + GObjectClass parent_class; +}; + +GPlasmaClientCreateOptions *gplasma_client_create_options_new(void); +void +gplasma_client_create_options_set_metadata(GPlasmaClientCreateOptions *options, + const guint8 *metadata, + gsize size); +const guint8 * +gplasma_client_create_options_get_metadata(GPlasmaClientCreateOptions *options, + gsize *size); + + #define GPLASMA_TYPE_CLIENT (gplasma_client_get_type()) G_DECLARE_DERIVABLE_TYPE(GPlasmaClient, gplasma_client, @@ -37,5 +60,16 @@ struct _GPlasmaClientClass GPlasmaClient *gplasma_client_new(const gchar *store_socket_name, GError **error); +GPlasmaCreatedObject * +gplasma_client_create(GPlasmaClient *client, + GPlasmaObjectID *id, + 
gsize data_size, + GPlasmaClientCreateOptions *options, + GError **error); +GPlasmaReferredObject * +gplasma_client_refer_object(GPlasmaClient *client, + GPlasmaObjectID *id, + gint64 timeout_ms, + GError **error); G_END_DECLS diff --git a/c_glib/plasma-glib/client.hpp b/c_glib/plasma-glib/client.hpp index 473ea16ae4444..d3e2ab2598d2a 100644 --- a/c_glib/plasma-glib/client.hpp +++ b/c_glib/plasma-glib/client.hpp @@ -19,11 +19,11 @@ #pragma once -#include - #include #include -GPlasmaClient *gplasma_client_new_raw(std::shared_ptr *plasma_client); -std::shared_ptr gplasma_client_get_raw(GPlasmaClient *client); +GPlasmaClient * +gplasma_client_new_raw(plasma::PlasmaClient *plasma_client); +plasma::PlasmaClient * +gplasma_client_get_raw(GPlasmaClient *client); diff --git a/c_glib/plasma-glib/meson.build b/c_glib/plasma-glib/meson.build index 40a20e9c7d006..75ebce870dba8 100644 --- a/c_glib/plasma-glib/meson.build +++ b/c_glib/plasma-glib/meson.build @@ -21,15 +21,18 @@ project_name = 'plasma-glib' sources = files( 'client.cpp', + 'object.cpp', ) c_headers = files( 'client.h', + 'object.h', 'plasma-glib.h', ) cpp_headers = files( 'client.hpp', + 'object.hpp', 'plasma-glib.hpp', ) @@ -41,13 +44,39 @@ dependencies = [ plasma, arrow_glib, ] +cpp_args = [ + '-DG_LOG_DOMAIN="Plasma"', +] +pkg_config_requires = [ + 'plasma', + 'arrow-glib', +] +gir_dependencies = [ + declare_dependency(sources: arrow_glib_gir), +] +gir_includes = [ + 'Arrow-1.0', +] +gir_extra_args = [ + '--warn-all', + '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', +] +if arrow_cuda.found() + dependencies += [arrow_cuda_glib] + cpp_args += ['-DHAVE_ARROW_CUDA'] + pkg_config_requires += ['arrow-cuda-glib'] + gir_dependencies += [declare_dependency(sources: arrow_cuda_glib_gir)] + gir_includes += ['ArrowCUDA-1.0'] + gir_extra_args += ['--include-uninstalled=./arrow-cuda-glib/ArrowCUDA-1.0.gir'] +endif libplasma_glib = library('plasma-glib', - sources: sources, - install: true, - dependencies: dependencies, - include_directories: base_include_directories, - soversion: so_version, - version: library_version) + sources: sources, + install: true, + dependencies: dependencies, + include_directories: base_include_directories, + cpp_args: cpp_args, + soversion: so_version, + version: library_version) plasma_glib = declare_dependency(link_with: libplasma_glib, include_directories: base_include_directories, dependencies: dependencies) @@ -56,22 +85,17 @@ pkgconfig.generate(filebase: project_name, name: 'Apache Arrow Plasma GLib', description: 'C API for Apache Arrow Plasma based on GLib', version: version, - requires: ['plasma', 'arrow-glib'], + requires: pkg_config_requires, libraries: [libplasma_glib]) gnome.generate_gir(libplasma_glib, - dependencies: declare_dependency(sources: arrow_glib_gir), + dependencies: gir_dependencies, sources: sources + c_headers, namespace: 'Plasma', nsversion: api_version, identifier_prefix: 'GPlasma', symbol_prefix: 'gplasma', export_packages: 'plasma-glib', - includes: [ - 'Arrow-1.0', - ], + includes: gir_includes, install: true, - extra_args: [ - '--warn-all', - '--include-uninstalled=./arrow-glib/Arrow-1.0.gir', - ]) + extra_args: gir_extra_args) diff --git a/c_glib/plasma-glib/object.cpp b/c_glib/plasma-glib/object.cpp new file mode 100644 index 0000000000000..f7afd7231f2e5 --- /dev/null +++ b/c_glib/plasma-glib/object.cpp @@ -0,0 +1,538 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include + +#include +#include + +G_BEGIN_DECLS + +/** + * SECTION: object + * @section_id: object-classes + * @title: Object related classes + * @include: plasma-glib/plasma-glib.h + * + * #GPlasmaObjectID is a class for an object ID. + * + * #GPlasmaObject is a base class for an object stored in the plasma store. + * + * #GPlasmaCreatedObject is a class for a created object. You can + * change the data of the object until the object is sealed or aborted. + * + * #GPlasmaReferredObject is a class for a referred object. You can + * only refer to the data and metadata of the object. You can't change + * the data of the object. + * + * Since: 0.12.0 + */ + +typedef struct GPlasmaObjectIDPrivate_ { + plasma::ObjectID id; +} GPlasmaObjectIDPrivate; + +G_DEFINE_TYPE_WITH_PRIVATE(GPlasmaObjectID, + gplasma_object_id, + G_TYPE_OBJECT) + +#define GPLASMA_OBJECT_ID_GET_PRIVATE(object) \ + static_cast( \ + gplasma_object_id_get_instance_private( \ + GPLASMA_OBJECT_ID(object))) + +static void +gplasma_object_id_init(GPlasmaObjectID *object) +{ +} + +static void +gplasma_object_id_class_init(GPlasmaObjectIDClass *klass) +{ +} + +/** + * gplasma_object_id_new: + * @id: (array length=size): The raw ID bytes. + * @size: The number of bytes of the ID. It must be 1..20. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Returns: (nullable): A newly created #GPlasmaObjectID on success, + * %NULL on error. + * + * Since: 0.12.0 + */ +GPlasmaObjectID * +gplasma_object_id_new(const guint8 *id, + gsize size, + GError **error) +{ + if (size == 0 || size > plasma::kUniqueIDSize) { + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "[plasma][object-id][new] " + "ID must be 1..20 bytes: <%" G_GSIZE_FORMAT ">", + size); + return NULL; + } + + auto object_id = g_object_new(GPLASMA_TYPE_OBJECT_ID, NULL); + auto priv = GPLASMA_OBJECT_ID_GET_PRIVATE(object_id); + memcpy(priv->id.mutable_data(), id, size); + if (size != plasma::kUniqueIDSize) { + memset(priv->id.mutable_data() + size, 0, plasma::kUniqueIDSize - size); + } + return GPLASMA_OBJECT_ID(object_id); +} + +/** + * gplasma_object_id_to_binary: + * @id: A #GPlasmaObjectID. + * @size: (nullable) (out): The number of bytes of the byte string of + * the object ID. It's always 20 (`plasma::kUniqueIDSize`). + * + * Returns: (array length=size): The byte string of the object ID. + * + * Since: 0.12.0 + */ +const guint8 * +gplasma_object_id_to_binary(GPlasmaObjectID *id, + gsize *size) +{ + auto priv = GPLASMA_OBJECT_ID_GET_PRIVATE(id); + if (size) { + *size = plasma::kUniqueIDSize; + } + return priv->id.data(); +} + +/** + * gplasma_object_id_to_hex: + * @id: A #GPlasmaObjectID. + * + * Returns: (transfer full): The hex representation of the object ID.
+ * + * It should be freed with g_free() when no longer needed. + * + * Since: 0.12.0 + */ +gchar * +gplasma_object_id_to_hex(GPlasmaObjectID *id) +{ + auto priv = GPLASMA_OBJECT_ID_GET_PRIVATE(id); + return g_strdup(priv->id.hex().c_str()); +} + +typedef struct GPlasmaObjectPrivate_ { + GPlasmaClient *client; + GPlasmaObjectID *id; + GArrowBuffer *data; + GArrowBuffer *metadata; + gint gpu_device; +} GPlasmaObjectPrivate; + +enum { + PROP_CLIENT = 1, + PROP_ID, + PROP_DATA, + PROP_METADATA, + PROP_GPU_DEVICE +}; + +G_DEFINE_TYPE_WITH_PRIVATE(GPlasmaObject, + gplasma_object, + G_TYPE_OBJECT) + +#define GPLASMA_OBJECT_GET_PRIVATE(object) \ + static_cast( \ + gplasma_object_get_instance_private( \ + GPLASMA_OBJECT(object))) + +static void +gplasma_object_dispose(GObject *object) +{ + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + + // Properties except priv->id must be disposed in subclass. + + if (priv->id) { + g_object_unref(priv->id); + priv->id = nullptr; + } + + G_OBJECT_CLASS(gplasma_object_parent_class)->dispose(object); +} + +static void +gplasma_object_set_property(GObject *object, + guint prop_id, + const GValue *value, + GParamSpec *pspec) +{ + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CLIENT: + priv->client = GPLASMA_CLIENT(g_value_dup_object(value)); + break; + case PROP_ID: + priv->id = GPLASMA_OBJECT_ID(g_value_dup_object(value)); + break; + case PROP_DATA: + priv->data = GARROW_BUFFER(g_value_dup_object(value)); + break; + case PROP_METADATA: + priv->metadata = GARROW_BUFFER(g_value_dup_object(value)); + break; + case PROP_GPU_DEVICE: + priv->gpu_device = g_value_get_int(value); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gplasma_object_get_property(GObject *object, + guint prop_id, + GValue *value, + GParamSpec *pspec) +{ + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + + switch (prop_id) { + case PROP_CLIENT: + g_value_set_object(value, priv->client); + break; + case PROP_ID: + g_value_set_object(value, priv->id); + break; + case PROP_DATA: + g_value_set_object(value, priv->data); + break; + case PROP_METADATA: + g_value_set_object(value, priv->metadata); + break; + case PROP_GPU_DEVICE: + g_value_set_int(value, priv->gpu_device); + break; + default: + G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); + break; + } +} + +static void +gplasma_object_init(GPlasmaObject *object) +{ +} + +static void +gplasma_object_class_init(GPlasmaObjectClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gplasma_object_dispose; + gobject_class->set_property = gplasma_object_set_property; + gobject_class->get_property = gplasma_object_get_property; + + GParamSpec *spec; + spec = g_param_spec_object("client", + "Client", + "The client", + GPLASMA_TYPE_CLIENT, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_CLIENT, spec); + + spec = g_param_spec_object("id", + "ID", + "The ID of this object", + GPLASMA_TYPE_OBJECT_ID, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_ID, spec); + + spec = g_param_spec_object("data", + "Data", + "The data of this object", + GARROW_TYPE_BUFFER, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_DATA, spec); + + spec = g_param_spec_object("metadata", + "Metadata", + "The metadata of this object", + 
GARROW_TYPE_BUFFER, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_METADATA, spec); + + spec = g_param_spec_int("gpu-device", + "GPU device", + "The GPU device number. -1 means GPU isn't used.", + -1, + G_MAXINT, + -1, + static_cast(G_PARAM_READWRITE | + G_PARAM_CONSTRUCT_ONLY)); + g_object_class_install_property(gobject_class, PROP_GPU_DEVICE, spec); +} + +static bool +gplasma_object_check_not_released(GPlasmaObjectPrivate *priv, + GError **error, + const gchar *context) +{ + if (priv->client) { + return true; + } + + auto id_priv = GPLASMA_OBJECT_ID_GET_PRIVATE(priv->id); + auto id_hex = id_priv->id.hex(); + g_set_error(error, + GARROW_ERROR, + GARROW_ERROR_INVALID, + "%s: Can't process released object: <%s>", + context, + id_hex.c_str()); + return false; +} + +static void +gplasma_object_release_resources(GPlasmaObjectPrivate *priv) +{ + if (priv->client) { + g_object_unref(priv->client); + priv->client = nullptr; + } + + if (priv->data) { + g_object_unref(priv->data); + priv->data = nullptr; + } + + if (priv->metadata) { + g_object_unref(priv->metadata); + priv->metadata = nullptr; + } +} + +G_DEFINE_TYPE(GPlasmaCreatedObject, + gplasma_created_object, + GPLASMA_TYPE_OBJECT) + +static void +gplasma_created_object_dispose(GObject *object) +{ + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + + if (priv->client) { + gplasma_created_object_abort(GPLASMA_CREATED_OBJECT(object), NULL); + } + + G_OBJECT_CLASS(gplasma_created_object_parent_class)->dispose(object); +} + +static void +gplasma_created_object_init(GPlasmaCreatedObject *object) +{ +} + +static void +gplasma_created_object_class_init(GPlasmaCreatedObjectClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gplasma_created_object_dispose; +} + +/** + * gplasma_created_object_seal: + * @object: A #GPlasmaCreatedObject. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Seals the object in the object store. You can't use the sealed + * object anymore. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 0.12.0 + */ +gboolean +gplasma_created_object_seal(GPlasmaCreatedObject *object, + GError **error) +{ + const auto context = "[plasma][created-object][seal]"; + + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + if (!gplasma_object_check_not_released(priv, error, context)) { + return FALSE; + } + + auto plasma_client = gplasma_client_get_raw(priv->client); + auto id_priv = GPLASMA_OBJECT_ID_GET_PRIVATE(priv->id); + auto status = plasma_client->Seal(id_priv->id); + auto success = garrow_error_check(error, status, context); + if (success) { + status = plasma_client->Release(id_priv->id); + success = garrow_error_check(error, status, context); + gplasma_object_release_resources(priv); + } + return success; +} + +/** + * gplasma_created_object_abort: + * @object: A #GPlasmaCreatedObject. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Aborts the object in the object store. You can't use the aborted + * object anymore. + * + * Returns: %TRUE on success, %FALSE on error. 
+ * + * Since: 0.12.0 + */ +gboolean +gplasma_created_object_abort(GPlasmaCreatedObject *object, + GError **error) +{ + const auto context = "[plasma][created-object][abort]"; + + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + if (!gplasma_object_check_not_released(priv, error, context)) { + return FALSE; + } + + auto plasma_client = gplasma_client_get_raw(priv->client); + auto id_priv = GPLASMA_OBJECT_ID_GET_PRIVATE(priv->id); + auto status = plasma_client->Release(id_priv->id); + auto success = garrow_error_check(error, status, context); + if (success) { + status = plasma_client->Abort(id_priv->id); + success = garrow_error_check(error, status, context); + gplasma_object_release_resources(priv); + } + return success; +} + + +G_DEFINE_TYPE(GPlasmaReferredObject, + gplasma_referred_object, + GPLASMA_TYPE_OBJECT) + +static void +gplasma_referred_object_dispose(GObject *object) +{ + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + + gplasma_object_release_resources(priv); + + G_OBJECT_CLASS(gplasma_referred_object_parent_class)->dispose(object); +} + +static void +gplasma_referred_object_init(GPlasmaReferredObject *object) +{ +} + +static void +gplasma_referred_object_class_init(GPlasmaReferredObjectClass *klass) +{ + auto gobject_class = G_OBJECT_CLASS(klass); + + gobject_class->dispose = gplasma_referred_object_dispose; +} + +/** + * gplasma_referred_object_release: + * @object: A #GPlasmaReferredObject. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Releases the object explicitly. The object is no longer valid. + * + * Returns: %TRUE on success, %FALSE on error. + * + * Since: 0.12.0 + */ +gboolean +gplasma_referred_object_release(GPlasmaReferredObject *object, + GError **error) +{ + const auto context = "[plasma][referred-object][release]"; + + auto priv = GPLASMA_OBJECT_GET_PRIVATE(object); + if (!gplasma_object_check_not_released(priv, error, context)) { + return FALSE; + } + + gplasma_object_release_resources(priv); + return TRUE; +} + +G_END_DECLS + +plasma::ObjectID +gplasma_object_id_get_raw(GPlasmaObjectID *id) +{ + auto priv = GPLASMA_OBJECT_ID_GET_PRIVATE(id); + return priv->id; +} + +GPlasmaCreatedObject * +gplasma_created_object_new_raw(GPlasmaClient *client, + GPlasmaObjectID *id, + GArrowBuffer *data, + GArrowBuffer *metadata, + gint gpu_device) +{ + auto object = g_object_new(GPLASMA_TYPE_CREATED_OBJECT, + "client", client, + "id", id, + "data", data, + "metadata", metadata, + "gpu-device", gpu_device, + NULL); + return GPLASMA_CREATED_OBJECT(object); +} + +GPlasmaReferredObject * +gplasma_referred_object_new_raw(GPlasmaClient *client, + GPlasmaObjectID *id, + GArrowBuffer *data, + GArrowBuffer *metadata, + gint gpu_device) +{ + auto object = g_object_new(GPLASMA_TYPE_REFERRED_OBJECT, + "client", client, + "id", id, + "data", data, + "metadata", metadata, + "gpu-device", gpu_device, + NULL); + return GPLASMA_REFERRED_OBJECT(object); +} diff --git a/c_glib/plasma-glib/object.h b/c_glib/plasma-glib/object.h new file mode 100644 index 0000000000000..46547d37b46e8 --- /dev/null +++ b/c_glib/plasma-glib/object.h @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#pragma once + +#include + +G_BEGIN_DECLS + +#define GPLASMA_TYPE_OBJECT_ID (gplasma_object_id_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaObjectID, + gplasma_object_id, + GPLASMA, + OBJECT_ID, + GObject) + +struct _GPlasmaObjectIDClass +{ + GObjectClass parent_class; +}; + +GPlasmaObjectID *gplasma_object_id_new(const guint8 *id, + gsize size, + GError **error); +const guint8 *gplasma_object_id_to_binary(GPlasmaObjectID *id, + gsize *size); +gchar *gplasma_object_id_to_hex(GPlasmaObjectID *id); + +#define GPLASMA_TYPE_OBJECT (gplasma_object_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaObject, + gplasma_object, + GPLASMA, + OBJECT, + GObject) + +struct _GPlasmaObjectClass +{ + GObjectClass parent_class; +}; + +#define GPLASMA_TYPE_CREATED_OBJECT (gplasma_created_object_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaCreatedObject, + gplasma_created_object, + GPLASMA, + CREATED_OBJECT, + GPlasmaObject) + +struct _GPlasmaCreatedObjectClass +{ + GPlasmaObjectClass parent_class; +}; + +gboolean gplasma_created_object_seal(GPlasmaCreatedObject *object, + GError **error); +gboolean gplasma_created_object_abort(GPlasmaCreatedObject *object, + GError **error); + +#define GPLASMA_TYPE_REFERRED_OBJECT (gplasma_referred_object_get_type()) +G_DECLARE_DERIVABLE_TYPE(GPlasmaReferredObject, + gplasma_referred_object, + GPLASMA, + REFERRED_OBJECT, + GPlasmaObject) + +struct _GPlasmaReferredObjectClass +{ + GPlasmaObjectClass parent_class; +}; + +gboolean gplasma_referred_object_release(GPlasmaReferredObject *object, + GError **error); + +G_END_DECLS diff --git a/c_glib/plasma-glib/object.hpp b/c_glib/plasma-glib/object.hpp new file mode 100644 index 0000000000000..9d598b2ed6b3a --- /dev/null +++ b/c_glib/plasma-glib/object.hpp @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#pragma once + +#include + +#include + +#include + +plasma::ObjectID +gplasma_object_id_get_raw(GPlasmaObjectID *id); + +GPlasmaCreatedObject * +gplasma_created_object_new_raw(GPlasmaClient *client, + GPlasmaObjectID *id, + GArrowBuffer *data, + GArrowBuffer *metadata, + gint gpu_device); + +GPlasmaReferredObject * +gplasma_referred_object_new_raw(GPlasmaClient *client, + GPlasmaObjectID *id, + GArrowBuffer *data, + GArrowBuffer *metadata, + gint gpu_device); diff --git a/c_glib/plasma-glib/plasma-glib.h b/c_glib/plasma-glib/plasma-glib.h index 33eed2cc6fc3c..2a6dd76ca35b2 100644 --- a/c_glib/plasma-glib/plasma-glib.h +++ b/c_glib/plasma-glib/plasma-glib.h @@ -20,3 +20,4 @@ #pragma once #include +#include diff --git a/c_glib/plasma-glib/plasma-glib.hpp b/c_glib/plasma-glib/plasma-glib.hpp index b0af4899ea3de..b2958c28f863d 100644 --- a/c_glib/plasma-glib/plasma-glib.hpp +++ b/c_glib/plasma-glib/plasma-glib.hpp @@ -22,3 +22,4 @@ #include #include +#include diff --git a/c_glib/plasma-glib/plasma-glib.pc.in b/c_glib/plasma-glib/plasma-glib.pc.in index 21f202c9b06fd..f3a82c237d0b9 100644 --- a/c_glib/plasma-glib/plasma-glib.pc.in +++ b/c_glib/plasma-glib/plasma-glib.pc.in @@ -25,4 +25,4 @@ Description: C API for Apache Arrow Plasma based on GLib Version: @VERSION@ Libs: -L${libdir} -lplasma-glib Cflags: -I${includedir} -Requires: plasma arrow-glib +Requires: plasma arrow-glib @ARROW_CUDA_GLIB_PACKAGE@ diff --git a/c_glib/test/plasma/test-plasma-client.rb b/c_glib/test/plasma/test-plasma-client.rb index aee2d037b3104..cbdce865f0132 100644 --- a/c_glib/test/plasma/test-plasma-client.rb +++ b/c_glib/test/plasma/test-plasma-client.rb @@ -16,20 +16,72 @@ # under the License. class TestPlasmaClient < Test::Unit::TestCase + include Helper::Omittable + def setup @store = nil omit("Plasma is required") unless defined?(::Plasma) @store = Helper::PlasmaStore.new @store.start + @client = Plasma::Client.new(@store.socket_path) end def teardown @store.stop if @store end - def test_new - assert_nothing_raised do - Plasma::Client.new(@store.socket_path) + sub_test_case("#create") do + def setup + super + + @id = Plasma::ObjectID.new("Hello") + @data = "World" + @metadata = "Metadata" + @options = Plasma::ClientCreateOptions.new + end + + test("no options") do + require_gi(1, 42, 0) + + object = @client.create(@id, @data.bytesize) + object.data.set_data(0, @data) + object.seal + + object = @client.refer_object(@id, -1) + assert_equal(@data, object.data.data.to_s) + end + + test("options: metadata") do + @options.set_metadata(@metadata) + object = @client.create(@id, 1, @options) + object.seal + + object = @client.refer_object(@id, -1) + assert_equal(@metadata, object.metadata.data.to_s) + end + + test("options: GPU device") do + omit("Arrow CUDA is required") unless defined?(::ArrowCUDA) + + gpu_device = 0 + + @options.gpu_device = gpu_device + @options.metadata = @metadata + object = @client.create(@id, @data.bytesize, @options) + object.data.copy_from_host(@data) + object.seal + + object = @client.refer_object(@id, -1) + assert_equal([ + gpu_device, + @data, + @metadata, + ], + [ + object.gpu_device, + object.data.copy_to_host(0, @data.bytesize).to_s, + object.metadata.copy_to_host(0, @metadata.bytesize).to_s, + ]) end end end diff --git a/c_glib/test/plasma/test-plasma-created-object.rb b/c_glib/test/plasma/test-plasma-created-object.rb new file mode 100644 index 0000000000000..54d6774790abe --- /dev/null +++ b/c_glib/test/plasma/test-plasma-created-object.rb @@ -0,0 +1,56 @@ +# Licensed to the
Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestPlasmaCreatedObject < Test::Unit::TestCase + def setup + @store = nil + omit("Plasma is required") unless defined?(::Plasma) + @store = Helper::PlasmaStore.new + @store.start + @client = Plasma::Client.new(@store.socket_path) + + @id = Plasma::ObjectID.new("Hello") + @data = "World" + @metadata = "Metadata" + @options = Plasma::ClientCreateOptions.new + @options.metadata = @metadata + @object = @client.create(@id, @data.bytesize, @options) + end + + def teardown + @store.stop if @store + end + + test("#seal") do + @object.data.set_data(0, @data) + @object.seal + + object = @client.refer_object(@id, -1) + assert_equal(@data, object.data.data.to_s) + end + + test("#abort") do + @object.data.set_data(0, @data) + assert_raise(Arrow::Error::PlasmaObjectExists) do + @client.create(@id, @data.bytesize, @options) + end + @object.abort + + object = @client.create(@id, @data.bytesize, @options) + object.abort + end +end diff --git a/c_glib/test/plasma/test-plasma-referred-object.rb b/c_glib/test/plasma/test-plasma-referred-object.rb new file mode 100644 index 0000000000000..f55c0b13c5603 --- /dev/null +++ b/c_glib/test/plasma/test-plasma-referred-object.rb @@ -0,0 +1,51 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
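The created-object tests above and the referred-object test that follows exercise one complete plasma object lifecycle. A minimal Ruby sketch of that flow, under the same GObject Introspection bindings the test suite loads (the socket path here is a hypothetical stand-in for the one Helper::PlasmaStore provides):

require "gi"
Arrow = GI.load("Arrow")
Plasma = GI.load("Plasma")

client = Plasma::Client.new("/tmp/plasma.sock")  # hypothetical socket path

id = Plasma::ObjectID.new("Hello")      # IDs shorter than 20 bytes are zero-padded
options = Plasma::ClientCreateOptions.new
options.metadata = "Metadata"

data = "World"
object = client.create(id, data.bytesize, options)
object.data.set_data(0, data)           # data is writable until sealed
object.seal                             # seal also releases the writable handle

referred = client.refer_object(id, -1)  # -1: wait without timeout
referred.data.data.to_s                 # => "World"
referred.metadata.data.to_s             # => "Metadata"
referred.release                        # further use raises Arrow::Error::Invalid

Until seal is called, a second create for the same ID raises Arrow::Error::PlasmaObjectExists, and abort discards the unsealed object instead, as the #abort test above shows.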
+ +class TestPlasmaReferredObject < Test::Unit::TestCase + def setup + @store = nil + omit("Plasma is required") unless defined?(::Plasma) + @store = Helper::PlasmaStore.new + @store.start + @client = Plasma::Client.new(@store.socket_path) + + @id = Plasma::ObjectID.new("Hello") + @data = "World" + @metadata = "Metadata" + @options = Plasma::ClientCreateOptions.new + @options.metadata = @metadata + object = @client.create(@id, @data.bytesize, @options) + object.data.set_data(0, @data) + object.seal + @object = @client.refer_object(@id, -1) + end + + def teardown + @store.stop if @store + end + + test("#release") do + @object.release + + message = "[plasma][referred-object][release]: " + message << "Can't process released object: <#{@id.to_hex}>" + error = Arrow::Error::Invalid.new(message) + assert_raise(error) do + @object.release + end + end +end diff --git a/c_glib/test/run-test.rb b/c_glib/test/run-test.rb index 238bb2d68af70..99d72f4289176 100755 --- a/c_glib/test/run-test.rb +++ b/c_glib/test/run-test.rb @@ -38,7 +38,7 @@ def initialize(data) end begin - ArrowGPU = GI.load("ArrowGPU") + ArrowCUDA = GI.load("ArrowCUDA") rescue GObjectIntrospection::RepositoryError::TypelibNotFound end diff --git a/c_glib/test/run-test.sh b/c_glib/test/run-test.sh index 96585ce653a74..d33555dd459e2 100755 --- a/c_glib/test/run-test.sh +++ b/c_glib/test/run-test.sh @@ -20,7 +20,7 @@ test_dir="$(cd $(dirname $0); pwd)" build_dir="$(cd .; pwd)" -modules="arrow-glib arrow-gpu-glib gandiva-glib parquet-glib plasma-glib" +modules="arrow-glib arrow-cuda-glib gandiva-glib parquet-glib plasma-glib" for module in ${modules}; do module_build_dir="${build_dir}/${module}" diff --git a/c_glib/test/test-boolean-array.rb b/c_glib/test/test-boolean-array.rb index ae22bce48b64a..ac07ec995ea32 100644 --- a/c_glib/test/test-boolean-array.rb +++ b/c_glib/test/test-boolean-array.rb @@ -44,7 +44,7 @@ def test_value end def test_values - require_gi_bindings(3, 1, 9) + require_gi_bindings(3, 3, 1) builder = Arrow::BooleanArrayBuilder.new builder.append(true) builder.append(false) diff --git a/c_glib/test/test-gpu-cuda.rb b/c_glib/test/test-cuda.rb similarity index 80% rename from c_glib/test/test-gpu-cuda.rb rename to c_glib/test/test-cuda.rb index 66ec19d424ec9..32d486ef8ba97 100644 --- a/c_glib/test/test-gpu-cuda.rb +++ b/c_glib/test/test-cuda.rb @@ -15,12 +15,12 @@ # specific language governing permissions and limitations # under the License. -class TestGPUCUDA < Test::Unit::TestCase +class TestCUDA < Test::Unit::TestCase include Helper::Buildable def setup - omit("Arrow GPU is required") unless defined?(::ArrowGPU) - @manager = ArrowGPU::CUDADeviceManager.new + omit("Arrow CUDA is required") unless defined?(::ArrowCUDA) + @manager = ArrowCUDA::DeviceManager.new omit("At least one GPU is required") if @manager.n_devices.zero? 
@context = @manager.get_context(0) end @@ -29,7 +29,7 @@ def setup def test_allocated_size allocated_size_before = @context.allocated_size size = 128 - buffer = ArrowGPU::CUDABuffer.new(@context, size) + buffer = ArrowCUDA::Buffer.new(@context, size) assert_equal(size, @context.allocated_size - allocated_size_before) end @@ -38,7 +38,7 @@ sub_test_case("Buffer") do def setup super - @buffer = ArrowGPU::CUDABuffer.new(@context, 128) + @buffer = ArrowCUDA::Buffer.new(@context, 128) end def test_copy @@ -50,19 +50,19 @@ def test_export @buffer.copy_from_host("Hello World") handle = @buffer.export serialized_handle = handle.serialize.data - Tempfile.open("arrow-gpu-cuda-export") do |output| + Tempfile.open("arrow-cuda-export") do |output| pid = spawn(RbConfig.ruby, "-e", <<-SCRIPT) require "gi" Gio = GI.load("Gio") Arrow = GI.load("Arrow") -ArrowGPU = GI.load("ArrowGPU") +ArrowCUDA = GI.load("ArrowCUDA") -manager = ArrowGPU::CUDADeviceManager.new +manager = ArrowCUDA::DeviceManager.new context = manager.get_context(0) serialized_handle = #{serialized_handle.to_s.dump} -handle = ArrowGPU::CUDAIPCMemoryHandle.new(serialized_handle) -buffer = ArrowGPU::CUDABuffer.new(context, handle) +handle = ArrowCUDA::IPCMemoryHandle.new(serialized_handle) +buffer = ArrowCUDA::Buffer.new(context, handle) File.open(#{output.path.dump}, "w") do |output| output.print(buffer.copy_to_host(0, 6).to_s) end @@ -85,7 +85,7 @@ def test_record_batch ] cpu_record_batch = Arrow::RecordBatch.new(schema, 1, columns) - buffer = ArrowGPU::CUDABuffer.new(@context, cpu_record_batch) + buffer = ArrowCUDA::Buffer.new(@context, cpu_record_batch) gpu_record_batch = buffer.read_record_batch(schema) assert_equal(cpu_record_batch.n_rows, gpu_record_batch.n_rows) @@ -94,16 +94,16 @@ sub_test_case("HostBuffer") do def test_new - buffer = ArrowGPU::CUDAHostBuffer.new(0, 128) + buffer = ArrowCUDA::HostBuffer.new(0, 128) assert_equal(128, buffer.size) end end sub_test_case("BufferInputStream") do def test_new - buffer = ArrowGPU::CUDABuffer.new(@context, 128) + buffer = ArrowCUDA::Buffer.new(@context, 128) buffer.copy_from_host("Hello World") - stream = ArrowGPU::CUDABufferInputStream.new(buffer) + stream = ArrowCUDA::BufferInputStream.new(buffer) begin assert_equal("Hello Worl", stream.read(5).copy_to_host(0, 10).to_s) ensure @@ -115,9 +115,9 @@ def test_new sub_test_case("BufferOutputStream") do def setup super - @buffer = ArrowGPU::CUDABuffer.new(@context, 128) + @buffer = ArrowCUDA::Buffer.new(@context, 128) @buffer.copy_from_host("\x00" * @buffer.size) - @stream = ArrowGPU::CUDABufferOutputStream.new(@buffer) + @stream = ArrowCUDA::BufferOutputStream.new(@buffer) end def cleanup diff --git a/c_glib/test/test-dense-union-array.rb b/c_glib/test/test-dense-union-array.rb new file mode 100644 index 0000000000000..fa73f8d4c0918 --- /dev/null +++ b/c_glib/test/test-dense-union-array.rb @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDenseUnionArray < Test::Unit::TestCase + include Helper::Buildable + + def setup + type_ids = build_int8_array([0, 1, nil, 1, 1]) + value_offsets = build_int32_array([0, 0, 0, 1, 2]) + fields = [ + build_int16_array([1]), + build_string_array(["a", "b", "c"]), + ] + @array = Arrow::DenseUnionArray.new(type_ids, value_offsets, fields) + end + + def test_value_data_type + fields = [ + Arrow::Field.new("0", Arrow::Int16DataType.new), + Arrow::Field.new("1", Arrow::StringDataType.new), + ] + assert_equal(Arrow::DenseUnionDataType.new(fields, [0, 1]), + @array.value_data_type) + end + + def test_field + assert_equal([ + build_int16_array([1]), + build_string_array(["a", "b", "c"]), + ], + [ + @array.get_field(0), + @array.get_field(1), + ]) + end +end diff --git a/c_glib/test/test-dense-union-data-type.rb b/c_glib/test/test-dense-union-data-type.rb new file mode 100644 index 0000000000000..0d1295423ebbb --- /dev/null +++ b/c_glib/test/test-dense-union-data-type.rb @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestDenseUnionDataType < Test::Unit::TestCase + def setup + fields = [ + Arrow::Field.new("number", Arrow::Int32DataType.new), + Arrow::Field.new("text", Arrow::StringDataType.new), + ] + @data_type = Arrow::DenseUnionDataType.new(fields, [2, 9]) + end + + def test_type + assert_equal(Arrow::Type::UNION, @data_type.id) + end + + def test_to_s + assert_equal("union[dense]", + @data_type.to_s) + end +end diff --git a/c_glib/test/test-record-batch-builder.rb b/c_glib/test/test-record-batch-builder.rb index 1bb72820a5860..030cc78a01555 100644 --- a/c_glib/test/test-record-batch-builder.rb +++ b/c_glib/test/test-record-batch-builder.rb @@ -17,6 +17,7 @@ class TestRecordBatchBuilder < Test::Unit::TestCase include Helper::Buildable + include Helper::Omittable def setup @fields = [ @@ -61,6 +62,7 @@ def test_too_large end def test_flush + require_gi_bindings(3, 3, 1) arrays = { "visible" => build_boolean_array([true, false, true]), "point" => build_int32_array([1, -1, 0]), diff --git a/c_glib/test/test-sparse-union-array.rb b/c_glib/test/test-sparse-union-array.rb new file mode 100644 index 0000000000000..721f95c1fbec6 --- /dev/null +++ b/c_glib/test/test-sparse-union-array.rb @@ -0,0 +1,49 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +class TestSparseUnionArray < Test::Unit::TestCase + include Helper::Buildable + + def setup + type_ids = build_int8_array([0, 1, nil, 1, 0]) + fields = [ + build_int16_array([1, nil, nil, nil, 5]), + build_string_array([nil, "b", nil, "d", nil]), + ] + @array = Arrow::SparseUnionArray.new(type_ids, fields) + end + + def test_value_data_type + fields = [ + Arrow::Field.new("0", Arrow::Int16DataType.new), + Arrow::Field.new("1", Arrow::StringDataType.new), + ] + assert_equal(Arrow::SparseUnionDataType.new(fields, [0, 1]), + @array.value_data_type) + end + + def test_field + assert_equal([ + build_int16_array([1, nil, nil, nil, 5]), + build_string_array([nil, "b", nil, "d", nil]), + ], + [ + @array.get_field(0), + @array.get_field(1), + ]) + end +end diff --git a/c_glib/test/test-sparse-union-data-type.rb b/c_glib/test/test-sparse-union-data-type.rb new file mode 100644 index 0000000000000..ff4ce72c274a3 --- /dev/null +++ b/c_glib/test/test-sparse-union-data-type.rb @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
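The dense- and sparse-union array tests above construct the same kind of data with two layouts. A helper-free Ruby sketch of both constructors (the build_* helpers in those tests wrap ordinary arrow-glib array builders; the builders are assumed to follow the same append/append_null/finish pattern as Arrow::BooleanArrayBuilder used elsewhere in this test suite):

def build_array(builder, values)
  values.each { |value| value.nil? ? builder.append_null : builder.append(value) }
  builder.finish
end

# Sparse layout: every child array is as long as type_ids; slots a
# child doesn't own are simply null.
sparse_type_ids = build_array(Arrow::Int8ArrayBuilder.new, [0, 1, nil, 1, 0])
sparse_fields = [
  build_array(Arrow::Int16ArrayBuilder.new, [1, nil, nil, nil, 5]),
  build_array(Arrow::StringArrayBuilder.new, [nil, "b", nil, "d", nil]),
]
sparse = Arrow::SparseUnionArray.new(sparse_type_ids, sparse_fields)
sparse.value_data_type.to_s  # => "union[sparse]"

# Dense layout: children are compact; value_offsets maps each slot to
# an index inside the child selected by that slot's type id.
dense_type_ids = build_array(Arrow::Int8ArrayBuilder.new, [0, 1, nil, 1, 1])
value_offsets = build_array(Arrow::Int32ArrayBuilder.new, [0, 0, 0, 1, 2])
dense_fields = [
  build_array(Arrow::Int16ArrayBuilder.new, [1]),
  build_array(Arrow::StringArrayBuilder.new, ["a", "b", "c"]),
]
dense = Arrow::DenseUnionArray.new(dense_type_ids, value_offsets, dense_fields)
dense.get_field(1)           # => the compact string child ["a", "b", "c"]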
+ +class TestSparseUnionDataType < Test::Unit::TestCase + def setup + fields = [ + Arrow::Field.new("number", Arrow::Int32DataType.new), + Arrow::Field.new("text", Arrow::StringDataType.new), + ] + @data_type = Arrow::SparseUnionDataType.new(fields, [2, 9]) + end + + def test_type + assert_equal(Arrow::Type::UNION, @data_type.id) + end + + def test_to_s + assert_equal("union[sparse]", + @data_type.to_s) + end +end diff --git a/c_glib/test/test-tensor.rb b/c_glib/test/test-tensor.rb index 4f18011c047d8..31f2556c4e604 100644 --- a/c_glib/test/test-tensor.rb +++ b/c_glib/test/test-tensor.rb @@ -66,12 +66,12 @@ def test_buffer end def test_shape - require_gi_bindings(3, 1, 2) + require_gi_bindings(3, 3, 1) assert_equal(@shape, @tensor.shape) end def test_strides - require_gi_bindings(3, 1, 2) + require_gi_bindings(3, 3, 1) assert_equal([4, 2, 1], @tensor.strides) end diff --git a/ci/conda_env_python.yml b/ci/conda_env_python.yml index 37ec65496ebcc..429851eb2f5ae 100644 --- a/ci/conda_env_python.yml +++ b/ci/conda_env_python.yml @@ -21,5 +21,6 @@ numpy pandas pytest python +rsync setuptools setuptools_scm diff --git a/ci/conda_env_sphinx.yml b/ci/conda_env_sphinx.yml new file mode 100644 index 0000000000000..af6b4077dd7fa --- /dev/null +++ b/ci/conda_env_sphinx.yml @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Requirements for building the documentation +breathe +doxygen +ipython +sphinx +sphinx_rtd_theme diff --git a/python/manylinux1/.dockerignore b/ci/conda_env_unix.yml similarity index 89% rename from python/manylinux1/.dockerignore rename to ci/conda_env_unix.yml index be421b169fad4..eeb90e48dce72 100644 --- a/python/manylinux1/.dockerignore +++ b/ci/conda_env_unix.yml @@ -15,4 +15,6 @@ # specific language governing permissions and limitations # under the License. 
-dist/ +# conda package dependencies specific to Unix-like environments (Linux and macOS) + +autoconf diff --git a/ci/docker_build_cpp.sh b/ci/docker_build_cpp.sh index f1cf43fd1c3ba..c6a46f22f714d 100755 --- a/ci/docker_build_cpp.sh +++ b/ci/docker_build_cpp.sh @@ -19,23 +19,23 @@ set -e set -o xtrace -# Arrow specific environment variables -export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX -export ARROW_HOME=$CONDA_PREFIX -export PARQUET_HOME=$CONDA_PREFIX +source_dir=${1:-/arrow/cpp} +build_dir=${2:-/build/cpp} +install_dir=${3:-${ARROW_HOME:-/usr/local}} # https://arrow.apache.org/docs/python/development.html#known-issues export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" -mkdir -p /build/cpp -pushd /build/cpp +mkdir -p ${build_dir} +pushd ${build_dir} cmake -GNinja \ -DCMAKE_BUILD_TYPE=${ARROW_BUILD_TYPE:-debug} \ - -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \ - -DARROW_ORC=ON \ - -DARROW_PLASMA=ON \ - -DARROW_PARQUET=ON \ + -DCMAKE_INSTALL_PREFIX=${install_dir} \ + -DCMAKE_INSTALL_LIBDIR=lib \ + -DARROW_ORC=${ARROW_ORC:-ON} \ + -DARROW_PLASMA=${ARROW_PLASMA:-ON} \ + -DARROW_PARQUET=${ARROW_PARQUET:-ON} \ -DARROW_HDFS=${ARROW_HDFS:-OFF} \ -DARROW_PYTHON=${ARROW_PYTHON:-OFF} \ -DARROW_BUILD_TESTS=${ARROW_BUILD_TESTS:-OFF} \ @@ -43,7 +43,7 @@ cmake -GNinja \ -DARROW_INSTALL_NAME_RPATH=${ARROW_INSTALL_NAME_RPATH:-ON} \ -DARROW_EXTRA_ERROR_CONTEXT=ON \ -DCMAKE_CXX_FLAGS=$CXXFLAGS \ - /arrow/cpp + ${source_dir} ninja ninja install diff --git a/ci/docker_build_python.sh b/ci/docker_build_python.sh index e89a0b44d6fd0..8ba8a1d66f1be 100755 --- a/ci/docker_build_python.sh +++ b/ci/docker_build_python.sh @@ -18,22 +18,20 @@ set -e -export ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX -export ARROW_HOME=$CONDA_PREFIX +source_dir=${1:-/arrow/python} +build_dir=${2:-/build/python} # For newer GCC per https://arrow.apache.org/docs/python/development.html#known-issues export CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" export PYARROW_CXXFLAGS=$CXXFLAGS export PYARROW_CMAKE_GENERATOR=Ninja +export PYARROW_BUILD_TYPE=${PYARROW_BUILD_TYPE:-debug} +export PYARROW_WITH_PARQUET=${PYARROW_WITH_PARQUET:-1} +export PYARROW_WITH_PLASMA=${PYARROW_WITH_PLASMA:-1} # Build pyarrow -pushd /arrow/python +pushd ${source_dir} -python setup.py build_ext \ - --build-temp=/build/python \ - --build-type=${PYARROW_BUILD_TYPE:-debug} \ - --with-parquet \ - --with-plasma \ - install +python setup.py build_ext --build-temp=${build_dir} install popd diff --git a/ci/docker_build_sphinx.sh b/ci/docker_build_sphinx.sh new file mode 100755 index 0000000000000..957804325adf1 --- /dev/null +++ b/ci/docker_build_sphinx.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -ex + +pushd /arrow/cpp/apidoc +doxygen +popd + +pushd /arrow/python +python setup.py build_sphinx -s ../docs/source --build-dir ../docs/_build +popd + +mkdir -p /arrow/site/asf-site/docs/latest +rsync -r /arrow/docs/_build/html/ /arrow/site/asf-site/docs/latest/ diff --git a/ci/travis_install_osx.sh b/ci/travis_install_osx.sh index 83ca4a70bc364..47d6a637f7d58 100755 --- a/ci/travis_install_osx.sh +++ b/ci/travis_install_osx.sh @@ -17,11 +17,22 @@ # specific language governing permissions and limitations # under the License. +set -x set -e if [ "$ARROW_CI_RUBY_AFFECTED" = "1" ]; then - brew update - brew upgrade python - brew uninstall postgis - brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile + brew_log_path=brew.log + function run_brew() { + echo brew "$@" >> ${brew_log_path} + if ! gtimeout --signal=KILL 5m brew "$@" >> ${brew_log_path} 2>&1; then + cat ${brew_log_path} + rm ${brew_log_path} + false + fi + } + run_brew update + run_brew upgrade python + run_brew uninstall postgis + run_brew bundle --file=$TRAVIS_BUILD_DIR/c_glib/Brewfile --verbose + rm ${brew_log_path} fi diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 608e1ce636524..e4290ed8ee026 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -61,11 +61,7 @@ conda install -y -q pip \ if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then # Install documentation dependencies - conda install -y -q \ - ipython \ - numpydoc \ - sphinx=1.7.9 \ - sphinx_rtd_theme + conda install -y -c conda-forge --file ci/conda_env_sphinx.yml fi # ARROW-2093: PyTorch increases the size of our conda dependency stack @@ -190,7 +186,10 @@ if [ "$ARROW_TRAVIS_COVERAGE" == "1" ]; then fi if [ "$ARROW_TRAVIS_PYTHON_DOCS" == "1" ] && [ "$PYTHON_VERSION" == "3.6" ]; then - cd doc + pushd ../cpp/apidoc + doxygen + popd + cd ../docs sphinx-build -q -b html -d _build/doctrees -W source _build/html fi diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8436e65ba8076..6deb339f4c2f0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -150,8 +150,8 @@ Pass multiple labels by dividing with semicolons") "Build the Arrow IPC extensions" ON) - option(ARROW_GPU - "Build the Arrow GPU extensions (requires CUDA installation)" + option(ARROW_CUDA + "Build the Arrow CUDA extensions (requires CUDA toolkit)" OFF) option(ARROW_ORC @@ -803,7 +803,9 @@ endif() if(ARROW_PARQUET) add_subdirectory(src/parquet) add_subdirectory(tools/parquet) - add_subdirectory(examples/parquet/low-level-api) + if (PARQUET_BUILD_EXAMPLES) + add_subdirectory(examples/parquet) + endif() endif() if(ARROW_GANDIVA) diff --git a/cpp/Dockerfile b/cpp/Dockerfile index 4ec8f0f3bf183..c4791019634c1 100644 --- a/cpp/Dockerfile +++ b/cpp/Dockerfile @@ -20,33 +20,35 @@ FROM ubuntu:18.04 # install build essentials RUN apt-get update -y -q && \ apt-get install -y -q --no-install-recommends \ - autoconf \ - automake \ - ca-certificates \ - ccache \ - g++ \ - gcc \ - git \ - ninja-build \ - pkg-config \ - wget + ca-certificates \ + ccache \ + g++ \ + gcc \ + git \ + ninja-build \ + pkg-config \ + wget # install conda and required packages ENV PATH=/opt/conda/bin:$PATH \ CONDA_PREFIX=/opt/conda ADD ci/docker_install_conda.sh \ ci/conda_env_cpp.yml \ + ci/conda_env_unix.yml \ /arrow/ci/ RUN arrow/ci/docker_install_conda.sh && \ conda install -c conda-forge \ - --file arrow/ci/conda_env_cpp.yml && \ + --file arrow/ci/conda_env_cpp.yml \ + --file arrow/ci/conda_env_unix.yml && \ conda clean --all ENV CC=gcc \ 
CXX=g++ \ - ARROW_BUILD_TESTS=ON + ARROW_BUILD_TESTS=ON \ + ARROW_BUILD_TOOLCHAIN=$CONDA_PREFIX \ + ARROW_HOME=$CONDA_PREFIX \ + PARQUET_HOME=$CONDA_PREFIX # build and test CMD arrow/ci/docker_build_cpp.sh && \ - cd /build/cpp && \ - ctest -j2 --output-on-failure -L unittest + cd /build/cpp && ctest -j2 --output-on-failure -L unittest diff --git a/cpp/Dockerfile.alpine b/cpp/Dockerfile.alpine new file mode 100644 index 0000000000000..3c412e613bc2c --- /dev/null +++ b/cpp/Dockerfile.alpine @@ -0,0 +1,47 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM alpine + +# install dependencies +RUN apk add --no-cache -q \ + autoconf \ + bash \ + bison \ + boost-dev \ + cmake \ + flex \ + g++ \ + gcc \ + git \ + gzip \ + make \ + musl-dev \ + ninja \ + wget \ + zlib-dev + +ENV CC=gcc \ + CXX=g++ \ + ARROW_ORC=OFF \ + ARROW_PARQUET=OFF \ + ARROW_BUILD_TESTS=ON \ + ARROW_HOME=/usr/local + +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + cd /build/cpp && ctest -j2 --output-on-failure -L unittest diff --git a/cpp/README.md b/cpp/README.md index fcf913723974b..394b23d69f8fc 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -204,13 +204,11 @@ The Python library must be built against the same Python version for which you are building pyarrow, e.g. Python 2.7 or Python 3.6. NumPy must also be installed. -### Building GPU extension library (optional) +### Building CUDA extension library (optional) -The optional `arrow_gpu` shared library can be built by passing -`-DARROW_GPU=on`. This requires a CUDA installation to build, and to use many -of the functions you must have a functioning GPU. Currently only CUDA -functionality is supported, though if there is demand we can also add OpenCL -interfaces in this library as needed. +The optional `arrow_cuda` shared library can be built by passing +`-DARROW_CUDA=on`. This requires a CUDA installation to build, and to use many +of the functions you must have a functioning CUDA-compatible GPU. The CUDA toolchain used to build the library can be customized by using the `$CUDA_HOME` environment variable. diff --git a/cpp/apidoc/Doxyfile b/cpp/apidoc/Doxyfile index 3ec9af9262622..e5285873c9e02 100644 --- a/cpp/apidoc/Doxyfile +++ b/cpp/apidoc/Doxyfile @@ -1919,7 +1919,7 @@ MAN_LINKS = NO # captures the structure of the code including all documentation. # The default value is: NO. -GENERATE_XML = NO +GENERATE_XML = YES # The XML_OUTPUT tag is used to specify where the XML pages will be put. 
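Aside on the README change above: once the library is built with `-DARROW_CUDA=on`, linking against `arrow_cuda` gives access to the device-memory API. The sketch below is not part of this patch; it assumes the `arrow/gpu/cuda_api.h` entry points of this era (`CudaDeviceManager`, `CudaContext`, `CudaBuffer`) and the `arrow::gpu` namespace, which this rename series may later move to `arrow::cuda`, so verify names and signatures against the installed headers.

    // Minimal sketch: allocate 1 KiB of device memory on GPU 0.
    // Assumes the arrow::gpu API shipped alongside this rename.
    #include <arrow/gpu/cuda_api.h>
    #include <arrow/status.h>

    #include <iostream>
    #include <memory>

    int main() {
      arrow::gpu::CudaDeviceManager* manager = nullptr;
      arrow::Status st = arrow::gpu::CudaDeviceManager::GetInstance(&manager);
      if (!st.ok()) {
        std::cerr << st.ToString() << std::endl;
        return 1;
      }

      std::shared_ptr<arrow::gpu::CudaContext> context;
      st = manager->GetContext(0, &context);
      if (!st.ok()) {
        std::cerr << st.ToString() << std::endl;
        return 1;
      }

      // Device allocation; the returned CudaBuffer behaves like an
      // arrow::Buffer whose data lives on the GPU.
      std::shared_ptr<arrow::gpu::CudaBuffer> buffer;
      st = context->Allocate(1024, &buffer);
      if (!st.ok()) {
        std::cerr << st.ToString() << std::endl;
        return 1;
      }
      std::cout << "allocated " << buffer->size() << " bytes on device" << std::endl;
      return 0;
    }

As the README caveat above notes, a functioning CUDA-compatible GPU is required at run time.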
If a
# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of
diff --git a/cpp/apidoc/index.md b/cpp/apidoc/index.md
index 46ee5003678dd..c887a74e64124 100644
--- a/cpp/apidoc/index.md
+++ b/cpp/apidoc/index.md
@@ -41,60 +41,3 @@ Table of Contents
 * [Convert a vector of row-wise data into an Arrow table](tutorials/row_wise_conversion.md)
 * [Using the Plasma In-Memory Object Store](tutorials/plasma.md)
 * [Use Plasma to Access Tensors from C++ in Python](tutorials/tensor_to_py.md)
-
-Getting Started
----------------
-
-The most basic structure in Arrow is an `arrow::Array`. It holds a sequence
-of values with known length all having the same type. It consists of the data
-itself and an additional bitmap that indicates if the corresponding entry of
-array is a null-value. Note that for array with zero null entries, we can omit
-this bitmap.
-
-As Arrow objects are immutable, there are classes provided that should help you
-build these objects. To build an array of `int64_t` elements, we can use the
-`arrow::Int64Builder`. In the following example, we build an array of the range
-1 to 8 where the element that should hold the number 4 is nulled.
-
-    Int64Builder builder;
-    builder.Append(1);
-    builder.Append(2);
-    builder.Append(3);
-    builder.AppendNull();
-    builder.Append(5);
-    builder.Append(6);
-    builder.Append(7);
-    builder.Append(8);
-
-    std::shared_ptr<arrow::Array> array;
-    builder.Finish(&array);
-
-The resulting Array (which can be casted to `arrow::Int64Array` if you want
-to access its values) then consists of two `arrow::Buffer`. The first one is
-the null bitmap holding a single byte with the bits `0|0|0|0|1|0|0|0`.
-As we use [least-significant bit (LSB) numbering](https://en.wikipedia.org/wiki/Bit_numbering)
-this indicates that the fourth entry in the array is null. The second
-buffer is simply an `int64_t` array containing all the above values.
-As the fourth entry is null, the value at that position in the buffer is
-undefined.
-
-    // Cast the Array to its actual type to access its data
-    std::shared_ptr<arrow::Int64Array> int64_array = std::static_pointer_cast<arrow::Int64Array>(array);
-
-    // Get the pointer to the null bitmap.
-    const uint8_t* null_bitmap = int64_array->null_bitmap_data();
-
-    // Get the pointer to the actual data
-    const int64_t* data = int64_array->raw_values();
-
-In the above example, we have yet skipped explaining two things in the code.
-On constructing the builder, we have passed `arrow::int64()` to it. This is
-the type information with which the resulting array will be annotated. In
-this simple form, it is solely a `std::shared_ptr<arrow::DataType>`
-instantiation.
-
-Furthermore, we have passed `arrow::default_memory_pool()` to the constructor.
-This `arrow::MemoryPool` is used for the allocations of heap memory. Besides
-tracking the amount of memory allocated, the allocator also ensures that the
-allocated memory regions are 64-byte aligned (as required by the Arrow
-specification).
diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake
index fb646dd6c1a74..916b9ebddb88e 100644
--- a/cpp/cmake_modules/BuildUtils.cmake
+++ b/cpp/cmake_modules/BuildUtils.cmake
@@ -368,6 +368,7 @@ endfunction()
 #
 # Arguments after the test name will be passed to set_tests_properties().
 #
+# \arg ENABLED if passed, add this unit test even if ARROW_BUILD_TESTS is off
 # \arg PREFIX a string to append to the name of the test executable.
For # example, if you have src/arrow/foo/bar-test.cc, then PREFIX "foo" will create # test executable foo-bar-test @@ -377,7 +378,7 @@ endfunction() # groups using the syntax unittest;GROUP2;GROUP3. Custom targets for the group # names must exist function(ADD_ARROW_TEST REL_TEST_NAME) - set(options NO_VALGRIND) + set(options NO_VALGRIND ENABLED) set(one_value_args) set(multi_value_args SOURCES STATIC_LINK_LIBS EXTRA_LINK_LIBS EXTRA_INCLUDES EXTRA_DEPENDENCIES LABELS PREFIX) @@ -398,7 +399,7 @@ function(ADD_ARROW_TEST REL_TEST_NAME) endif() endif() - if (NO_TESTS) + if (NO_TESTS AND NOT ARG_ENABLED) return() endif() get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) @@ -424,13 +425,13 @@ function(ADD_ARROW_TEST REL_TEST_NAME) if (ARG_STATIC_LINK_LIBS) # Customize link libraries - target_link_libraries(${TEST_NAME} ${ARG_STATIC_LINK_LIBS}) + target_link_libraries(${TEST_NAME} PRIVATE ${ARG_STATIC_LINK_LIBS}) else() - target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS}) + target_link_libraries(${TEST_NAME} PRIVATE ${ARROW_TEST_LINK_LIBS}) endif() if (ARG_EXTRA_LINK_LIBS) - target_link_libraries(${TEST_NAME} ${ARG_EXTRA_LINK_LIBS}) + target_link_libraries(${TEST_NAME} PRIVATE ${ARG_EXTRA_LINK_LIBS}) endif() if (ARG_EXTRA_INCLUDES) diff --git a/cpp/cmake_modules/CompilerInfo.cmake b/cpp/cmake_modules/CompilerInfo.cmake index 76f692b06dc13..faa12916b7273 100644 --- a/cpp/cmake_modules/CompilerInfo.cmake +++ b/cpp/cmake_modules/CompilerInfo.cmake @@ -21,14 +21,21 @@ if (NOT MSVC) set(COMPILER_GET_VERSION_SWITCH "-v") endif() -message(INFO "Compiler command: ${CMAKE_CXX_COMPILER}") +set(COMPILER_VERSION_COMMAND "${CMAKE_CXX_COMPILER}" "${COMPILER_GET_VERSION_SWITCH}") + +if (UNIX OR APPLE) + set(COMPILER_VERSION_COMMAND "env" "LANG=C" ${COMPILER_VERSION_COMMAND}) +endif() + +string(REPLACE ";" " " COMPILER_VERSION_COMMAND_STR "${COMPILER_VERSION_COMMAND}") +message(STATUS "Compiler command: ${COMPILER_VERSION_COMMAND_STR}") # Some gcc seem to output their version on stdout, most do it on stderr, simply # merge both pipes into a single variable -execute_process(COMMAND "${CMAKE_CXX_COMPILER}" ${COMPILER_GET_VERSION_SWITCH} +execute_process(COMMAND ${COMPILER_VERSION_COMMAND} OUTPUT_VARIABLE COMPILER_VERSION_FULL ERROR_VARIABLE COMPILER_VERSION_FULL) -message(INFO "Compiler version: ${COMPILER_VERSION_FULL}") -message(INFO "Compiler id: ${CMAKE_CXX_COMPILER_ID}") +message(STATUS "Compiler version: ${COMPILER_VERSION_FULL}") +message(STATUS "Compiler id: ${CMAKE_CXX_COMPILER_ID}") string(TOLOWER "${COMPILER_VERSION_FULL}" COMPILER_VERSION_FULL_LOWER) if(MSVC) diff --git a/cpp/cmake_modules/FindArrowCuda.cmake b/cpp/cmake_modules/FindArrowCuda.cmake index 8733b6167380a..bac148fa3b637 100644 --- a/cpp/cmake_modules/FindArrowCuda.cmake +++ b/cpp/cmake_modules/FindArrowCuda.cmake @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. 
-# - Find ARROW CUDA (arrow/gpu/cuda_api.h, libarrow_gpu.a, libarrow_gpu.so) +# - Find ARROW CUDA (arrow/gpu/cuda_api.h, libarrow_cuda.a, libarrow_cuda.so) # # This module requires Arrow from which it uses # ARROW_FOUND @@ -31,10 +31,6 @@ # ARROW_CUDA_SHARED_IMP_LIB, path to libarrow's import library (MSVC only) # ARROW_CUDA_FOUND, whether arrow has been found -# -# TODO(ARROW-3209): rename arrow/gpu to arrow/cuda, arrow_gpu to arrow_cuda -# - include(FindPkgConfig) include(GNUInstallDirs) @@ -63,14 +59,14 @@ if (NOT (ARROW_CUDA_INCLUDE_DIR STREQUAL ARROW_INCLUDE_DIR)) message(WARNING ${ARROW_CUDA_WARN_MSG}) endif() -find_library(ARROW_CUDA_LIB_PATH NAMES arrow_gpu +find_library(ARROW_CUDA_LIB_PATH NAMES arrow_cuda PATHS ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) get_filename_component(ARROW_CUDA_LIBS ${ARROW_CUDA_LIB_PATH} DIRECTORY) if (MSVC) - find_library(ARROW_CUDA_SHARED_LIBRARIES NAMES arrow_gpu + find_library(ARROW_CUDA_SHARED_LIBRARIES NAMES arrow_cuda PATHS ${ARROW_HOME} NO_DEFAULT_PATH PATH_SUFFIXES "bin" ) get_filename_component(ARROW_CUDA_SHARED_LIBS ${ARROW_CUDA_SHARED_LIBRARIES} PATH ) @@ -79,7 +75,7 @@ endif() if (ARROW_CUDA_INCLUDE_DIR AND ARROW_CUDA_LIBS) set(ARROW_CUDA_FOUND TRUE) - set(ARROW_CUDA_LIB_NAME arrow_gpu) + set(ARROW_CUDA_LIB_NAME arrow_cuda) if (MSVC) set(ARROW_CUDA_STATIC_LIB ${ARROW_CUDA_LIBS}/${ARROW_CUDA_LIB_NAME}${ARROW_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) set(ARROW_CUDA_SHARED_LIB ${ARROW_CUDA_SHARED_LIBS}/${ARROW_CUDA_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) diff --git a/cpp/cmake_modules/FindLLVM.cmake b/cpp/cmake_modules/FindLLVM.cmake index eb6afd6f90759..4094162a1d9dc 100644 --- a/cpp/cmake_modules/FindLLVM.cmake +++ b/cpp/cmake_modules/FindLLVM.cmake @@ -23,7 +23,8 @@ set(GANDIVA_LLVM_VERSION 6.0) find_package(LLVM ${GANDIVA_LLVM_VERSION} REQUIRED CONFIG HINTS /usr/local/opt/llvm - /usr/share) + /usr/share + ${LLVM_DIR}) message(STATUS "Found LLVM ${LLVM_PACKAGE_VERSION}") message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") diff --git a/cpp/cmake_modules/GandivaBuildUtils.cmake b/cpp/cmake_modules/GandivaBuildUtils.cmake deleted file mode 100644 index 521d6976b5803..0000000000000 --- a/cpp/cmake_modules/GandivaBuildUtils.cmake +++ /dev/null @@ -1,91 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -set(GANDIVA_TEST_LINK_LIBS - gtest_static - gtest_main_static - ${RE2_LIBRARY}) - -if (PTHREAD_LIBRARY) - set(GANDIVA_TEST_LINK_LIBS - ${GANDIVA_TEST_LINK_LIBS} - ${PTHREAD_LIBRARY}) -endif() - -# Add a unittest executable, with its dependencies. 
-function(add_gandiva_unit_test REL_TEST_NAME) - get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) - - add_executable(${TEST_NAME} ${REL_TEST_NAME} ${ARGN}) - if(${REL_TEST_NAME} MATCHES "llvm" OR - ${REL_TEST_NAME} MATCHES "expression_registry") - # If the unit test has llvm in its name, include llvm. - add_dependencies(${TEST_NAME} LLVM::LLVM_INTERFACE) - target_link_libraries(${TEST_NAME} PRIVATE LLVM::LLVM_INTERFACE) - endif() - - # Require toolchain to be built - add_dependencies(${TEST_NAME} arrow_dependencies) - - target_include_directories(${TEST_NAME} PRIVATE - ${CMAKE_SOURCE_DIR}/include - ${CMAKE_SOURCE_DIR}/src - ) - target_link_libraries(${TEST_NAME} - PRIVATE arrow_shared ${GANDIVA_TEST_LINK_LIBS} - ) - add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva,unittest ${TEST_NAME}) -endfunction(add_gandiva_unit_test REL_TEST_NAME) - -# Add a unittest executable for a precompiled file (used to generate IR) -function(add_precompiled_unit_test REL_TEST_NAME) - get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) - - add_executable(${TEST_NAME} ${REL_TEST_NAME} ${ARGN}) - # Require toolchain to be built - add_dependencies(${TEST_NAME} arrow_dependencies) - target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/src) - target_link_libraries(${TEST_NAME} - PRIVATE arrow_shared ${GANDIVA_TEST_LINK_LIBS} - ) - target_compile_definitions(${TEST_NAME} PRIVATE GANDIVA_UNIT_TEST=1) - add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) - set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva,unittest ${TEST_NAME}) -endfunction(add_precompiled_unit_test REL_TEST_NAME) - -# Add an integ executable, with its dependencies. -function(add_gandiva_integ_test REL_TEST_NAME GANDIVA_LIB) - get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) - - add_executable(${TEST_NAME}_${GANDIVA_LIB} ${REL_TEST_NAME} ${ARGN}) - target_include_directories(${TEST_NAME}_${GANDIVA_LIB} PRIVATE ${CMAKE_SOURCE_DIR}) - target_link_libraries(${TEST_NAME}_${GANDIVA_LIB} PRIVATE - ${GANDIVA_LIB} - ${GANDIVA_TEST_LINK_LIBS} - ) - - add_test(NAME ${TEST_NAME}_${GANDIVA_LIB} COMMAND ${TEST_NAME}_${GANDIVA_LIB}) - set_property(TEST ${TEST_NAME}_${GANDIVA_LIB} PROPERTY LABELS gandiva,integ ${TEST_NAME}_${GANDIVA_LIB}) -endfunction(add_gandiva_integ_test REL_TEST_NAME) - -function(prevent_in_source_builds) - file(TO_CMAKE_PATH "${PROJECT_BINARY_DIR}/CMakeLists.txt" LOC_PATH) - if(EXISTS "${LOC_PATH}") - message(FATAL_ERROR "Gandiva does not support in-source builds") - endif() -endfunction(prevent_in_source_builds) diff --git a/cpp/examples/parquet/CMakeLists.txt b/cpp/examples/parquet/CMakeLists.txt new file mode 100644 index 0000000000000..98c5cd9402bb7 --- /dev/null +++ b/cpp/examples/parquet/CMakeLists.txt @@ -0,0 +1,31 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+add_executable(parquet-low-level-example low-level-api/reader-writer.cc)
+add_executable(parquet-low-level-example2 low-level-api/reader-writer2.cc)
+target_include_directories(parquet-low-level-example PRIVATE low-level-api/)
+target_include_directories(parquet-low-level-example2 PRIVATE low-level-api/)
+target_link_libraries(parquet-low-level-example parquet_static)
+target_link_libraries(parquet-low-level-example2 parquet_static)
+
+add_executable(parquet-arrow-example parquet-arrow/src/reader-writer.cc)
+target_link_libraries(parquet-arrow-example parquet_shared)
+
+add_dependencies(parquet
+  parquet-low-level-example
+  parquet-low-level-example2
+  parquet-arrow-example)
diff --git a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
index 892ec92a591ed..d9e01acd3eea3 100644
--- a/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
+++ b/cpp/examples/parquet/parquet-arrow/CMakeLists.txt
@@ -32,15 +32,11 @@ set(CMAKE_CXX_STANDARD 11)
 # We require a C++11 compliant compiler
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 
-# First search the packages in the system. If they are not found, use CMake's
-# ExternalProject mechanism to build them locally.
+# Look for installed packages in the system
 find_package(Arrow)
 find_package(Parquet)
 
 include_directories(SYSTEM ${ARROW_INCLUDE_DIR} ${PARQUET_INCLUDE_DIR})
 
-add_executable(parquet-arrow-reader-writer src/reader-writer.cc)
-target_link_libraries(parquet-arrow-reader-writer ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB})
-if (ARROW_VENDORED)
-  add_dependencies(parquet-arrow-reader-writer arrow_ep)
-endif()
+add_executable(parquet-arrow-example src/reader-writer.cc)
+target_link_libraries(parquet-arrow-example ${PARQUET_SHARED_LIB} ${ARROW_SHARED_LIB})
diff --git a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
index 8154d7adef2ad..8d474486e7413 100644
--- a/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
+++ b/cpp/examples/parquet/parquet-arrow/src/reader-writer.cc
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
+#include <arrow/api.h>
 #include <arrow/io/api.h>
 #include <parquet/arrow/reader.h>
 #include <parquet/arrow/writer.h>
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index a56079fb2a271..6858f3c4c4fbe 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -19,6 +19,9 @@ set(ARROW_SRCS
   array.cc
   buffer.cc
   builder.cc
+  builder-adaptive.cc
+  builder-binary.cc
+  builder-dict.cc
   compare.cc
   memory_pool.cc
   pretty_print.cc
@@ -75,8 +78,8 @@ if (ARROW_COMPUTE)
   )
 endif()
 
-if (ARROW_GPU)
-  # IPC extensions required to build the GPU library
+if (ARROW_CUDA)
+  # IPC extensions required to build the CUDA library
   set(ARROW_IPC ON)
   add_subdirectory(gpu)
 endif()
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 586605831b3e5..1a88740a4ac08 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -331,7 +331,10 @@ class TestPrimitiveBuilder : public TestBuilder {
     ASSERT_TRUE(result->Equals(*expected));
   }
 
-  int64_t FlipValue(int64_t value) const { return ~value; }
+  void FlipValue(T* ptr) {
+    auto byteptr = reinterpret_cast<uint8_t*>(ptr);
+    *byteptr = static_cast<uint8_t>(~*byteptr);
+  }
 
  protected:
   std::unique_ptr<BuilderType> builder_;
@@ -414,8 +417,8 @@ void TestPrimitiveBuilder<Attrs>::RandomData(int64_t N, double pct_null) {
 }
 
 template <>
-int64_t TestPrimitiveBuilder<PBoolean>::FlipValue(int64_t value) const {
-  return !value;
+void TestPrimitiveBuilder<PBoolean>::FlipValue(T* ptr) {
+  *ptr = !*ptr;
 }
 
 template <>
@@ -559,8 +562,7 @@ TYPED_TEST(TestPrimitiveBuilder, Equality) {
   const int64_t first_valid_idx = std::distance(valid_bytes.begin(), first_valid);
   // This should be true with a very high probability, but might introduce flakiness
   ASSERT_LT(first_valid_idx, size - 1);
-  draws[first_valid_idx] = static_cast<T>(
-      this->FlipValue(*reinterpret_cast<int64_t*>(&draws[first_valid_idx])));
+  this->FlipValue(&draws[first_valid_idx]);
 
   ASSERT_OK(MakeArray(valid_bytes, draws, size, builder, &unequal_array));
 
   // test normal equality
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 0274c15f74f61..b34b53933314f 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -170,24 +170,34 @@ struct ARROW_EXPORT ArrayData {
   // Access a buffer's data as a typed C pointer
   template <typename T>
-  inline const T* GetValues(int i) const {
+  inline const T* GetValues(int i, int64_t absolute_offset) const {
     if (buffers[i]) {
-      return reinterpret_cast<const T*>(buffers[i]->data()) + offset;
+      return reinterpret_cast<const T*>(buffers[i]->data()) + absolute_offset;
     } else {
       return NULLPTR;
     }
   }
 
+  template <typename T>
+  inline const T* GetValues(int i) const {
+    return GetValues<T>(i, offset);
+  }
+
   // Access a buffer's data as a typed C pointer
   template <typename T>
-  inline T* GetMutableValues(int i) {
+  inline T* GetMutableValues(int i, int64_t absolute_offset) {
     if (buffers[i]) {
-      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + offset;
+      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + absolute_offset;
     } else {
       return NULLPTR;
     }
   }
 
+  template <typename T>
+  inline T* GetMutableValues(int i) {
+    return GetMutableValues<T>(i, offset);
+  }
+
   std::shared_ptr<DataType> type;
   int64_t length;
   int64_t null_count;
@@ -387,6 +397,7 @@ class ARROW_EXPORT PrimitiveArray : public FlatArray {
   const uint8_t* raw_values_;
 };
 
+/// Concrete Array class for numeric data.
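The `ArrayData::GetValues` hunk above splits pointer access into an explicit-offset overload plus a convenience overload that forwards the array's own `offset`. A minimal caller-side sketch of how the two relate (the function name `ReadValues` and the int32 typing are illustrative, not part of the patch; buffer index 1 is the values buffer of a primitive array):

    #include <arrow/array.h>
    #include <cstdint>

    const int32_t* ReadValues(const arrow::ArrayData& data) {
      // Convenience form: shifts by data.offset internally.
      const int32_t* a = data.GetValues<int32_t>(1);
      // Explicit form: the caller supplies the absolute offset, which lets
      // kernels reuse a precomputed position instead of re-adding offsets.
      const int32_t* b = data.GetValues<int32_t>(1, data.offset);
      return (a == b) ? a : nullptr;  // identical by construction
    }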
template class ARROW_EXPORT NumericArray : public PrimitiveArray { public: diff --git a/cpp/src/arrow/builder-adaptive.cc b/cpp/src/arrow/builder-adaptive.cc new file mode 100644 index 0000000000000..a715f469c7aa1 --- /dev/null +++ b/cpp/src/arrow/builder-adaptive.cc @@ -0,0 +1,405 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/int-util.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::AdaptiveIntBuilderBase; + +AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool) + : ArrayBuilder(int64(), pool), + data_(nullptr), + raw_data_(nullptr), + int_size_(1), + pending_pos_(0), + pending_has_nulls_(false) {} + +void AdaptiveIntBuilderBase::Reset() { + ArrayBuilder::Reset(); + data_.reset(); + raw_data_ = nullptr; + pending_pos_ = 0; + pending_has_nulls_ = false; +} + +Status AdaptiveIntBuilderBase::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + int64_t nbytes = capacity * int_size_; + if (capacity_ == 0) { + RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_)); + } else { + RETURN_NOT_OK(data_->Resize(nbytes)); + } + raw_data_ = reinterpret_cast(data_->mutable_data()); + + return ArrayBuilder::Resize(capacity); +} + +AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {} + +Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(CommitPendingData()); + + std::shared_ptr output_type; + switch (int_size_) { + case 1: + output_type = int8(); + break; + case 2: + output_type = int16(); + break; + case 4: + output_type = int32(); + break; + case 8: + output_type = int64(); + break; + default: + DCHECK(false); + return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); + } + + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get())); + + *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status AdaptiveIntBuilder::CommitPendingData() { + if (pending_pos_ == 0) { + return Status::OK(); + } + RETURN_NOT_OK(Reserve(pending_pos_)); + const uint8_t* valid_bytes = pending_has_nulls_ ? 
pending_valid_ : nullptr; + RETURN_NOT_OK(AppendValuesInternal(reinterpret_cast(pending_data_), + pending_pos_, valid_bytes)); + pending_has_nulls_ = false; + pending_pos_ = 0; + return Status::OK(); +} + +static constexpr int64_t kAdaptiveIntChunkSize = 8192; + +Status AdaptiveIntBuilder::AppendValuesInternal(const int64_t* values, int64_t length, + const uint8_t* valid_bytes) { + while (length > 0) { + // In case `length` is very large, we don't want to trash the cache by + // scanning it twice (first to detect int width, second to copy the data). + // Instead, process data in L2-cacheable chunks. + const int64_t chunk_size = std::min(length, kAdaptiveIntChunkSize); + + uint8_t new_int_size; + new_int_size = internal::DetectIntWidth(values, valid_bytes, chunk_size, int_size_); + + DCHECK_GE(new_int_size, int_size_); + if (new_int_size > int_size_) { + // This updates int_size_ + RETURN_NOT_OK(ExpandIntSize(new_int_size)); + } + + switch (int_size_) { + case 1: + internal::DowncastInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + case 2: + internal::DowncastInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + case 4: + internal::DowncastInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + case 8: + internal::DowncastInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + default: + DCHECK(false); + } + + // This updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, chunk_size); + values += chunk_size; + if (valid_bytes != nullptr) { + valid_bytes += chunk_size; + } + length -= chunk_size; + } + + return Status::OK(); +} + +Status AdaptiveUIntBuilder::CommitPendingData() { + if (pending_pos_ == 0) { + return Status::OK(); + } + RETURN_NOT_OK(Reserve(pending_pos_)); + const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr; + RETURN_NOT_OK(AppendValuesInternal(pending_data_, pending_pos_, valid_bytes)); + pending_has_nulls_ = false; + pending_pos_ = 0; + return Status::OK(); +} + +Status AdaptiveIntBuilder::AppendValues(const int64_t* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(CommitPendingData()); + RETURN_NOT_OK(Reserve(length)); + + return AppendValuesInternal(values, length, valid_bytes); +} + +template +typename std::enable_if= sizeof(new_type), Status>::type +AdaptiveIntBuilder::ExpandIntSizeInternal() { + return Status::OK(); +} + +#define __LESS(a, b) (a) < (b) +template +typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type +AdaptiveIntBuilder::ExpandIntSizeInternal() { + int_size_ = sizeof(new_type); + RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type))); + raw_data_ = reinterpret_cast(data_->mutable_data()); + const old_type* src = reinterpret_cast(raw_data_); + new_type* dst = reinterpret_cast(raw_data_); + + // By doing the backward copy, we ensure that no element is overriden during + // the copy process and the copy stays in-place. 
+ std::copy_backward(src, src + length_, dst + length_); + + return Status::OK(); +} +#undef __LESS + +template +Status AdaptiveIntBuilder::ExpandIntSizeN() { + switch (int_size_) { + case 1: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + case 2: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + case 4: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + case 8: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + default: + DCHECK(false); + } + return Status::OK(); +} + +Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) { + switch (new_int_size) { + case 1: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + case 2: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + case 4: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + case 8: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + default: + DCHECK(false); + } + return Status::OK(); +} + +AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool) + : AdaptiveIntBuilderBase(pool) {} + +Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr* out) { + RETURN_NOT_OK(CommitPendingData()); + + std::shared_ptr output_type; + switch (int_size_) { + case 1: + output_type = uint8(); + break; + case 2: + output_type = uint16(); + break; + case 4: + output_type = uint32(); + break; + case 8: + output_type = uint64(); + break; + default: + DCHECK(false); + return Status::NotImplemented("Only ints of size 1,2,4,8 are supported"); + } + + RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get())); + RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get())); + + *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status AdaptiveUIntBuilder::AppendValuesInternal(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes) { + while (length > 0) { + // See AdaptiveIntBuilder::AppendValuesInternal + const int64_t chunk_size = std::min(length, kAdaptiveIntChunkSize); + + uint8_t new_int_size; + new_int_size = internal::DetectUIntWidth(values, valid_bytes, chunk_size, int_size_); + + DCHECK_GE(new_int_size, int_size_); + if (new_int_size > int_size_) { + // This updates int_size_ + RETURN_NOT_OK(ExpandIntSize(new_int_size)); + } + + switch (int_size_) { + case 1: + internal::DowncastUInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + case 2: + internal::DowncastUInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + case 4: + internal::DowncastUInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + case 8: + internal::DowncastUInts(values, reinterpret_cast(raw_data_) + length_, + chunk_size); + break; + default: + DCHECK(false); + } + + // This updates length_ + ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, chunk_size); + values += chunk_size; + if (valid_bytes != nullptr) { + valid_bytes += chunk_size; + } + length -= chunk_size; + } + + return Status::OK(); +} + +Status AdaptiveUIntBuilder::AppendValues(const uint64_t* values, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + + return AppendValuesInternal(values, length, valid_bytes); +} + +template +typename std::enable_if= sizeof(new_type), Status>::type +AdaptiveUIntBuilder::ExpandIntSizeInternal() { + return Status::OK(); +} + +#define __LESS(a, b) (a) < (b) +template +typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type +AdaptiveUIntBuilder::ExpandIntSizeInternal() { + int_size_ = 
sizeof(new_type); + RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type))); + + old_type* src = reinterpret_cast(raw_data_); + new_type* dst = reinterpret_cast(raw_data_); + // By doing the backward copy, we ensure that no element is overriden during + // the copy process and the copy stays in-place. + std::copy_backward(src, src + length_, dst + length_); + + return Status::OK(); +} +#undef __LESS + +template +Status AdaptiveUIntBuilder::ExpandIntSizeN() { + switch (int_size_) { + case 1: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + case 2: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + case 4: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + case 8: + RETURN_NOT_OK((ExpandIntSizeInternal())); + break; + default: + DCHECK(false); + } + return Status::OK(); +} + +Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) { + switch (new_int_size) { + case 1: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + case 2: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + case 4: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + case 8: + RETURN_NOT_OK((ExpandIntSizeN())); + break; + default: + DCHECK(false); + } + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/builder-binary.cc b/cpp/src/arrow/builder-binary.cc new file mode 100644 index 0000000000000..c250837b4a3fa --- /dev/null +++ b/cpp/src/arrow/builder-binary.cc @@ -0,0 +1,315 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
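Before the binary builders: the adaptive builders above widen their storage in place inside `ExpandIntSizeInternal`, and the backward copy is what makes that safe, since every source element is read before the wider destination slot overwrites it. A standalone sketch of the same idea (`buf` and `narrow` are illustrative names, not Arrow API):

    // Widen four int8 values to int16 in place, back to front, mirroring
    // the std::copy_backward trick in ExpandIntSizeInternal above.
    #include <algorithm>
    #include <cstdint>
    #include <cstring>
    #include <iostream>
    #include <vector>

    int main() {
      // Storage already resized for the wider type; the first 4 bytes
      // currently hold the narrow int8 payload.
      std::vector<uint8_t> buf(4 * sizeof(int16_t));
      const int8_t narrow[4] = {1, 2, 3, 4};
      std::memcpy(buf.data(), narrow, sizeof(narrow));

      const int8_t* src = reinterpret_cast<const int8_t*>(buf.data());
      int16_t* dst = reinterpret_cast<int16_t*>(buf.data());
      // Copying from the back guarantees src[i] is consumed before the
      // two-byte dst[i] overwrites it.
      std::copy_backward(src, src + 4, dst + 4);

      for (int i = 0; i < 4; ++i) {
        std::cout << dst[i] << " ";  // prints: 1 2 3 4
      }
      std::cout << std::endl;
      return 0;
    }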
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/decimal.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// String and binary + +BinaryBuilder::BinaryBuilder(const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {} + +BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {} + +Status BinaryBuilder::Resize(int64_t capacity) { + DCHECK_LE(capacity, kListMaximumElements); + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + + // one more then requested for offsets + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t))); + return ArrayBuilder::Resize(capacity); +} + +Status BinaryBuilder::ReserveData(int64_t elements) { + if (value_data_length() + elements > value_data_capacity()) { + if (value_data_length() + elements > kBinaryMemoryLimit) { + return Status::CapacityError( + "Cannot reserve capacity larger than 2^31 - 1 for binary"); + } + RETURN_NOT_OK(value_data_builder_.Reserve(elements)); + } + return Status::OK(); +} + +Status BinaryBuilder::AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) { + std::stringstream ss; + ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have " + << num_bytes; + return Status::CapacityError(ss.str()); + } + return offsets_builder_.Append(static_cast(num_bytes)); +} + +Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { + RETURN_NOT_OK(Reserve(1)); + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(value_data_builder_.Append(value, length)); + + UnsafeAppendToBitmap(true); + return Status::OK(); +} + +Status BinaryBuilder::AppendNull() { + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(Reserve(1)); + + UnsafeAppendToBitmap(false); + return Status::OK(); +} + +Status BinaryBuilder::FinishInternal(std::shared_ptr* out) { + // Write final offset (values length) + RETURN_NOT_OK(AppendNextOffset()); + + // These buffers' padding zeroed by BufferBuilder + std::shared_ptr offsets, value_data; + RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets, value_data}, null_count_, + 0); + Reset(); + return Status::OK(); +} + +void BinaryBuilder::Reset() { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_data_builder_.Reset(); +} + +const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { + const int32_t* offsets = offsets_builder_.data(); + int32_t offset = offsets[i]; + if (i == (length_ - 1)) { + *out_length = static_cast(value_data_builder_.length()) - offset; + } else { + *out_length = offsets[i + 1] - offset; + } + return value_data_builder_.data() + offset; +} + +util::string_view BinaryBuilder::GetView(int64_t i) const { + const int32_t* offsets = offsets_builder_.data(); + int32_t offset = offsets[i]; + int32_t value_length; + if (i == (length_ - 1)) { + value_length = static_cast(value_data_builder_.length()) - offset; + } else { + 
value_length = offsets[i + 1] - offset; + } + return util::string_view( + reinterpret_cast(value_data_builder_.data() + offset), value_length); +} + +StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {} + +Status StringBuilder::AppendValues(const std::vector& values, + const uint8_t* valid_bytes) { + std::size_t total_length = std::accumulate( + values.begin(), values.end(), 0ULL, + [](uint64_t sum, const std::string& str) { return sum + str.size(); }); + RETURN_NOT_OK(Reserve(values.size())); + RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); + RETURN_NOT_OK(offsets_builder_.Reserve(values.size())); + + if (valid_bytes) { + for (std::size_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(AppendNextOffset()); + if (valid_bytes[i]) { + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast(values[i].data()), values[i].size())); + } + } + } else { + for (std::size_t i = 0; i < values.size(); ++i) { + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast(values[i].data()), values[i].size())); + } + } + + UnsafeAppendToBitmap(valid_bytes, values.size()); + return Status::OK(); +} + +Status StringBuilder::AppendValues(const char** values, int64_t length, + const uint8_t* valid_bytes) { + std::size_t total_length = 0; + std::vector value_lengths(length); + bool have_null_value = false; + for (int64_t i = 0; i < length; ++i) { + if (values[i]) { + auto value_length = strlen(values[i]); + value_lengths[i] = value_length; + total_length += value_length; + } else { + have_null_value = true; + } + } + RETURN_NOT_OK(Reserve(length)); + RETURN_NOT_OK(value_data_builder_.Reserve(total_length)); + RETURN_NOT_OK(offsets_builder_.Reserve(length)); + + if (valid_bytes) { + int64_t valid_bytes_offset = 0; + for (int64_t i = 0; i < length; ++i) { + RETURN_NOT_OK(AppendNextOffset()); + if (valid_bytes[i]) { + if (values[i]) { + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast(values[i]), value_lengths[i])); + } else { + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset); + UnsafeAppendToBitmap(false); + valid_bytes_offset = i + 1; + } + } + } + UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset); + } else { + if (have_null_value) { + std::vector valid_vector(length, 0); + for (int64_t i = 0; i < length; ++i) { + RETURN_NOT_OK(AppendNextOffset()); + if (values[i]) { + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast(values[i]), value_lengths[i])); + valid_vector[i] = 1; + } + } + UnsafeAppendToBitmap(valid_vector.data(), length); + } else { + for (int64_t i = 0; i < length; ++i) { + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(value_data_builder_.Append( + reinterpret_cast(values[i]), value_lengths[i])); + } + UnsafeAppendToBitmap(nullptr, length); + } + } + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Fixed width binary + +FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), + byte_width_(checked_cast(*type).byte_width()), + byte_builder_(pool) {} + +#ifndef NDEBUG +void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) { + DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder"; +} +#endif + +Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, + const uint8_t* valid_bytes) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, 
length); + return byte_builder_.Append(data, length * byte_width_); +} + +Status FixedSizeBinaryBuilder::AppendNull() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return byte_builder_.Advance(byte_width_); +} + +void FixedSizeBinaryBuilder::Reset() { + ArrayBuilder::Reset(); + byte_builder_.Reset(); +} + +Status FixedSizeBinaryBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_)); + return ArrayBuilder::Resize(capacity); +} + +Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + + null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const { + const uint8_t* data_ptr = byte_builder_.data(); + return data_ptr + i * byte_width_; +} + +util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const { + const uint8_t* data_ptr = byte_builder_.data(); + return util::string_view(reinterpret_cast(data_ptr + i * byte_width_), + byte_width_); +} + +// ---------------------------------------------------------------------- +// Decimal128Builder + +Decimal128Builder::Decimal128Builder(const std::shared_ptr& type, + MemoryPool* pool) + : FixedSizeBinaryBuilder(type, pool) {} + +Status Decimal128Builder::Append(const Decimal128& value) { + RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1)); + return FixedSizeBinaryBuilder::Append(value.ToBytes()); +} + +Status Decimal128Builder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); + + *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_); + + return Status::OK(); +} + +} // namespace arrow diff --git a/cpp/src/arrow/builder-dict.cc b/cpp/src/arrow/builder-dict.cc new file mode 100644 index 0000000000000..b021c3a9d37cc --- /dev/null +++ b/cpp/src/arrow/builder-dict.cc @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
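As a caller-side view of the offset discipline in builder-binary.cc above (one offset per `Append`, a null still advances the offsets, and `Finish` writes the final offset), a short usage sketch independent of this refactor:

    #include <arrow/api.h>

    #include <cstdint>
    #include <iostream>
    #include <memory>

    int main() {
      arrow::BinaryBuilder builder;
      // Each Append records one offset and then the raw bytes.
      arrow::Status st = builder.Append(reinterpret_cast<const uint8_t*>("foo"), 3);
      if (st.ok()) st = builder.AppendNull();  // offset advances, validity bit is 0
      if (st.ok()) st = builder.Append(reinterpret_cast<const uint8_t*>("ba"), 2);

      // Finish appends the final offset (total value bytes) and seals the
      // offsets and value buffers into an immutable BinaryArray.
      std::shared_ptr<arrow::Array> array;
      if (st.ok()) st = builder.Finish(&array);
      if (!st.ok()) {
        std::cerr << st.ToString() << std::endl;
        return 1;
      }
      std::cout << "length=" << array->length()
                << " nulls=" << array->null_count() << std::endl;  // length=3 nulls=1
      return 0;
    }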
+ +#include +#include +#include + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/builder.h" +#include "arrow/status.h" +#include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" +#include "arrow/util/logging.h" + +namespace arrow { + +using internal::checked_cast; + +// ---------------------------------------------------------------------- +// DictionaryBuilder + +template +class DictionaryBuilder::MemoTableImpl + : public internal::HashTraits::MemoTableType { + public: + using MemoTableType = typename internal::HashTraits::MemoTableType; + using MemoTableType::MemoTableType; +}; + +template +DictionaryBuilder::~DictionaryBuilder() {} + +template +DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), byte_width_(-1), values_builder_(pool) { + DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder"; +} + +DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, + MemoryPool* pool) + : ArrayBuilder(type, pool), values_builder_(pool) { + DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to DictionaryBuilder"; +} + +template <> +DictionaryBuilder::DictionaryBuilder( + const std::shared_ptr& type, MemoryPool* pool) + : ArrayBuilder(type, pool), + byte_width_(checked_cast(*type).byte_width()) {} + +template +void DictionaryBuilder::Reset() { + ArrayBuilder::Reset(); + values_builder_.Reset(); + memo_table_.reset(); + delta_offset_ = 0; +} + +template +Status DictionaryBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + if (capacity_ == 0) { + // Initialize hash table + // XXX should we let the user pass additional size heuristics? 
+ memo_table_.reset(new MemoTableImpl(0)); + delta_offset_ = 0; + } + RETURN_NOT_OK(values_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); +} + +Status DictionaryBuilder::Resize(int64_t capacity) { + RETURN_NOT_OK(CheckCapacity(capacity, capacity_)); + capacity = std::max(capacity, kMinBuilderCapacity); + + RETURN_NOT_OK(values_builder_.Resize(capacity)); + return ArrayBuilder::Resize(capacity); +} + +template +Status DictionaryBuilder::Append(const Scalar& value) { + RETURN_NOT_OK(Reserve(1)); + + auto memo_index = memo_table_->GetOrInsert(value); + RETURN_NOT_OK(values_builder_.Append(memo_index)); + + return Status::OK(); +} + +template +Status DictionaryBuilder::AppendNull() { + return values_builder_.AppendNull(); +} + +Status DictionaryBuilder::AppendNull() { return values_builder_.AppendNull(); } + +template +Status DictionaryBuilder::AppendArray(const Array& array) { + const auto& numeric_array = checked_cast&>(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(numeric_array.Value(i))); + } + } + return Status::OK(); +} + +Status DictionaryBuilder::AppendArray(const Array& array) { + for (int64_t i = 0; i < array.length(); i++) { + RETURN_NOT_OK(AppendNull()); + } + return Status::OK(); +} + +template +Status DictionaryBuilder::FinishInternal(std::shared_ptr* out) { + // Finalize indices array + RETURN_NOT_OK(values_builder_.FinishInternal(out)); + + // Generate dictionary array from hash table contents + std::shared_ptr dictionary; + std::shared_ptr dictionary_data; + + RETURN_NOT_OK(internal::DictionaryTraits::GetDictionaryArrayData( + pool_, type_, *memo_table_, delta_offset_, &dictionary_data)); + dictionary = MakeArray(dictionary_data); + + // Set type of array data to the right dictionary type + (*out)->type = std::make_shared((*out)->type, dictionary); + + // Update internals for further uses of this DictionaryBuilder + delta_offset_ = memo_table_->size(); + values_builder_.Reset(); + + return Status::OK(); +} + +Status DictionaryBuilder::FinishInternal(std::shared_ptr* out) { + std::shared_ptr dictionary = std::make_shared(0); + + RETURN_NOT_OK(values_builder_.FinishInternal(out)); + (*out)->type = std::make_shared((*out)->type, dictionary); + + return Status::OK(); +} + +// +// StringType and BinaryType specializations +// + +#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ + \ + template <> \ + Status DictionaryBuilder::AppendArray(const Array& array) { \ + using ArrayType = typename TypeTraits::ArrayType; \ + const ArrayType& binary_array = checked_cast(array); \ + for (int64_t i = 0; i < array.length(); i++) { \ + if (array.IsNull(i)) { \ + RETURN_NOT_OK(AppendNull()); \ + } else { \ + RETURN_NOT_OK(Append(binary_array.GetView(i))); \ + } \ + } \ + return Status::OK(); \ + } + +BINARY_DICTIONARY_SPECIALIZATIONS(StringType); +BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType); + +template <> +Status DictionaryBuilder::AppendArray(const Array& array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& typed_array = checked_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(typed_array.GetValue(i))); + } + } + return Status::OK(); +} + +template class DictionaryBuilder; +template class DictionaryBuilder; +template class DictionaryBuilder; +template class 
DictionaryBuilder<UInt64Type>;
+template class DictionaryBuilder<Int8Type>;
+template class DictionaryBuilder<Int16Type>;
+template class DictionaryBuilder<Int32Type>;
+template class DictionaryBuilder<Int64Type>;
+template class DictionaryBuilder<Date32Type>;
+template class DictionaryBuilder<Date64Type>;
+template class DictionaryBuilder<Time32Type>;
+template class DictionaryBuilder<Time64Type>;
+template class DictionaryBuilder<TimestampType>;
+template class DictionaryBuilder<FloatType>;
+template class DictionaryBuilder<DoubleType>;
+template class DictionaryBuilder<FixedSizeBinaryType>;
+template class DictionaryBuilder<BinaryType>;
+template class DictionaryBuilder<StringType>;
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 0e10be7ff46f4..aef4df05108b7 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -15,36 +15,30 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "arrow/builder.h"
 #include <algorithm>
 #include <cstdint>
 #include <cstring>
 #include <limits>
-#include <sstream>
 #include <utility>
 #include <vector>
 
 #include "arrow/array.h"
 #include "arrow/buffer.h"
+#include "arrow/builder.h"
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
 #include "arrow/util/checked_cast.h"
-#include "arrow/util/decimal.h"
-#include "arrow/util/hashing.h"
 #include "arrow/util/int-util.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
 
-using internal::AdaptiveIntBuilderBase;
 using internal::checked_cast;
 
-namespace {
-
-Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
+Status ArrayBuilder::TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
   if (buffer) {
     if (bytes_filled < buffer->size()) {
       // Trim buffer
@@ -59,8 +53,6 @@ Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer) {
   return Status::OK();
 }
 
-}  // namespace
-
 Status ArrayBuilder::AppendToBitmap(bool is_valid) {
   if (length_ == capacity_) {
     // If the capacity was not already a multiple of 2, do so here
@@ -80,13 +72,6 @@ Status ArrayBuilder::AppendToBitmap(const uint8_t* valid_bytes, int64_t length)
   return Status::OK();
 }
 
-static inline Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) {
-  if (new_capacity < 0) return Status::Invalid("Resize capacity must be positive");
-  if (new_capacity < old_capacity) return Status::Invalid("Resize cannot downsize");
-
-  return Status::OK();
-}
-
 Status ArrayBuilder::Resize(int64_t capacity) {
   // Target size of validity (null) bitmap data
   const int64_t new_bitmap_size = BitUtil::BytesForBits(capacity);
@@ -295,375 +280,6 @@ template class PrimitiveBuilder<HalfFloatType>;
 template class PrimitiveBuilder<FloatType>;
 template class PrimitiveBuilder<DoubleType>;
 
-AdaptiveIntBuilderBase::AdaptiveIntBuilderBase(MemoryPool* pool)
-    : ArrayBuilder(int64(), pool),
-      data_(nullptr),
-      raw_data_(nullptr),
-      int_size_(1),
-      pending_pos_(0),
-      pending_has_nulls_(false) {}
-
-void AdaptiveIntBuilderBase::Reset() {
-  ArrayBuilder::Reset();
-  data_.reset();
-  raw_data_ = nullptr;
-  pending_pos_ = 0;
-  pending_has_nulls_ = false;
-}
-
-Status AdaptiveIntBuilderBase::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
-
-  int64_t nbytes = capacity * int_size_;
-  if (capacity_ == 0) {
-    RETURN_NOT_OK(AllocateResizableBuffer(pool_, nbytes, &data_));
-  } else {
-    RETURN_NOT_OK(data_->Resize(nbytes));
-  }
-  raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-
-  return ArrayBuilder::Resize(capacity);
-}
-
-AdaptiveIntBuilder::AdaptiveIntBuilder(MemoryPool* pool) : AdaptiveIntBuilderBase(pool) {}
-
-Status AdaptiveIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(CommitPendingData());
-
-  std::shared_ptr<DataType> output_type;
-  switch (int_size_) {
-    case 1:
-      output_type = int8();
-      break;
-    case 2:
-      output_type = int16();
-      break;
-    case 4:
-      output_type = int32();
-      break;
-    case 8:
-      output_type = int64();
-      break;
-    default:
-      DCHECK(false);
-      return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
-  }
-
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
-
-  *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_);
-
-  data_ = null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-  return Status::OK();
-}
-
-Status AdaptiveIntBuilder::CommitPendingData() {
-  if (pending_pos_ == 0) {
-    return Status::OK();
-  }
-  RETURN_NOT_OK(Reserve(pending_pos_));
-  const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
-  RETURN_NOT_OK(AppendValuesInternal(reinterpret_cast<const int64_t*>(pending_data_),
-                                     pending_pos_, valid_bytes));
-  pending_has_nulls_ = false;
-  pending_pos_ = 0;
-  return Status::OK();
-}
-
-static constexpr int64_t kAdaptiveIntChunkSize = 8192;
-
-Status AdaptiveIntBuilder::AppendValuesInternal(const int64_t* values, int64_t length,
-                                                const uint8_t* valid_bytes) {
-  while (length > 0) {
-    // In case `length` is very large, we don't want to trash the cache by
-    // scanning it twice (first to detect int width, second to copy the data).
-    // Instead, process data in L2-cacheable chunks.
-    const int64_t chunk_size = std::min(length, kAdaptiveIntChunkSize);
-
-    uint8_t new_int_size;
-    new_int_size = internal::DetectIntWidth(values, valid_bytes, chunk_size, int_size_);
-
-    DCHECK_GE(new_int_size, int_size_);
-    if (new_int_size > int_size_) {
-      // This updates int_size_
-      RETURN_NOT_OK(ExpandIntSize(new_int_size));
-    }
-
-    switch (int_size_) {
-      case 1:
-        internal::DowncastInts(values, reinterpret_cast<int8_t*>(raw_data_) + length_,
-                               chunk_size);
-        break;
-      case 2:
-        internal::DowncastInts(values, reinterpret_cast<int16_t*>(raw_data_) + length_,
-                               chunk_size);
-        break;
-      case 4:
-        internal::DowncastInts(values, reinterpret_cast<int32_t*>(raw_data_) + length_,
-                               chunk_size);
-        break;
-      case 8:
-        internal::DowncastInts(values, reinterpret_cast<int64_t*>(raw_data_) + length_,
-                               chunk_size);
-        break;
-      default:
-        DCHECK(false);
-    }
-
-    // This updates length_
-    ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, chunk_size);
-    values += chunk_size;
-    if (valid_bytes != nullptr) {
-      valid_bytes += chunk_size;
-    }
-    length -= chunk_size;
-  }
-
-  return Status::OK();
-}
-
-Status AdaptiveUIntBuilder::CommitPendingData() {
-  if (pending_pos_ == 0) {
-    return Status::OK();
-  }
-  RETURN_NOT_OK(Reserve(pending_pos_));
-  const uint8_t* valid_bytes = pending_has_nulls_ ? pending_valid_ : nullptr;
-  RETURN_NOT_OK(AppendValuesInternal(pending_data_, pending_pos_, valid_bytes));
-  pending_has_nulls_ = false;
-  pending_pos_ = 0;
-  return Status::OK();
-}
-
-Status AdaptiveIntBuilder::AppendValues(const int64_t* values, int64_t length,
-                                        const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(CommitPendingData());
-  RETURN_NOT_OK(Reserve(length));
-
-  return AppendValuesInternal(values, length, valid_bytes);
-}
-
-template <typename new_type, typename old_type>
-typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
-AdaptiveIntBuilder::ExpandIntSizeInternal() {
-  return Status::OK();
-}
-
-#define __LESS(a, b) (a) < (b)
-template <typename new_type, typename old_type>
-typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
-AdaptiveIntBuilder::ExpandIntSizeInternal() {
-  int_size_ = sizeof(new_type);
-  RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type)));
-  raw_data_ = reinterpret_cast<uint8_t*>(data_->mutable_data());
-  const old_type* src = reinterpret_cast<old_type*>(raw_data_);
-  new_type* dst = reinterpret_cast<new_type*>(raw_data_);
-
-  // By doing the backward copy, we ensure that no element is overridden during
-  // the copy process and the copy stays in-place.
-  std::copy_backward(src, src + length_, dst + length_);
-
-  return Status::OK();
-}
-#undef __LESS
-
-template <typename new_type>
-Status AdaptiveIntBuilder::ExpandIntSizeN() {
-  switch (int_size_) {
-    case 1:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int8_t>()));
-      break;
-    case 2:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int16_t>()));
-      break;
-    case 4:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int32_t>()));
-      break;
-    case 8:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, int64_t>()));
-      break;
-    default:
-      DCHECK(false);
-  }
-  return Status::OK();
-}
-
-Status AdaptiveIntBuilder::ExpandIntSize(uint8_t new_int_size) {
-  switch (new_int_size) {
-    case 1:
-      RETURN_NOT_OK((ExpandIntSizeN<int8_t>()));
-      break;
-    case 2:
-      RETURN_NOT_OK((ExpandIntSizeN<int16_t>()));
-      break;
-    case 4:
-      RETURN_NOT_OK((ExpandIntSizeN<int32_t>()));
-      break;
-    case 8:
-      RETURN_NOT_OK((ExpandIntSizeN<int64_t>()));
-      break;
-    default:
-      DCHECK(false);
-  }
-  return Status::OK();
-}
-
-AdaptiveUIntBuilder::AdaptiveUIntBuilder(MemoryPool* pool)
-    : AdaptiveIntBuilderBase(pool) {}
-
-Status AdaptiveUIntBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  RETURN_NOT_OK(CommitPendingData());
-
-  std::shared_ptr<DataType> output_type;
-  switch (int_size_) {
-    case 1:
-      output_type = uint8();
-      break;
-    case 2:
-      output_type = uint16();
-      break;
-    case 4:
-      output_type = uint32();
-      break;
-    case 8:
-      output_type = uint64();
-      break;
-    default:
-      DCHECK(false);
-      return Status::NotImplemented("Only ints of size 1,2,4,8 are supported");
-  }
-
-  RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
-  RETURN_NOT_OK(TrimBuffer(length_ * int_size_, data_.get()));
-
-  *out = ArrayData::Make(output_type, length_, {null_bitmap_, data_}, null_count_);
-
-  data_ = null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-  return Status::OK();
-}
-
-Status AdaptiveUIntBuilder::AppendValuesInternal(const uint64_t* values, int64_t length,
-                                                 const uint8_t* valid_bytes) {
-  while (length > 0) {
-    // See AdaptiveIntBuilder::AppendValuesInternal
-    const int64_t chunk_size = std::min(length, kAdaptiveIntChunkSize);
-
-    uint8_t new_int_size;
-    new_int_size = internal::DetectUIntWidth(values, valid_bytes, chunk_size, int_size_);
-
-    DCHECK_GE(new_int_size, int_size_);
-    if (new_int_size > int_size_) {
-      // This updates int_size_
-      RETURN_NOT_OK(ExpandIntSize(new_int_size));
-    }
-
-    switch (int_size_) {
-      case 1:
-        internal::DowncastUInts(values, reinterpret_cast<uint8_t*>(raw_data_) + length_,
-                                chunk_size);
-        break;
-      case 2:
-        internal::DowncastUInts(values, reinterpret_cast<uint16_t*>(raw_data_) + length_,
-                                chunk_size);
-        break;
-      case 4:
-        internal::DowncastUInts(values, reinterpret_cast<uint32_t*>(raw_data_) + length_,
-                                chunk_size);
-        break;
-      case 8:
-        internal::DowncastUInts(values, reinterpret_cast<uint64_t*>(raw_data_) + length_,
-                                chunk_size);
-        break;
-      default:
-        DCHECK(false);
-    }
-
-    // This updates length_
-    ArrayBuilder::UnsafeAppendToBitmap(valid_bytes, chunk_size);
-    values += chunk_size;
-    if (valid_bytes != nullptr) {
-      valid_bytes += chunk_size;
-    }
-    length -= chunk_size;
-  }
-
-  return Status::OK();
-}
-
-Status AdaptiveUIntBuilder::AppendValues(const uint64_t* values, int64_t length,
-                                         const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(Reserve(length));
-
-  return AppendValuesInternal(values, length, valid_bytes);
-}
-
-template <typename new_type, typename old_type>
-typename std::enable_if<sizeof(old_type) >= sizeof(new_type), Status>::type
-AdaptiveUIntBuilder::ExpandIntSizeInternal() {
-  return Status::OK();
-}
-
-#define __LESS(a, b) (a) < (b)
-template <typename new_type, typename old_type>
-typename std::enable_if<__LESS(sizeof(old_type), sizeof(new_type)), Status>::type
-AdaptiveUIntBuilder::ExpandIntSizeInternal() {
-  int_size_ = sizeof(new_type);
-  RETURN_NOT_OK(Resize(data_->size() / sizeof(old_type)));
-
-  old_type* src = reinterpret_cast<old_type*>(raw_data_);
-  new_type* dst = reinterpret_cast<new_type*>(raw_data_);
-  // By doing the backward copy, we ensure that no element is overridden during
-  // the copy process and the copy stays in-place.
-  std::copy_backward(src, src + length_, dst + length_);
-
-  return Status::OK();
-}
-#undef __LESS
-
-template <typename new_type>
-Status AdaptiveUIntBuilder::ExpandIntSizeN() {
-  switch (int_size_) {
-    case 1:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint8_t>()));
-      break;
-    case 2:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint16_t>()));
-      break;
-    case 4:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint32_t>()));
-      break;
-    case 8:
-      RETURN_NOT_OK((ExpandIntSizeInternal<new_type, uint64_t>()));
-      break;
-    default:
-      DCHECK(false);
-  }
-  return Status::OK();
-}
-
-Status AdaptiveUIntBuilder::ExpandIntSize(uint8_t new_int_size) {
-  switch (new_int_size) {
-    case 1:
-      RETURN_NOT_OK((ExpandIntSizeN<uint8_t>()));
-      break;
-    case 2:
-      RETURN_NOT_OK((ExpandIntSizeN<uint16_t>()));
-      break;
-    case 4:
-      RETURN_NOT_OK((ExpandIntSizeN<uint32_t>()));
-      break;
-    case 8:
-      RETURN_NOT_OK((ExpandIntSizeN<uint64_t>()));
-      break;
-    default:
-      DCHECK(false);
-  }
-  return Status::OK();
-}
-
 BooleanBuilder::BooleanBuilder(MemoryPool* pool)
     : ArrayBuilder(boolean(), pool), data_(nullptr), raw_data_(nullptr) {}
 
@@ -792,219 +408,6 @@ Status BooleanBuilder::AppendValues(const std::vector<bool>& values) {
   return Status::OK();
 }
 
-// ----------------------------------------------------------------------
-// DictionaryBuilder
-
-template <typename T>
-class DictionaryBuilder<T>::MemoTableImpl
-    : public internal::HashTraits<T>::MemoTableType {
- public:
-  using MemoTableType = typename internal::HashTraits<T>::MemoTableType;
-  using MemoTableType::MemoTableType;
-};
-
-template <typename T>
-DictionaryBuilder<T>::~DictionaryBuilder() {}
-
-template <typename T>
-DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
-                                        MemoryPool* pool)
-    : ArrayBuilder(type, pool), byte_width_(-1), values_builder_(pool) {
-  DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder";
-}
-
-DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
-                                               MemoryPool* pool)
-    : ArrayBuilder(type, pool), values_builder_(pool) {
-  DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to DictionaryBuilder";
-}
-
-template <>
-DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
-    const std::shared_ptr<DataType>& type, MemoryPool* pool)
-    : ArrayBuilder(type, pool),
-      byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}
-
-template <typename T>
-void DictionaryBuilder<T>::Reset() {
-  ArrayBuilder::Reset();
-  values_builder_.Reset();
-  memo_table_.reset();
-  delta_offset_ = 0;
-}
-
-template <typename T>
-Status DictionaryBuilder<T>::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
-
-  if (capacity_ == 0) {
-    // Initialize hash table
-    // XXX should we let the user pass additional size heuristics?
-    memo_table_.reset(new MemoTableImpl(0));
-    delta_offset_ = 0;
-  }
-  RETURN_NOT_OK(values_builder_.Resize(capacity));
-  return ArrayBuilder::Resize(capacity);
-}
-
-Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  capacity = std::max(capacity, kMinBuilderCapacity);
-
-  RETURN_NOT_OK(values_builder_.Resize(capacity));
-  return ArrayBuilder::Resize(capacity);
-}
-
-template <typename T>
-Status DictionaryBuilder<T>::Append(const Scalar& value) {
-  RETURN_NOT_OK(Reserve(1));
-
-  auto memo_index = memo_table_->GetOrInsert(value);
-  RETURN_NOT_OK(values_builder_.Append(memo_index));
-
-  return Status::OK();
-}
-
-template <typename T>
-Status DictionaryBuilder<T>::AppendNull() {
-  return values_builder_.AppendNull();
-}
-
-Status DictionaryBuilder<NullType>::AppendNull() { return values_builder_.AppendNull(); }
-
-template <typename T>
-Status DictionaryBuilder<T>::AppendArray(const Array& array) {
-  const auto& numeric_array = checked_cast<const NumericArray<T>&>(array);
-  for (int64_t i = 0; i < array.length(); i++) {
-    if (array.IsNull(i)) {
-      RETURN_NOT_OK(AppendNull());
-    } else {
-      RETURN_NOT_OK(Append(numeric_array.Value(i)));
-    }
-  }
-  return Status::OK();
-}
-
-Status DictionaryBuilder<NullType>::AppendArray(const Array& array) {
-  for (int64_t i = 0; i < array.length(); i++) {
-    RETURN_NOT_OK(AppendNull());
-  }
-  return Status::OK();
-}
-
-template <typename T>
-Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  // Finalize indices array
-  RETURN_NOT_OK(values_builder_.FinishInternal(out));
-
-  // Generate dictionary array from hash table contents
-  std::shared_ptr<Array> dictionary;
-  std::shared_ptr<ArrayData> dictionary_data;
-
-  RETURN_NOT_OK(internal::DictionaryTraits<T>::GetDictionaryArrayData(
-      pool_, type_, *memo_table_, delta_offset_, &dictionary_data));
-  dictionary = MakeArray(dictionary_data);
-
-  // Set type of array data to the right dictionary type
-  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
-
-  // Update internals for further uses of this DictionaryBuilder
-  delta_offset_ = memo_table_->size();
-  values_builder_.Reset();
-
-  return Status::OK();
-}
-
-Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  std::shared_ptr<NullArray> dictionary = std::make_shared<NullArray>(0);
-
-  RETURN_NOT_OK(values_builder_.FinishInternal(out));
-  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
-
-  return Status::OK();
-}
-
-//
-// StringType and BinaryType specializations
-//
-
-#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                            \
-                                                                           \
-  template <>                                                              \
-  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {        \
-    using ArrayType = typename TypeTraits<Type>::ArrayType;                \
-    const ArrayType& binary_array = checked_cast<const ArrayType&>(array); \
-    for (int64_t i = 0; i < array.length(); i++) {                         \
-      if (array.IsNull(i)) {                                               \
-        RETURN_NOT_OK(AppendNull());                                       \
-      } else {                                                             \
-        RETURN_NOT_OK(Append(binary_array.GetView(i)));                    \
-      }                                                                    \
-    }                                                                      \
-    return Status::OK();                                                   \
-  }
-
-BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
-BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
-
-template <>
-Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) {
-  if (!type_->Equals(*array.type())) {
-    return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type");
-  }
-
-  const auto& typed_array = checked_cast<const FixedSizeBinaryArray&>(array);
-  for (int64_t i = 0; i < array.length(); i++) {
-    if (array.IsNull(i)) {
-      RETURN_NOT_OK(AppendNull());
-    } else {
-      RETURN_NOT_OK(Append(typed_array.GetValue(i)));
-    }
-  }
-  return Status::OK();
-}
-
-template class DictionaryBuilder<UInt8Type>;
-template class DictionaryBuilder<UInt16Type>;
-template class DictionaryBuilder<UInt32Type>;
-template class DictionaryBuilder<UInt64Type>;
-template class DictionaryBuilder<Int8Type>;
-template class DictionaryBuilder<Int16Type>;
-template class DictionaryBuilder<Int32Type>;
-template class DictionaryBuilder<Int64Type>;
-template class DictionaryBuilder<Date32Type>;
-template class DictionaryBuilder<Date64Type>;
-template class DictionaryBuilder<Time32Type>;
-template class DictionaryBuilder<Time64Type>;
-template class DictionaryBuilder<TimestampType>;
-template class DictionaryBuilder<FloatType>;
-template class DictionaryBuilder<DoubleType>;
-template class DictionaryBuilder<FixedSizeBinaryType>;
-template class DictionaryBuilder<BinaryType>;
-template class DictionaryBuilder<StringType>;
-
-// ----------------------------------------------------------------------
-// Decimal128Builder
-
-Decimal128Builder::Decimal128Builder(const std::shared_ptr<DataType>& type,
-                                     MemoryPool* pool)
-    : FixedSizeBinaryBuilder(type, pool) {}
-
-Status Decimal128Builder::Append(const Decimal128& value) {
-  RETURN_NOT_OK(FixedSizeBinaryBuilder::Reserve(1));
-  return FixedSizeBinaryBuilder::Append(value.ToBytes());
-}
-
-Status Decimal128Builder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  std::shared_ptr<Buffer> data;
-  RETURN_NOT_OK(byte_builder_.Finish(&data));
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_);
-
-  return Status::OK();
-}
-
 // ----------------------------------------------------------------------
 // ListBuilder
 
@@ -1088,257 +491,6 @@ ArrayBuilder* ListBuilder::value_builder() const { return value_builder_.get(); }
 
-// ----------------------------------------------------------------------
-// String and binary
-
-BinaryBuilder::BinaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool)
-    : ArrayBuilder(type, pool), offsets_builder_(pool), value_data_builder_(pool) {}
-
-BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(binary(), pool) {}
-
-Status BinaryBuilder::Resize(int64_t capacity) {
-  DCHECK_LE(capacity, kListMaximumElements);
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-
-  // one more than requested for offsets
-  RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int32_t)));
-  return ArrayBuilder::Resize(capacity);
-}
-
-Status BinaryBuilder::ReserveData(int64_t elements) {
-  if (value_data_length() + elements > value_data_capacity()) {
-    if (value_data_length() + elements > kBinaryMemoryLimit) {
-      return Status::CapacityError(
-          "Cannot reserve capacity larger than 2^31 - 1 for binary");
-    }
-    RETURN_NOT_OK(value_data_builder_.Reserve(elements));
-  }
-  return Status::OK();
-}
-
-Status BinaryBuilder::AppendNextOffset() {
-  const int64_t num_bytes = value_data_builder_.length();
-  if (ARROW_PREDICT_FALSE(num_bytes > kBinaryMemoryLimit)) {
-    std::stringstream ss;
-    ss << "BinaryArray cannot contain more than " << kBinaryMemoryLimit << " bytes, have "
-       << num_bytes;
-    return Status::CapacityError(ss.str());
-  }
-  return offsets_builder_.Append(static_cast<int32_t>(num_bytes));
-}
-
-Status BinaryBuilder::Append(const uint8_t* value, int32_t length) {
-  RETURN_NOT_OK(Reserve(1));
-  RETURN_NOT_OK(AppendNextOffset());
-  RETURN_NOT_OK(value_data_builder_.Append(value, length));
-
-  UnsafeAppendToBitmap(true);
-  return Status::OK();
-}
-
-Status BinaryBuilder::AppendNull() {
-  RETURN_NOT_OK(AppendNextOffset());
-  RETURN_NOT_OK(Reserve(1));
-
-  UnsafeAppendToBitmap(false);
-  return Status::OK();
-}
-
-Status BinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  // Write final offset (values length)
-  RETURN_NOT_OK(AppendNextOffset());
-
-  // These buffers' padding zeroed by BufferBuilder
-  std::shared_ptr<Buffer> offsets, value_data;
-  RETURN_NOT_OK(offsets_builder_.Finish(&offsets));
-  RETURN_NOT_OK(value_data_builder_.Finish(&value_data));
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, offsets, value_data}, null_count_,
-                         0);
-  Reset();
-  return Status::OK();
-}
-
-void BinaryBuilder::Reset() {
-  ArrayBuilder::Reset();
-  offsets_builder_.Reset();
-  value_data_builder_.Reset();
-}
-
-const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const {
-  const int32_t* offsets = offsets_builder_.data();
-  int32_t offset = offsets[i];
-  if (i == (length_ - 1)) {
-    *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
-  } else {
-    *out_length = offsets[i + 1] - offset;
-  }
-  return value_data_builder_.data() + offset;
-}
-
-util::string_view BinaryBuilder::GetView(int64_t i) const {
-  const int32_t* offsets = offsets_builder_.data();
-  int32_t offset = offsets[i];
-  int32_t value_length;
-  if (i == (length_ - 1)) {
-    value_length = static_cast<int32_t>(value_data_builder_.length()) - offset;
-  } else {
-    value_length = offsets[i + 1] - offset;
-  }
-  return util::string_view(
-      reinterpret_cast<const char*>(value_data_builder_.data() + offset), value_length);
-}
-
-StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(utf8(), pool) {}
-
-Status StringBuilder::AppendValues(const std::vector<std::string>& values,
-                                   const uint8_t* valid_bytes) {
-  std::size_t total_length = std::accumulate(
-      values.begin(), values.end(), 0ULL,
-      [](uint64_t sum, const std::string& str) { return sum + str.size(); });
-  RETURN_NOT_OK(Reserve(values.size()));
-  RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
-  RETURN_NOT_OK(offsets_builder_.Reserve(values.size()));
-
-  if (valid_bytes) {
-    for (std::size_t i = 0; i < values.size(); ++i) {
-      RETURN_NOT_OK(AppendNextOffset());
-      if (valid_bytes[i]) {
-        RETURN_NOT_OK(value_data_builder_.Append(
-            reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()));
-      }
-    }
-  } else {
-    for (std::size_t i = 0; i < values.size(); ++i) {
-      RETURN_NOT_OK(AppendNextOffset());
-      RETURN_NOT_OK(value_data_builder_.Append(
-          reinterpret_cast<const uint8_t*>(values[i].data()), values[i].size()));
-    }
-  }
-
-  UnsafeAppendToBitmap(valid_bytes, values.size());
-  return Status::OK();
-}
-
-Status StringBuilder::AppendValues(const char** values, int64_t length,
-                                   const uint8_t* valid_bytes) {
-  std::size_t total_length = 0;
-  std::vector<std::size_t> value_lengths(length);
-  bool have_null_value = false;
-  for (int64_t i = 0; i < length; ++i) {
-    if (values[i]) {
-      auto value_length = strlen(values[i]);
-      value_lengths[i] = value_length;
-      total_length += value_length;
-    } else {
-      have_null_value = true;
-    }
-  }
-  RETURN_NOT_OK(Reserve(length));
-  RETURN_NOT_OK(value_data_builder_.Reserve(total_length));
-  RETURN_NOT_OK(offsets_builder_.Reserve(length));
-
-  if (valid_bytes) {
-    int64_t valid_bytes_offset = 0;
-    for (int64_t i = 0; i < length; ++i) {
-      RETURN_NOT_OK(AppendNextOffset());
-      if (valid_bytes[i]) {
-        if (values[i]) {
-          RETURN_NOT_OK(value_data_builder_.Append(
-              reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
-        } else {
-          UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, i - valid_bytes_offset);
-          UnsafeAppendToBitmap(false);
-          valid_bytes_offset = i + 1;
-        }
-      }
-    }
-    UnsafeAppendToBitmap(valid_bytes + valid_bytes_offset, length - valid_bytes_offset);
-  } else {
-    if (have_null_value) {
-      std::vector<uint8_t> valid_vector(length, 0);
-      for (int64_t i = 0; i < length; ++i) {
-        RETURN_NOT_OK(AppendNextOffset());
-        if (values[i]) {
-          RETURN_NOT_OK(value_data_builder_.Append(
-              reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
-          valid_vector[i] = 1;
-        }
-      }
-      UnsafeAppendToBitmap(valid_vector.data(), length);
-    } else {
-      for (int64_t i = 0; i < length; ++i) {
-        RETURN_NOT_OK(AppendNextOffset());
-        RETURN_NOT_OK(value_data_builder_.Append(
-            reinterpret_cast<const uint8_t*>(values[i]), value_lengths[i]));
-      }
-      UnsafeAppendToBitmap(nullptr, length);
-    }
-  }
-  return Status::OK();
-}
-
-// ----------------------------------------------------------------------
-// Fixed width binary
-
-FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr<DataType>& type,
-                                               MemoryPool* pool)
-    : ArrayBuilder(type, pool),
-      byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()),
-      byte_builder_(pool) {}
-
-#ifndef NDEBUG
-void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) {
-  DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder";
-}
-#endif
-
-Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length,
-                                            const uint8_t* valid_bytes) {
-  RETURN_NOT_OK(Reserve(length));
-  UnsafeAppendToBitmap(valid_bytes, length);
-  return byte_builder_.Append(data, length * byte_width_);
-}
-
-Status FixedSizeBinaryBuilder::AppendNull() {
-  RETURN_NOT_OK(Reserve(1));
-  UnsafeAppendToBitmap(false);
-  return byte_builder_.Advance(byte_width_);
-}
-
-void FixedSizeBinaryBuilder::Reset() {
-  ArrayBuilder::Reset();
-  byte_builder_.Reset();
-}
-
-Status FixedSizeBinaryBuilder::Resize(int64_t capacity) {
-  RETURN_NOT_OK(CheckCapacity(capacity, capacity_));
-  RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_));
-  return ArrayBuilder::Resize(capacity);
-}
-
-Status FixedSizeBinaryBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
-  std::shared_ptr<Buffer> data;
-  RETURN_NOT_OK(byte_builder_.Finish(&data));
-
-  *out = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_);
-
-  null_bitmap_ = nullptr;
-  capacity_ = length_ = null_count_ = 0;
-  return Status::OK();
-}
-
-const uint8_t* FixedSizeBinaryBuilder::GetValue(int64_t i) const {
-  const uint8_t* data_ptr = byte_builder_.data();
-  return data_ptr + i * byte_width_;
-}
-
-util::string_view FixedSizeBinaryBuilder::GetView(int64_t i) const {
-  const uint8_t* data_ptr = byte_builder_.data();
-  return util::string_view(reinterpret_cast<const char*>(data_ptr + i * byte_width_),
-                           byte_width_);
-}
-
 // ----------------------------------------------------------------------
 // Struct
 
@@ -1352,6 +504,7 @@ void StructBuilder::Reset() {
     field_builder->Reset();
   }
 }
+
 Status StructBuilder::FinishInternal(std::shared_ptr<ArrayData>* out) {
   RETURN_NOT_OK(TrimBuffer(BitUtil::BytesForBits(length_), null_bitmap_.get()));
   *out = ArrayData::Make(type_, length_, {null_bitmap_}, null_count_);
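// For context on the String/binary builders removed from builder.cc above:
// each variable-width value is tracked by an int32 offsets buffer (one entry
// more than the value count), a raw value-bytes buffer, and the validity
// bitmap inherited from ArrayBuilder. The public API is unchanged by the
// move. A minimal usage sketch, assuming only the BinaryBuilder interface
// visible in this diff (Append/AppendNull/Finish) and the RETURN_NOT_OK
// macro from arrow/status.h:

#include <memory>

#include "arrow/array.h"
#include "arrow/builder.h"
#include "arrow/status.h"

// Builds the binary array ["ab", null, "c"]. Append writes the next offset
// plus the raw bytes; AppendNull records a null slot without value bytes;
// Finish seals offsets/values/validity into an immutable Array.
arrow::Status BuildBinaryExample(std::shared_ptr<arrow::Array>* out) {
  arrow::BinaryBuilder builder;
  RETURN_NOT_OK(builder.Append(reinterpret_cast<const uint8_t*>("ab"), 2));
  RETURN_NOT_OK(builder.AppendNull());
  RETURN_NOT_OK(builder.Append(reinterpret_cast<const uint8_t*>("c"), 1));
  return builder.Finish(out);
}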
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 34cac55202cc2..34398eebebfb6 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -193,6 +193,18 @@ class ARROW_EXPORT ArrayBuilder {
   // Set the next length bits to not null (i.e. valid).
   void UnsafeSetNotNull(int64_t length);
 
+  static Status TrimBuffer(const int64_t bytes_filled, ResizableBuffer* buffer);
+
+  static Status CheckCapacity(int64_t new_capacity, int64_t old_capacity) {
+    if (new_capacity < 0) {
+      return Status::Invalid("Resize capacity must be positive");
+    }
+    if (new_capacity < old_capacity) {
+      return Status::Invalid("Resize cannot downsize");
+    }
+    return Status::OK();
+  }
+
   std::shared_ptr<DataType> type_;
   MemoryPool* pool_;
diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc
index 821569e3f524c..52fc58809604c 100644
--- a/cpp/src/arrow/compute/compute-test.cc
+++ b/cpp/src/arrow/compute/compute-test.cc
@@ -111,8 +111,10 @@ class TestCast : public ComputeFixture, public TestBase {
   void CheckCase(const shared_ptr<DataType>& in_type, const vector<I_TYPE>& in_values,
                  const vector<bool>& is_valid, const shared_ptr<DataType>& out_type,
                  const vector<O_TYPE>& out_values, const CastOptions& options) {
+    DCHECK_EQ(in_values.size(), out_values.size());
     shared_ptr<Array> input, expected;
     if (is_valid.size() > 0) {
+      DCHECK_EQ(is_valid.size(), out_values.size());
       ArrayFromVector<InType, I_TYPE>(in_type, is_valid, in_values, &input);
       ArrayFromVector<OutType, O_TYPE>(out_type, is_valid, out_values, &expected);
     } else {
@@ -1056,6 +1058,37 @@ TEST_F(TestCast, StringToNumberErrors) {
   CheckFails<StringType, std::string>(utf8(), {"z"}, is_valid, float32(), options);
 }
 
+TEST_F(TestCast, StringToTimestamp) {
+  CastOptions options;
+
+  vector<bool> is_valid = {true, false, true};
+  vector<std::string> strings = {"1970-01-01", "xxx", "2000-02-29"};
+
+  auto type = timestamp(TimeUnit::SECOND);
+  vector<int64_t> e = {0, 0, 951782400};
+  CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
+                                                             type, e, options);
+
+  type = timestamp(TimeUnit::MICRO);
+  e = {0, 0, 951782400000000LL};
+  CheckCase<StringType, std::string, TimestampType, int64_t>(utf8(), strings, is_valid,
+                                                             type, e, options);
+
+  // NOTE: timestamp parsing is tested comprehensively in parsing-util-test.cc
+}
+
+TEST_F(TestCast, StringToTimestampErrors) {
+  CastOptions options;
+
+  vector<bool> is_valid = {true};
+
+  for (auto unit : {TimeUnit::SECOND, TimeUnit::MILLI, TimeUnit::MICRO, TimeUnit::NANO}) {
+    auto type = timestamp(unit);
+    CheckFails<StringType, std::string>(utf8(), {""}, is_valid, type, options);
+    CheckFails<StringType, std::string>(utf8(), {"xxx"}, is_valid, type, options);
+  }
+}
+
 template <typename TestType>
 class TestDictionaryCast : public TestCast {};
diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc
index cd45b2d35275e..4f7d7f822b3ab 100644
--- a/cpp/src/arrow/compute/kernels/cast.cc
+++ b/cpp/src/arrow/compute/kernels/cast.cc
@@ -980,6 +980,35 @@ struct CastFunctor
 
+template <>
+struct CastFunctor<TimestampType, StringType> {
+  void operator()(FunctionContext* ctx, const CastOptions& options,
+                  const ArrayData& input, ArrayData* output) {
+    using out_type = TimestampType::c_type;
+
+    StringArray input_array(input.Copy());
+    auto out_data = output->GetMutableValues<out_type>(1);
+    internal::StringConverter<TimestampType> converter(output->type);
+
+    for (int64_t i = 0; i < input.length; ++i, ++out_data) {
+      if (input_array.IsNull(i)) {
+        continue;
+      }
+
+      auto str = input_array.GetView(i);
+      if (!converter(str.data(), str.length(), out_data)) {
+        std::stringstream ss;
+        ss << "Failed to cast String '" << str << "' into " << output->type->ToString();
+        ctx->SetStatus(Status(StatusCode::Invalid, ss.str()));
+        return;
+      }
+    }
+  }
+};
+
 // ----------------------------------------------------------------------
 
 typedef std::function* out) const { }
 /// XXX(wesm): arrow::ipc::ReadSchema in its current form will not suffice
 /// for reading schemas with dictionaries.
See ARROW-3144 - io::BufferReader schema_reader(reinterpret_cast(data_.schema.c_str()), - static_cast(data_.schema.size())); + io::BufferReader schema_reader(data_.schema); RETURN_NOT_OK(ipc::ReadSchema(&schema_reader, &schema_)); reconstructed_schema_ = true; *out = schema_; diff --git a/cpp/src/arrow/gpu/CMakeLists.txt b/cpp/src/arrow/gpu/CMakeLists.txt index ed4c125297771..60407acb0a1ec 100644 --- a/cpp/src/arrow/gpu/CMakeLists.txt +++ b/cpp/src/arrow/gpu/CMakeLists.txt @@ -16,7 +16,7 @@ # under the License. ####################################### -# arrow_gpu +# arrow_cuda ####################################### if (DEFINED ENV{CUDA_HOME}) @@ -28,28 +28,28 @@ include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) message(STATUS "CUDA Libraries: ${CUDA_LIBRARIES}") -set(ARROW_GPU_SRCS +set(ARROW_CUDA_SRCS cuda_arrow_ipc.cc cuda_context.cc cuda_memory.cc ) -set(ARROW_GPU_SHARED_LINK_LIBS +set(ARROW_CUDA_SHARED_LINK_LIBS ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ) -ADD_ARROW_LIB(arrow_gpu - SOURCES ${ARROW_GPU_SRCS} - OUTPUTS ARROW_GPU_LIBRARIES +ADD_ARROW_LIB(arrow_cuda + SOURCES ${ARROW_CUDA_SRCS} + OUTPUTS ARROW_CUDA_LIBRARIES DEPENDENCIES metadata_fbs SHARED_LINK_FLAGS "" - SHARED_LINK_LIBS arrow_shared ${ARROW_GPU_SHARED_LINK_LIBS} - # Static arrow_gpu must also link against CUDA shared libs - STATIC_LINK_LIBS ${ARROW_GPU_SHARED_LINK_LIBS} + SHARED_LINK_LIBS arrow_shared ${ARROW_CUDA_SHARED_LINK_LIBS} + # Static arrow_cuda must also link against CUDA shared libs + STATIC_LINK_LIBS ${ARROW_CUDA_SHARED_LINK_LIBS} ) -foreach(LIB_TARGET ${ARROW_GPU_LIBRARIES}) +foreach(LIB_TARGET ${ARROW_CUDA_LIBRARIES}) target_compile_definitions(${LIB_TARGET} PRIVATE ARROW_EXPORTING) endforeach() @@ -71,28 +71,28 @@ install(FILES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/gpu") # pkg-config support -configure_file(arrow-gpu.pc.in - "${CMAKE_CURRENT_BINARY_DIR}/arrow-gpu.pc" +configure_file(arrow-cuda.pc.in + "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" @ONLY) install( - FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-gpu.pc" + FILES "${CMAKE_CURRENT_BINARY_DIR}/arrow-cuda.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") -set(ARROW_GPU_TEST_LINK_LIBS - arrow_gpu_shared +set(ARROW_CUDA_TEST_LINK_LIBS + arrow_cuda_shared ${ARROW_TEST_LINK_LIBS}) if (ARROW_BUILD_TESTS) ADD_ARROW_TEST(cuda-test - STATIC_LINK_LIBS ${ARROW_GPU_TEST_LINK_LIBS} + STATIC_LINK_LIBS ${ARROW_CUDA_TEST_LINK_LIBS} NO_VALGRIND) endif() if (ARROW_BUILD_BENCHMARKS) cuda_add_executable(cuda-benchmark cuda-benchmark.cc) target_link_libraries(cuda-benchmark - arrow_gpu_shared + arrow_cuda_shared gtest_static ${ARROW_BENCHMARK_LINK_LIBS}) endif() diff --git a/cpp/src/arrow/gpu/arrow-gpu.pc.in b/cpp/src/arrow/gpu/arrow-cuda.pc.in similarity index 89% rename from cpp/src/arrow/gpu/arrow-gpu.pc.in rename to cpp/src/arrow/gpu/arrow-cuda.pc.in index 3889d03b204ca..858096f892270 100644 --- a/cpp/src/arrow/gpu/arrow-gpu.pc.in +++ b/cpp/src/arrow/gpu/arrow-cuda.pc.in @@ -18,9 +18,9 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ -Name: Apache Arrow GPU -Description: GPU integration library for Apache Arrow +Name: Apache Arrow CUDA +Description: CUDA integration library for Apache Arrow Version: @ARROW_VERSION@ Requires: arrow -Libs: -L${libdir} -larrow_gpu +Libs: -L${libdir} -larrow_cuda Cflags: -I${includedir} diff --git a/cpp/src/arrow/gpu/cuda-benchmark.cc b/cpp/src/arrow/gpu/cuda-benchmark.cc index 8b3723d838797..9889373d09c75 100644 --- a/cpp/src/arrow/gpu/cuda-benchmark.cc +++ b/cpp/src/arrow/gpu/cuda-benchmark.cc @@ 
-28,7 +28,7 @@ #include "arrow/gpu/cuda_api.h" namespace arrow { -namespace gpu { +namespace cuda { constexpr int64_t kGpuNumber = 0; @@ -94,5 +94,5 @@ BENCHMARK(BM_Writer_Unbuffered) ->MinTime(1.0) ->UseRealTime(); -} // namespace gpu +} // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda-test.cc b/cpp/src/arrow/gpu/cuda-test.cc index cb375458004a0..5d85a81a23641 100644 --- a/cpp/src/arrow/gpu/cuda-test.cc +++ b/cpp/src/arrow/gpu/cuda-test.cc @@ -29,7 +29,7 @@ #include "arrow/gpu/cuda_api.h" namespace arrow { -namespace gpu { +namespace cuda { constexpr int kGpuNumber = 0; @@ -323,7 +323,7 @@ TEST_F(TestCudaArrowIpc, BasicWriteRead) { ASSERT_OK(ipc::MakeIntRecordBatch(&batch)); std::shared_ptr device_serialized; - ASSERT_OK(arrow::gpu::SerializeRecordBatch(*batch, context_.get(), &device_serialized)); + ASSERT_OK(SerializeRecordBatch(*batch, context_.get(), &device_serialized)); // Test that ReadRecordBatch works properly std::shared_ptr device_batch; @@ -343,5 +343,5 @@ TEST_F(TestCudaArrowIpc, BasicWriteRead) { CompareBatch(*batch, *cpu_batch); } -} // namespace gpu +} // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc index a7262c8b4d4ba..03256a1f52c70 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.cc +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.cc @@ -38,7 +38,7 @@ namespace arrow { namespace flatbuf = org::apache::arrow::flatbuf; -namespace gpu { +namespace cuda { Status SerializeRecordBatch(const RecordBatch& batch, CudaContext* ctx, std::shared_ptr* out) { @@ -106,5 +106,5 @@ Status ReadRecordBatch(const std::shared_ptr& schema, return ipc::ReadRecordBatch(*message, schema, out); } -} // namespace gpu +} // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_arrow_ipc.h b/cpp/src/arrow/gpu/cuda_arrow_ipc.h index 52dd92473eaec..4eb85e797c77b 100644 --- a/cpp/src/arrow/gpu/cuda_arrow_ipc.h +++ b/cpp/src/arrow/gpu/cuda_arrow_ipc.h @@ -39,7 +39,7 @@ class Message; } // namespace ipc -namespace gpu { +namespace cuda { /// \brief Write record batch message to GPU device memory /// \param[in] batch record batch to write @@ -71,7 +71,7 @@ Status ReadRecordBatch(const std::shared_ptr& schema, const std::shared_ptr& buffer, MemoryPool* pool, std::shared_ptr* out); -} // namespace gpu +} // namespace cuda } // namespace arrow #endif // ARROW_GPU_CUDA_ARROW_IPC_H diff --git a/cpp/src/arrow/gpu/cuda_common.h b/cpp/src/arrow/gpu/cuda_common.h index c06c1a21ff481..a53dd220adda0 100644 --- a/cpp/src/arrow/gpu/cuda_common.h +++ b/cpp/src/arrow/gpu/cuda_common.h @@ -25,7 +25,7 @@ #include namespace arrow { -namespace gpu { +namespace cuda { #define CUDA_DCHECK(STMT) \ do { \ @@ -45,7 +45,7 @@ namespace gpu { } \ } while (0) -} // namespace gpu +} // namespace cuda } // namespace arrow #endif // ARROW_GPU_CUDA_COMMON_H diff --git a/cpp/src/arrow/gpu/cuda_context.cc b/cpp/src/arrow/gpu/cuda_context.cc index 566ae6f878330..9e95040837bf5 100644 --- a/cpp/src/arrow/gpu/cuda_context.cc +++ b/cpp/src/arrow/gpu/cuda_context.cc @@ -28,8 +28,9 @@ #include "arrow/gpu/cuda_common.h" #include "arrow/gpu/cuda_memory.h" + namespace arrow { -namespace gpu { +namespace cuda { struct CudaDevice { int device_num; @@ -342,5 +343,5 @@ void* CudaContext::handle() const { return impl_->context_handle(); } int CudaContext::device_number() const { return impl_->device().device_num; } -} // namespace gpu +} // namespace cuda } // namespace arrow diff --git a/cpp/src/arrow/gpu/cuda_context.h 
b/cpp/src/arrow/gpu/cuda_context.h
index e59273e5624f7..9a67cea8975d1 100644
--- a/cpp/src/arrow/gpu/cuda_context.h
+++ b/cpp/src/arrow/gpu/cuda_context.h
@@ -27,7 +27,7 @@
 #include "arrow/gpu/cuda_memory.h"
 
 namespace arrow {
-namespace gpu {
+namespace cuda {
 
 // Forward declaration
 class CudaContext;
@@ -138,7 +138,7 @@ class ARROW_EXPORT CudaContext : public std::enable_shared_from_this<CudaContext>
   AllocateHost(device_number, size, out);
 }
 
-}  // namespace gpu
+}  // namespace cuda
 }  // namespace arrow
diff --git a/cpp/src/arrow/gpu/cuda_memory.h b/cpp/src/arrow/gpu/cuda_memory.h
index 0da58c170ff24..c8f80837cd9df 100644
--- a/cpp/src/arrow/gpu/cuda_memory.h
+++ b/cpp/src/arrow/gpu/cuda_memory.h
@@ -27,7 +27,7 @@
 #include "arrow/status.h"
 
 namespace arrow {
-namespace gpu {
+namespace cuda {
 
 class CudaContext;
 class CudaIpcMemHandle;
@@ -215,7 +215,7 @@ ARROW_EXPORT
 Status AllocateCudaHostBuffer(int device_number, const int64_t size,
                               std::shared_ptr<CudaHostBuffer>* out);
 
-}  // namespace gpu
+}  // namespace cuda
 }  // namespace arrow
 
 #endif  // ARROW_GPU_CUDA_MEMORY_H
diff --git a/cpp/src/arrow/io/buffered.h b/cpp/src/arrow/io/buffered.h
index e7302589dd650..e4374ba8079d3 100644
--- a/cpp/src/arrow/io/buffered.h
+++ b/cpp/src/arrow/io/buffered.h
@@ -100,10 +100,6 @@ class ARROW_EXPORT BufferedInputStream : public InputStream {
   static Status Create(std::shared_ptr<InputStream> raw, int64_t buffer_size,
                        MemoryPool* pool, std::shared_ptr<BufferedInputStream>* out);
 
-  /// \brief Return string_view to buffered bytes, up to the indicated
-  /// number. View becomes invalid after any operation on file
-  util::string_view Peek(int64_t nbytes) const;
-
   /// \brief Resize internal read buffer; calls to Read(...) will read at least
   /// \param[in] new_buffer_size the new read buffer size
   /// \return Status
@@ -124,6 +120,7 @@ class ARROW_EXPORT BufferedInputStream : public InputStream {
   std::shared_ptr<InputStream> raw() const;
 
   // InputStream APIs
+  util::string_view Peek(int64_t nbytes) const override;
   Status Close() override;
   bool closed() const override;
diff --git a/cpp/src/arrow/io/interfaces.cc b/cpp/src/arrow/io/interfaces.cc
index ccabd475997a7..94e8fe6f43f0d 100644
--- a/cpp/src/arrow/io/interfaces.cc
+++ b/cpp/src/arrow/io/interfaces.cc
@@ -22,6 +22,7 @@
 #include
 
 #include "arrow/status.h"
+#include "arrow/util/string_view.h"
 
 namespace arrow {
 namespace io {
@@ -33,6 +34,10 @@ Status InputStream::Advance(int64_t nbytes) {
   return Read(nbytes, &temp);
 }
 
+util::string_view InputStream::Peek(int64_t ARROW_ARG_UNUSED(nbytes)) const {
+  return util::string_view(nullptr, 0);
+}
+
 bool InputStream::supports_zero_copy() const { return false; }
 
 struct RandomAccessFile::RandomAccessFileImpl {
diff --git a/cpp/src/arrow/io/interfaces.h b/cpp/src/arrow/io/interfaces.h
index b6ba59bd247f9..7104affaed77c 100644
--- a/cpp/src/arrow/io/interfaces.h
+++ b/cpp/src/arrow/io/interfaces.h
@@ -24,6 +24,7 @@
 #include
 
 #include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -121,6 +122,13 @@ class ARROW_EXPORT InputStream : virtual public FileInterface, virtual public Readable {
   /// \return Status
   Status Advance(int64_t nbytes);
 
+  /// \brief Return string_view to any buffered bytes, up to the indicated
+  /// number. View becomes invalid after any operation on file. If the
+  /// InputStream is unbuffered, returns 0-length string_view
+  /// \param[in] nbytes the maximum number of bytes to see
+  /// \return arrow::util::string_view
+  virtual util::string_view Peek(int64_t nbytes) const;
+
   /// \brief Return true if InputStream is capable of zero copy Buffer reads
   virtual bool supports_zero_copy() const;
 
diff --git a/cpp/src/arrow/io/io-compressed-test.cc b/cpp/src/arrow/io/io-compressed-test.cc
index 4a3b32333eb4f..507302f384c0b 100644
--- a/cpp/src/arrow/io/io-compressed-test.cc
+++ b/cpp/src/arrow/io/io-compressed-test.cc
@@ -180,7 +180,7 @@ TEST_P(CompressedInputStreamTest, TruncatedData) {
 
 TEST_P(CompressedInputStreamTest, InvalidData) {
   auto codec = MakeCodec();
-  auto compressed_data = MakeRandomData(10000);
+  auto compressed_data = MakeRandomData(100);
   auto buffer_reader = std::make_shared<BufferReader>(Buffer::Wrap(compressed_data));
   std::shared_ptr<CompressedInputStream> stream;
diff --git a/cpp/src/arrow/io/io-file-test.cc b/cpp/src/arrow/io/io-file-test.cc
index afe2c60718b0e..6081005a8f6e1 100644
--- a/cpp/src/arrow/io/io-file-test.cc
+++ b/cpp/src/arrow/io/io-file-test.cc
@@ -345,6 +345,15 @@ TEST_F(TestReadableFile, FromFileDescriptor) {
   ASSERT_TRUE(FileIsClosed(fd));
 }
 
+TEST_F(TestReadableFile, Peek) {
+  MakeTestFile();
+  OpenFile();
+
+  // Cannot peek
+  auto view = file_->Peek(4);
+  ASSERT_EQ(0, view.size());
+}
+
 TEST_F(TestReadableFile, SeekTellSize) {
   MakeTestFile();
   OpenFile();
diff --git a/cpp/src/arrow/io/io-memory-test.cc b/cpp/src/arrow/io/io-memory-test.cc
index fa90c1f141bd3..ecd920b854c69 100644
--- a/cpp/src/arrow/io/io-memory-test.cc
+++ b/cpp/src/arrow/io/io-memory-test.cc
@@ -139,11 +139,29 @@ TEST(TestFixedSizeBufferWriter, Basics) {
   ASSERT_OK(writer.Close());
 }
 
+TEST(TestBufferReader, FromStrings) {
+  // ARROW-3291: construct BufferReader from std::string or
+  // arrow::util::string_view
+
+  std::string data = "data123456";
+  auto view = util::string_view(data);
+
+  BufferReader reader1(data);
+  BufferReader reader2(view);
+
+  std::shared_ptr<Buffer> piece;
+  ASSERT_OK(reader1.Read(4, &piece));
+  ASSERT_EQ(0, memcmp(piece->data(), data.data(), 4));
+
+  ASSERT_OK(reader2.Seek(2));
+  ASSERT_OK(reader2.Read(4, &piece));
+  ASSERT_EQ(0, memcmp(piece->data(), data.data() + 2, 4));
+}
+
 TEST(TestBufferReader, Seeking) {
   std::string data = "data123456";
 
-  auto buffer = std::make_shared<Buffer>(data);
-  BufferReader reader(buffer);
+  BufferReader reader(data);
 
   int64_t pos;
   ASSERT_OK(reader.Tell(&pos));
   ASSERT_EQ(pos, 0);
@@ -161,6 +179,21 @@ TEST(TestBufferReader, Seeking) {
   ASSERT_EQ(pos, 10);
 }
 
+TEST(TestBufferReader, Peek) {
+  std::string data = "data123456";
+
+  BufferReader reader(std::make_shared<Buffer>(data));
+
+  auto view = reader.Peek(4);
+
+  ASSERT_EQ(4, view.size());
+  ASSERT_EQ(data.substr(0, 4), view.to_string());
+
+  view = reader.Peek(20);
+  ASSERT_EQ(data.size(), view.size());
+  ASSERT_EQ(data, view.to_string());
+}
+
 TEST(TestBufferReader, RetainParentReference) {
   // ARROW-387
   std::string data = "data123456";
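// The Peek tests above pin down the new contract: in-memory readers expose a
// zero-copy view of the next bytes, clamped to what remains, while unbuffered
// files fall back to the base-class empty view. A short sketch of that
// contract, assuming only the BufferReader API added in this diff:

#include <string>

#include "arrow/io/memory.h"
#include "arrow/util/string_view.h"

void PeekContract() {
  std::string data = "data123456";
  arrow::io::BufferReader reader(data);  // zero-copy; `data` must outlive reader

  // Views of the buffered bytes; no copy is made, and both views are
  // invalidated by the next Read/Seek on the reader.
  arrow::util::string_view head = reader.Peek(4);   // "data"
  arrow::util::string_view all = reader.Peek(100);  // clamped to all 10 bytes
  (void)head;
  (void)all;
}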
diff --git a/cpp/src/arrow/io/io-readahead-test.cc b/cpp/src/arrow/io/io-readahead-test.cc
index 1e5d02abd2f03..b7f404f666983 100644
--- a/cpp/src/arrow/io/io-readahead-test.cc
+++ b/cpp/src/arrow/io/io-readahead-test.cc
@@ -59,9 +59,8 @@ static void busy_wait(double seconds, std::function<bool()> predicate) {
 
 std::shared_ptr<InputStream> DataReader(const std::string& data) {
   std::shared_ptr<Buffer> buffer;
-  ABORT_NOT_OK(AllocateBuffer(data.length(), &buffer));
-  memcpy(buffer->mutable_data(), data.data(), data.length());
-  return std::make_shared<BufferReader>(std::move(buffer));
+  ABORT_NOT_OK(Buffer::FromString(data, &buffer));
+  return std::make_shared<BufferReader>(buffer);
 }
 
 static int64_t WaitForPosition(const RandomAccessFile& file, int64_t expected,
diff --git a/cpp/src/arrow/io/memory.cc b/cpp/src/arrow/io/memory.cc
index 8a79f6bb94dfd..6afafbc25dc04 100644
--- a/cpp/src/arrow/io/memory.cc
+++ b/cpp/src/arrow/io/memory.cc
@@ -287,6 +287,12 @@ Status BufferReader::Tell(int64_t* position) const {
   return Status::OK();
 }
 
+util::string_view BufferReader::Peek(int64_t nbytes) const {
+  const int64_t bytes_available = std::min(nbytes, size_ - position_);
+  return util::string_view(reinterpret_cast<const char*>(data_) + position_,
+                           static_cast<size_t>(bytes_available));
+}
+
 bool BufferReader::supports_zero_copy() const { return true; }
 
 Status BufferReader::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read,
diff --git a/cpp/src/arrow/io/memory.h b/cpp/src/arrow/io/memory.h
index 7b29800762c8f..cf73def3decfd 100644
--- a/cpp/src/arrow/io/memory.h
+++ b/cpp/src/arrow/io/memory.h
@@ -25,6 +25,7 @@
 
 #include "arrow/io/interfaces.h"
 #include "arrow/memory_pool.h"
+#include "arrow/util/string_view.h"
 #include "arrow/util/visibility.h"
 
 namespace arrow {
@@ -133,6 +134,12 @@ class ARROW_EXPORT BufferReader : public RandomAccessFile {
   explicit BufferReader(const Buffer& buffer);
   BufferReader(const uint8_t* data, int64_t size);
 
+  /// \brief Instantiate from std::string or arrow::util::string_view. Does not
+  /// own data
+  explicit BufferReader(const util::string_view& data)
+      : BufferReader(reinterpret_cast<const uint8_t*>(data.data()),
+                     static_cast<int64_t>(data.size())) {}
+
   Status Close() override;
   bool closed() const override;
   Status Tell(int64_t* position) const override;
@@ -140,6 +147,10 @@ class ARROW_EXPORT BufferReader : public RandomAccessFile {
   // Zero copy read
   Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override;
 
+  util::string_view Peek(int64_t nbytes) const override;
+
+  bool supports_zero_copy() const override;
+
   Status ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read,
                 void* out) override;
   Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr<Buffer>* out) override;
@@ -147,8 +158,6 @@ class ARROW_EXPORT BufferReader : public RandomAccessFile {
   Status GetSize(int64_t* size) override;
   Status Seek(int64_t position) override;
 
-  bool supports_zero_copy() const override;
-
   std::shared_ptr<Buffer> buffer() const { return buffer_; }
 
  protected:
diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt
index 13ed9b9e58060..9c384c3e9901c 100644
--- a/cpp/src/arrow/ipc/CMakeLists.txt
+++ b/cpp/src/arrow/ipc/CMakeLists.txt
@@ -33,8 +33,7 @@ if (NOT ARROW_BOOST_HEADER_ONLY)
       set_target_properties(json-integration-test
         PROPERTIES LINK_FLAGS "-undefined dynamic_lookup")
     else()
-      target_link_libraries(json-integration-test
-        pthread)
+      target_link_libraries(json-integration-test PRIVATE pthread)
     endif()
   endif()
 endif()
diff --git a/cpp/src/arrow/ipc/feather-test.cc b/cpp/src/arrow/ipc/feather-test.cc
index d032710b0be7c..b0be28925cf23 100644
--- a/cpp/src/arrow/ipc/feather-test.cc
+++ b/cpp/src/arrow/ipc/feather-test.cc
@@ -289,7 +289,7 @@ class TestTableReader : public ::testing::Test {
 
     ASSERT_OK(stream_->Finish(&output_));
 
-    std::shared_ptr<io::BufferReader> buffer(new io::BufferReader(output_));
+    auto buffer = std::make_shared<io::BufferReader>(output_);
 
     ASSERT_OK(TableReader::Open(buffer, &reader_));
   }
@@ -364,7 +364,7 @@ class TestTableWriter : public ::testing::Test {
 
     ASSERT_OK(stream_->Finish(&output_));
 
-    std::shared_ptr<io::BufferReader> buffer(new io::BufferReader(output_));
+    auto buffer = std::make_shared<io::BufferReader>(output_);
 
     ASSERT_OK(TableReader::Open(buffer, &reader_));
   }
diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc
index 31a9d474fe98a..3a723badf37d7 100644
--- a/cpp/src/arrow/ipc/ipc-read-write-test.cc
+++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc
@@ -657,16 +657,7 @@ class TestStreamFormat : public ::testing::TestWithParam
     std::shared_ptr<RecordBatchReader> reader;
     RETURN_NOT_OK(RecordBatchStreamReader::Open(&buf_reader, &reader));
-
-    std::shared_ptr<RecordBatch> chunk;
-    while (true) {
-      RETURN_NOT_OK(reader->ReadNext(&chunk));
-      if (chunk == nullptr) {
-        break;
-      }
-      out_batches->emplace_back(chunk);
-    }
-    return Status::OK();
+    return reader->ReadAll(out_batches);
   }
 
 protected:
diff --git a/cpp/src/arrow/ipc/writer.cc b/cpp/src/arrow/ipc/writer.cc
index 8225cce7b8131..3d3355dfe17fd 100644
--- a/cpp/src/arrow/ipc/writer.cc
+++ b/cpp/src/arrow/ipc/writer.cc
@@ -949,6 +949,10 @@ class RecordBatchFileWriter::RecordBatchFileWriterImpl
   }
 
   Status Close() override {
+    // Write the schema if not already written
+    // User is responsible for closing the OutputStream
+    RETURN_NOT_OK(CheckStarted());
+
     // Write metadata
     RETURN_NOT_OK(UpdatePosition());
diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt
index ff63eb05675df..7f4603ae5dfaf 100644
--- a/cpp/src/arrow/python/CMakeLists.txt
+++ b/cpp/src/arrow/python/CMakeLists.txt
@@ -76,7 +76,7 @@ ADD_ARROW_LIB(arrow_python
 
 foreach(LIB_TARGET ${ARROW_PYTHON_LIBRARIES})
   target_compile_definitions(${LIB_TARGET}
-    PRIVATE ARROW_EXPORTING)
+    PRIVATE ARROW_PYTHON_EXPORTING)
 endforeach()
 
 if (ARROW_BUILD_STATIC AND MSVC)
@@ -112,6 +112,7 @@ install(FILES
   pyarrow.h
   serialize.h
   type_traits.h
+  visibility.h
   DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python")
 
 # pkg-config support
diff --git a/cpp/src/arrow/python/arrow_to_pandas.h b/cpp/src/arrow/python/arrow_to_pandas.h
index 138b010515bed..753bf4823566b 100644
--- a/cpp/src/arrow/python/arrow_to_pandas.h
+++ b/cpp/src/arrow/python/arrow_to_pandas.h
@@ -27,7 +27,7 @@
 #include
 #include
 
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 
@@ -57,16 +57,16 @@ struct PandasOptions {
         use_threads(false) {}
 };
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertArrayToPandas(PandasOptions options, const std::shared_ptr<Array>& arr,
                             PyObject* py_ref, PyObject** out);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertChunkedArrayToPandas(PandasOptions options,
                                    const std::shared_ptr<ChunkedArray>& col,
                                    PyObject* py_ref, PyObject** out);
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col,
                              PyObject* py_ref, PyObject** out);
 
@@ -76,7 +76,7 @@ Status ConvertColumnToPandas(PandasOptions options, const std::shared_ptr<Column>& col,
 
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>& table,
                             MemoryPool* pool, PyObject** out);
 
@@ -84,7 +84,7 @@ Status ConvertTableToPandas(PandasOptions options, const std::shared_ptr<Table>&
 ///
 /// Explicitly name columns that should be a categorical
 /// This option is only used on conversions that are applied to a table.
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 Status ConvertTableToPandas(PandasOptions options,
                             const std::unordered_set<std::string>& categorical_columns,
                             const std::shared_ptr<Table>& table, MemoryPool* pool,
diff --git a/cpp/src/arrow/python/benchmark.h b/cpp/src/arrow/python/benchmark.h
index f88b6b432bf79..caaff32b365dd 100644
--- a/cpp/src/arrow/python/benchmark.h
+++ b/cpp/src/arrow/python/benchmark.h
@@ -20,7 +20,7 @@
 
 #include "arrow/python/platform.h"
 
-#include "arrow/util/visibility.h"
+#include "arrow/python/visibility.h"
 
 namespace arrow {
 namespace py {
@@ -29,7 +29,7 @@ namespace benchmark {
 // Micro-benchmark routines for use from ASV
 
 // Run PandasObjectIsNull() once over every object in *list*
-ARROW_EXPORT
+ARROW_PYTHON_EXPORT
 void Benchmark_PandasObjectIsNull(PyObject* list);
 
 }  // namespace benchmark
diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h
index 5779ef09767fe..6587bd328f3fb 100644
--- a/cpp/src/arrow/python/common.h
+++ b/cpp/src/arrow/python/common.h
@@ -26,8 +26,8 @@
 #include "arrow/python/config.h"
 
 #include "arrow/buffer.h"
+#include "arrow/python/visibility.h"
 #include "arrow/util/macros.h"
-#include "arrow/util/visibility.h"
 
 namespace arrow {
 
@@ -35,7 +35,7 @@ class MemoryPool;
 
 namespace py {
 
-ARROW_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
+ARROW_PYTHON_EXPORT Status ConvertPyError(StatusCode code = StatusCode::UnknownError);
 
 // Catch a pending Python exception and return the corresponding Status.
 // If no exception is pending, Status::OK() is returned.
@@ -47,14 +47,14 @@ inline Status CheckPyError(StatusCode code = StatusCode::UnknownError) {
   }
 }
 
-ARROW_EXPORT Status PassPyError();
+ARROW_PYTHON_EXPORT Status PassPyError();
 
 // TODO(wesm): We can just let errors pass through. To be explored later
 #define RETURN_IF_PYERROR() ARROW_RETURN_NOT_OK(CheckPyError());
 
 #define PY_RETURN_IF_ERROR(CODE) ARROW_RETURN_NOT_OK(CheckPyError(CODE));
 
-class ARROW_EXPORT PyAcquireGIL {
+class ARROW_PYTHON_EXPORT PyAcquireGIL {
  public:
   PyAcquireGIL() : acquired_gil_(false) { acquire(); }
 
@@ -85,7 +85,7 @@ class ARROW_EXPORT PyAcquireGIL {
 
 // A RAII primitive that DECREFs the underlying PyObject* when it
 // goes out of scope.
-class ARROW_EXPORT OwnedRef {
+class ARROW_PYTHON_EXPORT OwnedRef {
  public:
   OwnedRef() : obj_(NULLPTR) {}
   OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {}
@@ -126,7 +126,7 @@ class ARROW_EXPORT OwnedRef {
 // Same as OwnedRef, but ensures the GIL is taken when it goes out of scope.
 // This is for situations where the GIL is not always known to be held
 // (e.g. if it is released in the middle of a function for performance reasons)
-class ARROW_EXPORT OwnedRefNoGIL : public OwnedRef {
+class ARROW_PYTHON_EXPORT OwnedRefNoGIL : public OwnedRef {
 public:
   OwnedRefNoGIL() : OwnedRef() {}
   OwnedRefNoGIL(OwnedRefNoGIL&& other) : OwnedRef(other.detach()) {}
@@ -226,10 +226,10 @@ struct PyBytesView {
 };
 
 // Return the common PyArrow memory pool
-ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool);
-ARROW_EXPORT MemoryPool* get_memory_pool();
+ARROW_PYTHON_EXPORT void set_default_memory_pool(MemoryPool* pool);
+ARROW_PYTHON_EXPORT MemoryPool* get_memory_pool();
 
-class ARROW_EXPORT PyBuffer : public Buffer {
+class ARROW_PYTHON_EXPORT PyBuffer : public Buffer {
 public:
   /// While memoryview objects support multi-dimensional buffers, PyBuffer only supports
   /// one-dimensional byte buffers.
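// The wholesale ARROW_EXPORT -> ARROW_PYTHON_EXPORT swap above gives
// libarrow_python its own export macro, driven by the ARROW_PYTHON_EXPORTING
// define added to its CMake target earlier in this diff. The new
// arrow/python/visibility.h itself is not shown here; the following is a
// plausible sketch of what such a header defines -- the usual per-library
// export/import dance -- and its exact contents are an assumption:

// Hypothetical arrow/python/visibility.h (not part of this diff).
// ARROW_PYTHON_EXPORTING is defined only while compiling libarrow_python
// itself, so dependents see dllimport on Windows.
#pragma once

#if defined(_WIN32) || defined(__CYGWIN__)
#if defined(ARROW_PYTHON_EXPORTING)
#define ARROW_PYTHON_EXPORT __declspec(dllexport)
#else
#define ARROW_PYTHON_EXPORT __declspec(dllimport)
#endif
#else  // Not Windows
#define ARROW_PYTHON_EXPORT __attribute__((visibility("default")))
#endif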
diff --git a/cpp/src/arrow/python/config.h b/cpp/src/arrow/python/config.h index c2b089d382c00..5649ffe55c2ec 100644 --- a/cpp/src/arrow/python/config.h +++ b/cpp/src/arrow/python/config.h @@ -21,7 +21,7 @@ #include "arrow/python/platform.h" #include "arrow/python/numpy_interop.h" -#include "arrow/util/visibility.h" +#include "arrow/python/visibility.h" #if PY_MAJOR_VERSION >= 3 #define PyString_Check PyUnicode_Check @@ -30,10 +30,10 @@ namespace arrow { namespace py { -ARROW_EXPORT +ARROW_PYTHON_EXPORT extern PyObject* numpy_nan; -ARROW_EXPORT +ARROW_PYTHON_EXPORT void set_numpy_nan(PyObject* obj); } // namespace py diff --git a/cpp/src/arrow/python/decimal.h b/cpp/src/arrow/python/decimal.h index dd382d14e063e..80727954e0b65 100644 --- a/cpp/src/arrow/python/decimal.h +++ b/cpp/src/arrow/python/decimal.h @@ -20,8 +20,8 @@ #include +#include "arrow/python/visibility.h" #include "arrow/type.h" -#include "arrow/util/visibility.h" namespace arrow { @@ -38,21 +38,21 @@ class OwnedRef; namespace internal { // \brief Import the Python Decimal type -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status ImportDecimalType(OwnedRef* decimal_type); // \brief Convert a Python Decimal object to a C++ string // \param[in] python_decimal A Python decimal.Decimal instance // \param[out] The string representation of the Python Decimal instance // \return The status of the operation -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status PythonDecimalToString(PyObject* python_decimal, std::string* out); // \brief Convert a C++ std::string to a Python Decimal instance // \param[in] decimal_constructor The decimal type object // \param[in] decimal_string A decimal string // \return An instance of decimal.Decimal -ARROW_EXPORT +ARROW_PYTHON_EXPORT PyObject* DecimalFromString(PyObject* decimal_constructor, const std::string& decimal_string); @@ -61,21 +61,21 @@ PyObject* DecimalFromString(PyObject* decimal_constructor, // \param[in] arrow_type An instance of arrow::DecimalType // \param[out] out A pointer to a Decimal128 // \return The status of the operation -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status DecimalFromPythonDecimal(PyObject* python_decimal, const DecimalType& arrow_type, Decimal128* out); // \brief Check whether obj is an instance of Decimal -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool PyDecimal_Check(PyObject* obj); // \brief Check whether obj is nan. 
This function will abort the program if the argument // is not a Decimal instance -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool PyDecimal_ISNAN(PyObject* obj); // \brief Helper class to track and update the precision and scale of a decimal -class ARROW_EXPORT DecimalMetadata { +class ARROW_PYTHON_EXPORT DecimalMetadata { public: DecimalMetadata(); DecimalMetadata(int32_t precision, int32_t scale); diff --git a/cpp/src/arrow/python/deserialize.h b/cpp/src/arrow/python/deserialize.h index 754765a6825fd..b9c4984a3b0e4 100644 --- a/cpp/src/arrow/python/deserialize.h +++ b/cpp/src/arrow/python/deserialize.h @@ -23,8 +23,8 @@ #include #include "arrow/python/serialize.h" +#include "arrow/python/visibility.h" #include "arrow/status.h" -#include "arrow/util/visibility.h" namespace arrow { @@ -43,7 +43,7 @@ namespace py { /// \param[in] src a RandomAccessFile /// \param[out] out the reconstructed data /// \return Status -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); /// \brief Reconstruct SerializedPyObject from representation produced by @@ -56,7 +56,7 @@ Status ReadSerializedObject(io::RandomAccessFile* src, SerializedPyObject* out); /// num_tensors * 2 + num_buffers in length /// \param[out] out the reconstructed object /// \return Status -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_buffers, PyObject* data, SerializedPyObject* out); @@ -72,7 +72,7 @@ Status GetSerializedFromComponents(int num_tensors, int num_ndarrays, int num_bu /// \param[out] out The returned object /// \return Status /// This acquires the GIL -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status DeserializeObject(PyObject* context, const SerializedPyObject& object, PyObject* base, PyObject** out); @@ -80,10 +80,10 @@ Status DeserializeObject(PyObject* context, const SerializedPyObject& object, /// \param[in] object Object to deserialize /// \param[out] out The deserialized tensor /// \return Status -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status DeserializeNdarray(const SerializedPyObject& object, std::shared_ptr* out); -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status NdarrayFromBuffer(std::shared_ptr src, std::shared_ptr* out); } // namespace py diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h index 4a7c8f12c15eb..2d44feea5ac81 100644 --- a/cpp/src/arrow/python/helpers.h +++ b/cpp/src/arrow/python/helpers.h @@ -27,9 +27,9 @@ #include +#include "arrow/python/visibility.h" #include "arrow/type.h" #include "arrow/util/macros.h" -#include "arrow/util/visibility.h" namespace arrow { @@ -40,20 +40,20 @@ class OwnedRef; // \brief Get an arrow DataType instance from Arrow's Type::type enum // \param[in] type One of the values of Arrow's Type::type enum // \return A shared pointer to DataType -ARROW_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); +ARROW_PYTHON_EXPORT std::shared_ptr GetPrimitiveType(Type::type type); // \brief Construct a np.float16 object from a npy_half value. -ARROW_EXPORT PyObject* PyHalf_FromHalf(npy_half value); +ARROW_PYTHON_EXPORT PyObject* PyHalf_FromHalf(npy_half value); // \brief Convert a Python object to a npy_half value. 
-ARROW_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out); +ARROW_PYTHON_EXPORT Status PyFloat_AsHalf(PyObject* obj, npy_half* out); namespace internal { // \brief Import a Python module // \param[in] module_name The name of the module // \param[out] ref The OwnedRef containing the module PyObject* -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status ImportModule(const std::string& module_name, OwnedRef* ref); // \brief Import an object from a Python module @@ -61,7 +61,7 @@ Status ImportModule(const std::string& module_name, OwnedRef* ref); // \param[in] name The name of the object to import // \param[out] ref The OwnedRef containing the \c name attribute of the Python module \c // module -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status ImportFromModule(const OwnedRef& module, const std::string& name, OwnedRef* ref); // \brief Check whether obj is an integer, independent of Python versions. @@ -74,11 +74,11 @@ inline bool IsPyInteger(PyObject* obj) { } // \brief Use pandas missing value semantics to check if a value is null -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool PandasObjectIsNull(PyObject* obj); // \brief Check whether obj is a floating-point NaN -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool PyFloat_IsNaN(PyObject* obj); inline bool IsPyBinary(PyObject* obj) { @@ -93,19 +93,19 @@ template Status CIntFromPython(PyObject* obj, Int* out, const std::string& overflow_message = ""); // \brief Convert a Python unicode string to a std::string -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status PyUnicode_AsStdString(PyObject* obj, std::string* out); // \brief Convert a Python bytes object to a std::string -ARROW_EXPORT +ARROW_PYTHON_EXPORT std::string PyBytes_AsStdString(PyObject* obj); // \brief Call str() on the given object and return the result as a std::string -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status PyObject_StdStringStr(PyObject* obj, std::string* out); // \brief Return the repr() of the given object (always succeeds) -ARROW_EXPORT +ARROW_PYTHON_EXPORT std::string PyObject_StdStringRepr(PyObject* obj); // \brief Cast the given size to int32_t, with error checking @@ -121,12 +121,12 @@ inline Status CastSize(Py_ssize_t size, int32_t* out, // \brief Print the Python object's __str__ form along with the passed error // message -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status InvalidValue(PyObject* obj, const std::string& why); -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status IntegerScalarToDoubleSafe(PyObject* obj, double* result); -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status IntegerScalarToFloat32Safe(PyObject* obj, float* result); } // namespace internal diff --git a/cpp/src/arrow/python/inference.cc b/cpp/src/arrow/python/inference.cc index e619a64eb8aae..0f1d85ead2a16 100644 --- a/cpp/src/arrow/python/inference.cc +++ b/cpp/src/arrow/python/inference.cc @@ -583,13 +583,13 @@ Status InferArrowTypeAndSize(PyObject* obj, int64_t* size, return Status::OK(); } -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool IsPyBool(PyObject* obj) { return internal::PyBoolScalar_Check(obj); } -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool IsPyInt(PyObject* obj) { return internal::PyIntScalar_Check(obj); } -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool IsPyFloat(PyObject* obj) { return internal::PyFloatScalar_Check(obj); } } // namespace py diff --git a/cpp/src/arrow/python/inference.h b/cpp/src/arrow/python/inference.h index 2cffa17ac2dc8..f2e2305e34441 100644 --- a/cpp/src/arrow/python/inference.h +++ b/cpp/src/arrow/python/inference.h @@ -27,9 +27,9 @@ #include #include +#include "arrow/python/visibility.h" #include "arrow/type.h" #include "arrow/util/macros.h" 
-#include "arrow/util/visibility.h" #include "arrow/python/common.h" @@ -41,23 +41,23 @@ class Status; namespace py { // These three functions take a sequence input, not arbitrary iterables -ARROW_EXPORT +ARROW_PYTHON_EXPORT arrow::Status InferArrowType(PyObject* obj, std::shared_ptr* out_type); -ARROW_EXPORT +ARROW_PYTHON_EXPORT arrow::Status InferArrowTypeAndSize(PyObject* obj, int64_t* size, std::shared_ptr* out_type); /// Checks whether the passed Python object is a boolean scalar -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool IsPyBool(PyObject* obj); /// Checks whether the passed Python object is an integer scalar -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool IsPyInt(PyObject* obj); /// Checks whether the passed Python object is a float scalar -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool IsPyFloat(PyObject* obj); } // namespace py diff --git a/cpp/src/arrow/python/init.h b/cpp/src/arrow/python/init.h index 1daa5a3d2624d..34d19b21fdf31 100644 --- a/cpp/src/arrow/python/init.h +++ b/cpp/src/arrow/python/init.h @@ -19,10 +19,10 @@ #define ARROW_PYTHON_INIT_H #include "arrow/python/platform.h" -#include "arrow/util/visibility.h" +#include "arrow/python/visibility.h" extern "C" { -ARROW_EXPORT +ARROW_PYTHON_EXPORT int arrow_init_numpy(); } diff --git a/cpp/src/arrow/python/io.h b/cpp/src/arrow/python/io.h index 73d96f5f40fd8..d3b7c999eb8bb 100644 --- a/cpp/src/arrow/python/io.h +++ b/cpp/src/arrow/python/io.h @@ -22,7 +22,7 @@ #include "arrow/io/interfaces.h" #include "arrow/io/memory.h" -#include "arrow/util/visibility.h" +#include "arrow/python/visibility.h" #include "arrow/python/config.h" @@ -36,7 +36,7 @@ namespace py { class ARROW_NO_EXPORT PythonFile; -class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile { +class ARROW_PYTHON_EXPORT PyReadableFile : public io::RandomAccessFile { public: explicit PyReadableFile(PyObject* file); ~PyReadableFile() override; @@ -64,7 +64,7 @@ class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile { std::unique_ptr file_; }; -class ARROW_EXPORT PyOutputStream : public io::OutputStream { +class ARROW_PYTHON_EXPORT PyOutputStream : public io::OutputStream { public: explicit PyOutputStream(PyObject* file); ~PyOutputStream() override; @@ -87,7 +87,7 @@ class ARROW_EXPORT PyOutputStream : public io::OutputStream { // Keeping the reference in a Python wrapper would be incorrect as // the Python wrapper can get destroyed even though the wrapped C++ // buffer is still alive (ARROW-2270). 
-class ARROW_EXPORT PyForeignBuffer : public Buffer { +class ARROW_PYTHON_EXPORT PyForeignBuffer : public Buffer { public: static Status Make(const uint8_t* data, int64_t size, PyObject* base, std::shared_ptr* out); diff --git a/cpp/src/arrow/python/numpy_convert.h b/cpp/src/arrow/python/numpy_convert.h index dfdb1acd1237b..dce5fe522d65b 100644 --- a/cpp/src/arrow/python/numpy_convert.h +++ b/cpp/src/arrow/python/numpy_convert.h @@ -27,7 +27,7 @@ #include #include "arrow/buffer.h" -#include "arrow/util/visibility.h" +#include "arrow/python/visibility.h" namespace arrow { @@ -38,7 +38,7 @@ class Tensor; namespace py { -class ARROW_EXPORT NumPyBuffer : public Buffer { +class ARROW_PYTHON_EXPORT NumPyBuffer : public Buffer { public: explicit NumPyBuffer(PyObject* arr); virtual ~NumPyBuffer(); @@ -48,25 +48,25 @@ class ARROW_EXPORT NumPyBuffer : public Buffer { }; // Handle misbehaved types like LONGLONG and ULONGLONG -ARROW_EXPORT +ARROW_PYTHON_EXPORT int cast_npy_type_compat(int type_num); -ARROW_EXPORT +ARROW_PYTHON_EXPORT bool is_contiguous(PyObject* array); -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status NumPyDtypeToArrow(PyObject* dtype, std::shared_ptr* out); -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status NumPyDtypeToArrow(PyArray_Descr* descr, std::shared_ptr* out); Status GetTensorType(PyObject* dtype, std::shared_ptr* out); Status GetNumPyType(const DataType& type, int* type_num); -ARROW_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, - std::shared_ptr* out); +ARROW_PYTHON_EXPORT Status NdarrayToTensor(MemoryPool* pool, PyObject* ao, + std::shared_ptr* out); -ARROW_EXPORT Status TensorToNdarray(const std::shared_ptr& tensor, PyObject* base, - PyObject** out); +ARROW_PYTHON_EXPORT Status TensorToNdarray(const std::shared_ptr& tensor, + PyObject* base, PyObject** out); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index 37141d7642b6f..f9a5ea1b0d67e 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -41,6 +41,8 @@ #include "arrow/util/checked_cast.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/string.h" +#include "arrow/util/utf8.h" #include "arrow/visitor_inline.h" #include "arrow/compute/context.h" @@ -634,30 +636,48 @@ Status AppendUTF32(const char* data, int itemsize, int byteorder, } // namespace Status NumPyConverter::Visit(const StringType& type) { + util::InitializeUTF8(); + StringBuilder builder(pool_); - auto data = reinterpret_cast(PyArray_DATA(arr_)); + auto data = reinterpret_cast(PyArray_DATA(arr_)); - char numpy_byteorder = PyArray_DESCR(arr_)->byteorder; + char numpy_byteorder = dtype_->byteorder; // For Python C API, -1 is little-endian, 1 is big-endian int byteorder = numpy_byteorder == '>' ? 
1 : -1; PyAcquireGIL gil_lock; + const bool is_binary_type = dtype_->type_num == NPY_STRING; + + auto AppendNonNullValue = [&](const uint8_t* data) { + if (is_binary_type) { + if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { + return builder.Append(data, itemsize_); + } else { + std::stringstream ss; + ss << "Encountered non-UTF8 binary value: " << HexEncode(data, itemsize_); + return Status::Invalid(ss.str()); + } + } else { + return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, + &builder); + } + }; if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder.AppendNull()); } else { - RETURN_NOT_OK(AppendUTF32(data, itemsize_, byteorder, &builder)); + RETURN_NOT_OK(AppendNonNullValue(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { - RETURN_NOT_OK(AppendUTF32(data, itemsize_, byteorder, &builder)); + RETURN_NOT_OK(AppendNonNullValue(data)); data += stride_; } } diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index 5e1c088264a46..4edc7669bb82e 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -25,7 +25,7 @@ #include #include "arrow/compute/kernels/cast.h" -#include "arrow/util/visibility.h" +#include "arrow/python/visibility.h" namespace arrow { @@ -48,7 +48,7 @@ namespace py { /// \param[in] type a specific type to cast to, may be null /// \param[in] cast_options casting options /// \param[out] out a ChunkedArray, to accommodate chunked output -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, const compute::CastOptions& cast_options, @@ -64,7 +64,7 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa /// whether values are null /// \param[in] type a specific type to cast to, may be null /// \param[out] out a ChunkedArray, to accommodate chunked output -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, std::shared_ptr* out); diff --git a/cpp/src/arrow/python/pyarrow.h b/cpp/src/arrow/python/pyarrow.h index e637627006177..a5a3910847977 100644 --- a/cpp/src/arrow/python/pyarrow.h +++ b/cpp/src/arrow/python/pyarrow.h @@ -22,7 +22,7 @@ #include -#include "arrow/util/visibility.h" +#include "arrow/python/visibility.h" namespace arrow { @@ -39,44 +39,46 @@ class Tensor; namespace py { -ARROW_EXPORT int import_pyarrow(); +ARROW_PYTHON_EXPORT int import_pyarrow(); -ARROW_EXPORT bool is_buffer(PyObject* buffer); -ARROW_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr* out); -ARROW_EXPORT PyObject* wrap_buffer(const std::shared_ptr& buffer); +ARROW_PYTHON_EXPORT bool is_buffer(PyObject* buffer); +ARROW_PYTHON_EXPORT Status unwrap_buffer(PyObject* buffer, std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_buffer(const std::shared_ptr& buffer); -ARROW_EXPORT bool is_data_type(PyObject* data_type); -ARROW_EXPORT Status unwrap_data_type(PyObject* data_type, std::shared_ptr* out); -ARROW_EXPORT PyObject* wrap_data_type(const std::shared_ptr& type); +ARROW_PYTHON_EXPORT bool is_data_type(PyObject* data_type); +ARROW_PYTHON_EXPORT Status unwrap_data_type(PyObject* data_type, + std::shared_ptr* out); +ARROW_PYTHON_EXPORT PyObject* wrap_data_type(const std::shared_ptr& type); -ARROW_EXPORT bool is_field(PyObject* field); -ARROW_EXPORT 
Status unwrap_field(PyObject* field, std::shared_ptr<Field>* out); -ARROW_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field); +ARROW_PYTHON_EXPORT bool is_field(PyObject* field); +ARROW_PYTHON_EXPORT Status unwrap_field(PyObject* field, std::shared_ptr<Field>* out); +ARROW_PYTHON_EXPORT PyObject* wrap_field(const std::shared_ptr<Field>& field); -ARROW_EXPORT bool is_schema(PyObject* schema); -ARROW_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr<Schema>* out); -ARROW_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema); +ARROW_PYTHON_EXPORT bool is_schema(PyObject* schema); +ARROW_PYTHON_EXPORT Status unwrap_schema(PyObject* schema, std::shared_ptr<Schema>* out); +ARROW_PYTHON_EXPORT PyObject* wrap_schema(const std::shared_ptr<Schema>& schema); -ARROW_EXPORT bool is_array(PyObject* array); -ARROW_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr<Array>* out); -ARROW_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array); +ARROW_PYTHON_EXPORT bool is_array(PyObject* array); +ARROW_PYTHON_EXPORT Status unwrap_array(PyObject* array, std::shared_ptr<Array>* out); +ARROW_PYTHON_EXPORT PyObject* wrap_array(const std::shared_ptr<Array>& array); -ARROW_EXPORT bool is_tensor(PyObject* tensor); -ARROW_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out); -ARROW_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor); +ARROW_PYTHON_EXPORT bool is_tensor(PyObject* tensor); +ARROW_PYTHON_EXPORT Status unwrap_tensor(PyObject* tensor, std::shared_ptr<Tensor>* out); +ARROW_PYTHON_EXPORT PyObject* wrap_tensor(const std::shared_ptr<Tensor>& tensor); -ARROW_EXPORT bool is_column(PyObject* column); -ARROW_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out); -ARROW_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column); +ARROW_PYTHON_EXPORT bool is_column(PyObject* column); +ARROW_PYTHON_EXPORT Status unwrap_column(PyObject* column, std::shared_ptr<Column>* out); +ARROW_PYTHON_EXPORT PyObject* wrap_column(const std::shared_ptr<Column>& column); -ARROW_EXPORT bool is_table(PyObject* table); -ARROW_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>
* out); -ARROW_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>
& table); +ARROW_PYTHON_EXPORT bool is_table(PyObject* table); +ARROW_PYTHON_EXPORT Status unwrap_table(PyObject* table, std::shared_ptr<Table>
* out); +ARROW_PYTHON_EXPORT PyObject* wrap_table(const std::shared_ptr<Table>
& table); -ARROW_EXPORT bool is_record_batch(PyObject* batch); -ARROW_EXPORT Status unwrap_record_batch(PyObject* batch, - std::shared_ptr<RecordBatch>* out); -ARROW_EXPORT PyObject* wrap_record_batch(const std::shared_ptr<RecordBatch>& batch); +ARROW_PYTHON_EXPORT bool is_record_batch(PyObject* batch); +ARROW_PYTHON_EXPORT Status unwrap_record_batch(PyObject* batch, + std::shared_ptr<RecordBatch>* out); +ARROW_PYTHON_EXPORT PyObject* wrap_record_batch( + const std::shared_ptr<RecordBatch>& batch); } // namespace py } // namespace arrow diff --git a/cpp/src/arrow/python/python_to_arrow.h b/cpp/src/arrow/python/python_to_arrow.h index d133089f97f51..f9d97569ef47a 100644 --- a/cpp/src/arrow/python/python_to_arrow.h +++ b/cpp/src/arrow/python/python_to_arrow.h @@ -26,9 +26,9 @@ #include #include +#include "arrow/python/visibility.h" #include "arrow/type.h" #include "arrow/util/macros.h" -#include "arrow/util/visibility.h" #include "arrow/python/common.h" @@ -68,12 +68,12 @@ struct PyConversionOptions { /// \param[in] options various conversion options /// \param[out] out a ChunkedArray containing one or more chunks /// \return Status -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status ConvertPySequence(PyObject* obj, PyObject* mask, const PyConversionOptions& options, std::shared_ptr<ChunkedArray>* out); -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status ConvertPySequence(PyObject* obj, const PyConversionOptions& options, std::shared_ptr<ChunkedArray>* out); diff --git a/cpp/src/arrow/python/serialize.h b/cpp/src/arrow/python/serialize.h index 2759d0c9f1fb5..9a9cc65087d55 100644 --- a/cpp/src/arrow/python/serialize.h +++ b/cpp/src/arrow/python/serialize.h @@ -21,8 +21,8 @@ #include #include +#include "arrow/python/visibility.h" #include "arrow/status.h" -#include "arrow/util/visibility.h" // Forward declaring PyObject, see // https://mail.python.org/pipermail/python-dev/2003-August/037601.html @@ -47,7 +47,7 @@ class OutputStream; namespace py { -struct ARROW_EXPORT SerializedPyObject { +struct ARROW_PYTHON_EXPORT SerializedPyObject { std::shared_ptr<RecordBatch> batch; std::vector<std::shared_ptr<Tensor>> tensors; std::vector<std::shared_ptr<Tensor>> ndarrays; @@ -86,14 +86,14 @@ struct ARROW_EXPORT SerializedPyObject { /// \return Status /// /// Release GIL before calling -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status SerializeObject(PyObject* context, PyObject* sequence, SerializedPyObject* out); /// \brief Serialize an Arrow Tensor as a SerializedPyObject. /// \param[in] tensor Tensor to be serialized /// \param[out] out The serialized representation /// \return Status -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* out); /// \brief Write the Tensor metadata header to an OutputStream. @@ -102,7 +102,7 @@ Status SerializeTensor(std::shared_ptr<Tensor> tensor, py::SerializedPyObject* o /// \param[in] tensor_num_bytes The length of the Tensor data in bytes /// \param[in] dst The OutputStream to write the Tensor header to /// \return Status -ARROW_EXPORT +ARROW_PYTHON_EXPORT Status WriteNdarrayHeader(std::shared_ptr<DataType> dtype, const std::vector<int64_t>& shape, int64_t tensor_num_bytes, io::OutputStream* dst); diff --git a/cpp/src/arrow/python/visibility.h b/cpp/src/arrow/python/visibility.h new file mode 100644 index 0000000000000..c0b343c70e976 --- /dev/null +++ b/cpp/src/arrow/python/visibility.h @@ -0,0 +1,39 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#if defined(_WIN32) || defined(__CYGWIN__) // Windows +#if defined(_MSC_VER) +#pragma warning(disable : 4251) +#else +#pragma GCC diagnostic ignored "-Wattributes" +#endif + +#ifdef ARROW_STATIC +#define ARROW_PYTHON_EXPORT +#elif defined(ARROW_PYTHON_EXPORTING) +#define ARROW_PYTHON_EXPORT __declspec(dllexport) +#else +#define ARROW_PYTHON_EXPORT __declspec(dllimport) +#endif + +#else // Not Windows +#ifndef ARROW_PYTHON_EXPORT +#define ARROW_PYTHON_EXPORT __attribute__((visibility("default"))) +#endif +#endif // Non-Windows diff --git a/cpp/src/arrow/record_batch.cc b/cpp/src/arrow/record_batch.cc index f295b864c0066..33287c19ffdde 100644 --- a/cpp/src/arrow/record_batch.cc +++ b/cpp/src/arrow/record_batch.cc @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/status.h" +#include "arrow/table.h" #include "arrow/type.h" #include "arrow/util/logging.h" #include "arrow/util/stl.h" @@ -249,4 +250,22 @@ Status RecordBatch::Validate() const { RecordBatchReader::~RecordBatchReader() {} +Status RecordBatchReader::ReadAll(std::vector>* batches) { + while (true) { + std::shared_ptr batch; + RETURN_NOT_OK(ReadNext(&batch)); + if (!batch) { + break; + } + batches->emplace_back(std::move(batch)); + } + return Status::OK(); +} + +Status RecordBatchReader::ReadAll(std::shared_ptr
<Table>* table) { + std::vector<std::shared_ptr<RecordBatch>> batches; + RETURN_NOT_OK(ReadAll(&batches)); + return Table::FromRecordBatches(schema(), batches, table); + } + } // namespace arrow diff --git a/cpp/src/arrow/record_batch.h index f6538f9c40578..674b68b40fa6e 100644 --- a/cpp/src/arrow/record_batch.h +++ b/cpp/src/arrow/record_batch.h @@ -170,12 +170,18 @@ class ARROW_EXPORT RecordBatchReader { /// \return the shared schema of the record batches in the stream virtual std::shared_ptr<Schema> schema() const = 0; - /// Read the next record batch in the stream. Return null for batch when - /// reaching end of stream + /// \brief Read the next record batch in the stream. Return null for batch + /// when reaching end of stream /// /// \param[out] batch the next loaded batch, null at end of stream /// \return Status virtual Status ReadNext(std::shared_ptr<RecordBatch>* batch) = 0; + + /// \brief Consume entire stream as a vector of record batches + Status ReadAll(std::vector<std::shared_ptr<RecordBatch>>* batches); + + /// \brief Read all batches and concatenate as arrow::Table + Status ReadAll(std::shared_ptr<Table>
* table); }; } // namespace arrow diff --git a/cpp/src/arrow/test-util.cc b/cpp/src/arrow/test-util.cc index 84c76ee1aee84..7fb96cda7af73 100644 --- a/cpp/src/arrow/test-util.cc +++ b/cpp/src/arrow/test-util.cc @@ -60,7 +60,7 @@ namespace arrow { void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { const int random_seed = 0; - std::mt19937 gen(random_seed); + std::default_random_engine gen(random_seed); std::uniform_real_distribution d(0.0, 1.0); std::generate(null_bytes, null_bytes + n, [&d, &gen, &pct_null] { return d(gen) > pct_null; }); @@ -68,7 +68,7 @@ void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { void random_is_valid(int64_t n, double pct_null, std::vector* is_valid) { const int random_seed = 0; - std::mt19937 gen(random_seed); + std::default_random_engine gen(random_seed); std::uniform_real_distribution d(0.0, 1.0); is_valid->resize(n, false); std::generate(is_valid->begin(), is_valid->end(), @@ -76,7 +76,7 @@ void random_is_valid(int64_t n, double pct_null, std::vector* is_valid) { } void random_bytes(int64_t n, uint32_t seed, uint8_t* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution d(0, std::numeric_limits::max()); std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); } @@ -150,7 +150,7 @@ int32_t DecimalSize(int32_t precision) { } void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution d(0, std::numeric_limits::max()); const int32_t required_bytes = DecimalSize(precision); constexpr int32_t byte_width = 16; diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 3011f287f096a..a01fd7d84a601 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. 
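The ReadAll additions above give RecordBatchReader a way to drain the rest of a stream in one call. A minimal usage sketch (the helper name StreamToTable is hypothetical; everything else is the interface declared above):

#include <memory>

#include "arrow/record_batch.h"
#include "arrow/status.h"
#include "arrow/table.h"

// Drain whatever remains in the stream and concatenate it into one Table.
// Passing a std::vector<std::shared_ptr<arrow::RecordBatch>>* instead selects
// the other overload and keeps the individual batches.
arrow::Status StreamToTable(arrow::RecordBatchReader* reader,
                            std::shared_ptr<arrow::Table>* out) {
  return reader->ReadAll(out);
}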
-#ifndef ARROW_TEST_UTIL_H_ -#define ARROW_TEST_UTIL_H_ +#pragma once #ifndef _WIN32 #include @@ -120,7 +119,7 @@ using ArrayVector = std::vector>; template void randint(int64_t N, T lower, T upper, std::vector* out) { const int random_seed = 0; - std::mt19937 gen(random_seed); + std::default_random_engine gen(random_seed); std::uniform_int_distribution d(lower, upper); out->resize(N, static_cast(0)); std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); @@ -129,7 +128,7 @@ void randint(int64_t N, T lower, T upper, std::vector* out) { template void random_real(int64_t n, uint32_t seed, T min_value, T max_value, std::vector* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_real_distribution d(min_value, max_value); out->resize(n, static_cast(0)); std::generate(out->begin(), out->end(), [&d, &gen] { return static_cast(d(gen)); }); @@ -221,7 +220,7 @@ void FinishAndCheckPadding(BuilderType* builder, std::shared_ptr* out) { template void rand_uniform_int(int64_t n, uint32_t seed, T min_value, T max_value, U* out) { DCHECK(out || (n == 0)); - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution d(min_value, max_value); std::generate(out, out + n, [&d, &gen] { return static_cast(d(gen)); }); } @@ -409,5 +408,3 @@ class BatchIterator : public RecordBatchReader { }; } // namespace arrow - -#endif // ARROW_TEST_UTIL_H_ diff --git a/cpp/src/arrow/util/bit-util-benchmark.cc b/cpp/src/arrow/util/bit-util-benchmark.cc index beb48df278acc..cc71078880156 100644 --- a/cpp/src/arrow/util/bit-util-benchmark.cc +++ b/cpp/src/arrow/util/bit-util-benchmark.cc @@ -39,11 +39,7 @@ class NaiveBitmapReader { NaiveBitmapReader(const uint8_t* bitmap, int64_t start_offset, int64_t length) : bitmap_(bitmap), position_(0) {} - bool IsSet() const { - const int64_t byte_offset = position_ / 8; - const int64_t bit_offset = position_ % 8; - return (bitmap_[byte_offset] & (1 << bit_offset)) == 0; - } + bool IsSet() const { return BitUtil::GetBit(bitmap_, position_); } bool IsNotSet() const { return !IsSet(); } @@ -51,7 +47,7 @@ class NaiveBitmapReader { private: const uint8_t* bitmap_; - int64_t position_; + uint64_t position_; }; // A naive bitmap writer implementation, meant as a baseline against @@ -100,7 +96,7 @@ static void BenchmarkBitmapReader(benchmark::State& state, int64_t nbytes) { const int64_t num_bits = nbytes * 8; const uint8_t* bitmap = buffer->data(); - while (state.KeepRunning()) { + for (auto _ : state) { { BitmapReaderType reader(bitmap, 0, num_bits); int64_t total = 0; @@ -240,11 +236,11 @@ BENCHMARK(BM_CopyBitmap) ->Unit(benchmark::kMicrosecond); BENCHMARK(BM_NaiveBitmapReader) - ->Args({100000}) - ->MinTime(1.0) + ->Args({1000000}) + ->MinTime(5.0) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BitmapReader)->Args({100000})->MinTime(1.0)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BitmapReader)->Args({1000000})->MinTime(5.0)->Unit(benchmark::kMicrosecond); BENCHMARK(BM_NaiveBitmapWriter) ->Args({100000}) diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index cd3d5b0c58ff8..93b6cb28d91b1 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -310,8 +310,8 @@ static constexpr uint8_t kPrecedingBitmask[] = {0, 1, 3, 7, 15, 31, 63, 127}; // the bitwise complement version of kPrecedingBitmask static constexpr uint8_t kTrailingBitmask[] = {255, 254, 252, 248, 240, 224, 192, 128}; -static inline bool GetBit(const uint8_t* bits, int64_t i) { - 
return (bits[i / 8] & kBitmask[i % 8]) != 0; +static inline bool GetBit(const uint8_t* bits, uint64_t i) { + return (bits[i >> 3] >> (i & 0x07)) & 1; } static inline void ClearBit(uint8_t* bits, int64_t i) { diff --git a/cpp/src/arrow/util/io-util.cc b/cpp/src/arrow/util/io-util.cc index 8db5db442841b..74ad80691da94 100644 --- a/cpp/src/arrow/util/io-util.cc +++ b/cpp/src/arrow/util/io-util.cc @@ -146,8 +146,8 @@ Status FileNameFromString(const std::string& file_name, PlatformFilename* out) { Status FileOpenReadable(const PlatformFilename& file_name, int* fd) { int ret, errno_actual; #if defined(_MSC_VER) - errno_actual = _wsopen_s(fd, file_name.wstring().c_str(), _O_RDONLY | _O_BINARY, - _SH_DENYNO, _S_IREAD); + errno_actual = _wsopen_s(fd, file_name.wstring().c_str(), + _O_RDONLY | _O_BINARY | _O_NOINHERIT, _SH_DENYNO, _S_IREAD); ret = *fd; #else ret = *fd = open(file_name.c_str(), O_RDONLY | O_BINARY); @@ -162,7 +162,7 @@ Status FileOpenWritable(const PlatformFilename& file_name, bool write_only, bool int ret, errno_actual; #if defined(_MSC_VER) - int oflag = _O_CREAT | _O_BINARY; + int oflag = _O_CREAT | _O_BINARY | _O_NOINHERIT; int pmode = _S_IWRITE; if (!write_only) { pmode |= _S_IREAD; diff --git a/cpp/src/arrow/util/number-parsing-benchmark.cc b/cpp/src/arrow/util/number-parsing-benchmark.cc index 28ef76abe7281..42c7b31ae6757 100644 --- a/cpp/src/arrow/util/number-parsing-benchmark.cc +++ b/cpp/src/arrow/util/number-parsing-benchmark.cc @@ -43,7 +43,7 @@ static std::vector MakeIntStrings(int32_t num_items) { for (int32_t i = 0; i < num_items; ++i) { strings.push_back(base_strings[i % base_strings.size()]); } - return base_strings; + return strings; } static std::vector MakeFloatStrings(int32_t num_items) { @@ -54,7 +54,18 @@ static std::vector MakeFloatStrings(int32_t num_items) { for (int32_t i = 0; i < num_items; ++i) { strings.push_back(base_strings[i % base_strings.size()]); } - return base_strings; + return strings; +} + +static std::vector MakeTimestampStrings(int32_t num_items) { + std::vector base_strings = {"2018-11-13 17:11:10", "2018-11-13 11:22:33", + "2016-02-29 11:22:33"}; + + std::vector strings; + for (int32_t i = 0; i < num_items; ++i) { + strings.push_back(base_strings[i % base_strings.size()]); + } + return strings; } template @@ -97,6 +108,29 @@ static void BM_FloatParsing(benchmark::State& state) { // NOLINT non-const refe state.SetItemsProcessed(state.iterations() * strings.size()); } +template +static void BM_TimestampParsing(benchmark::State& state) { // NOLINT non-const reference + using c_type = TimestampType::c_type; + + auto strings = MakeTimestampStrings(1000); + auto type = timestamp(UNIT); + StringConverter converter(type); + + while (state.KeepRunning()) { + c_type total = 0; + for (const auto& s : strings) { + c_type value; + if (!converter(s.data(), s.length(), &value)) { + std::cerr << "Conversion failed for '" << s << "'"; + std::abort(); + } + total += value; + } + benchmark::DoNotOptimize(total); + } + state.SetItemsProcessed(state.iterations() * strings.size()); +} + BENCHMARK_TEMPLATE(BM_IntegerParsing, Int8Type); BENCHMARK_TEMPLATE(BM_IntegerParsing, Int16Type); BENCHMARK_TEMPLATE(BM_IntegerParsing, Int32Type); @@ -109,5 +143,10 @@ BENCHMARK_TEMPLATE(BM_IntegerParsing, UInt64Type); BENCHMARK_TEMPLATE(BM_FloatParsing, FloatType); BENCHMARK_TEMPLATE(BM_FloatParsing, DoubleType); +BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::SECOND); +BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::MILLI); +BENCHMARK_TEMPLATE(BM_TimestampParsing, 
TimeUnit::MICRO); +BENCHMARK_TEMPLATE(BM_TimestampParsing, TimeUnit::NANO); + } // namespace internal } // namespace arrow diff --git a/cpp/src/arrow/util/parsing.h b/cpp/src/arrow/util/parsing.h index aa1f820257e79..23e0361235d3e 100644 --- a/cpp/src/arrow/util/parsing.h +++ b/cpp/src/arrow/util/parsing.h @@ -419,8 +419,9 @@ class StringConverter { *out = std::chrono::duration_cast(duration).count(); return true; } - // Unreachable + // Unreachable, but suppress compiler warning assert(0); + *out = 0; return true; } diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index b71313e019aab..bd497dcb92882 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -20,10 +20,11 @@ cmake_minimum_required(VERSION 3.11) project(gandiva) -include(GandivaBuildUtils) - find_package(LLVM) +# For "make gandiva" to build everything Gandiva-related +add_custom_target(gandiva) + # Set the path where the byte-code files will be installed. set(GANDIVA_BC_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_INCLUDEDIR}/gandiva) @@ -92,6 +93,8 @@ ADD_ARROW_LIB(gandiva SHARED_PRIVATE_LINK_LIBS ${GANDIVA_SHARED_PRIVATE_LINK_LIBS} STATIC_LINK_LIBS ${GANDIVA_STATIC_LINK_LIBS}) +add_dependencies(gandiva ${GANDIVA_LIBRARIES}) + # install for gandiva include(GNUInstallDirs) @@ -125,28 +128,62 @@ install( FILES "${CMAKE_CURRENT_BINARY_DIR}/gandiva.pc" DESTINATION "${CMAKE_INSTALL_LIBDIR}/pkgconfig/") +set(GANDIVA_STATIC_TEST_LINK_LIBS + gandiva_static + ${RE2_LIBRARY} + ${ARROW_TEST_LINK_LIBS}) +set(GANDIVA_SHARED_TEST_LINK_LIBS + gandiva_shared + ${RE2_LIBRARY} + ${ARROW_TEST_LINK_LIBS}) + +function(ADD_GANDIVA_TEST REL_TEST_NAME) + set(options USE_STATIC_LINKING) + set(one_value_args) + set(multi_value_args) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + + set(TEST_ARGUMENTS + ENABLED + PREFIX "gandiva" + LABELS "unittest;gandiva" + ${ARG_UNPARSED_ARGUMENTS}) + + # and uses less disk space, but in some cases we need to force static + # linking (see rationale below). + if (ARG_USE_STATIC_LINKING) + ADD_ARROW_TEST(${REL_TEST_NAME} + ${TEST_ARGUMENTS} + STATIC_LINK_LIBS ${GANDIVA_STATIC_TEST_LINK_LIBS}) + else() + ADD_ARROW_TEST(${REL_TEST_NAME} + ${TEST_ARGUMENTS} + STATIC_LINK_LIBS ${GANDIVA_SHARED_TEST_LINK_LIBS}) + endif() + + if(${REL_TEST_NAME} MATCHES "llvm" OR + ${REL_TEST_NAME} MATCHES "expression_registry") + # If the unit test has llvm in its name, include llvm. 
+ add_dependencies(gandiva-${REL_TEST_NAME} LLVM::LLVM_INTERFACE) + target_link_libraries(gandiva-${REL_TEST_NAME} PRIVATE LLVM::LLVM_INTERFACE) + endif() +endfunction() + if (ARROW_GANDIVA_BUILD_TESTS) - #args: label test-file src-files - add_gandiva_unit_test(bitmap_accumulator_test.cc bitmap_accumulator.cc) - add_gandiva_unit_test(engine_llvm_test.cc engine.cc llvm_types.cc configuration.cc - gdv_function_stubs.cc context_helper.cc to_date_holder.cc date_utils.cc - exported_funcs_registry.cc ${BC_FILE_PATH_CC}) - add_gandiva_unit_test(function_signature_test.cc function_signature.cc) - add_gandiva_unit_test(function_registry_test.cc function_registry.cc function_signature.cc) - add_gandiva_unit_test(llvm_types_test.cc llvm_types.cc) - add_gandiva_unit_test(llvm_generator_test.cc llvm_generator.cc regex_util.cc engine.cc - llvm_types.cc expr_decomposer.cc function_registry.cc annotator.cc - bitmap_accumulator.cc configuration.cc function_signature.cc like_holder.cc - to_date_holder.cc date_utils.cc regex_util.cc gdv_function_stubs.cc context_helper.cc - exported_funcs_registry.cc ${BC_FILE_PATH_CC}) - add_gandiva_unit_test(annotator_test.cc annotator.cc function_signature.cc) - add_gandiva_unit_test(tree_expr_test.cc tree_expr_builder.cc expr_decomposer.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc) - add_gandiva_unit_test(expr_decomposer_test.cc expr_decomposer.cc tree_expr_builder.cc annotator.cc function_registry.cc function_signature.cc like_holder.cc regex_util.cc to_date_holder.cc date_utils.cc) - add_gandiva_unit_test(expression_registry_test.cc llvm_types.cc expression_registry.cc function_signature.cc function_registry.cc) - add_gandiva_unit_test(selection_vector_test.cc selection_vector.cc) - add_gandiva_unit_test(lru_cache_test.cc) - add_gandiva_unit_test(to_date_holder_test.cc to_date_holder.cc date_utils.cc) - add_gandiva_unit_test(simple_arena_test.cc) + ADD_GANDIVA_TEST(bitmap_accumulator_test) + ADD_GANDIVA_TEST(engine_llvm_test) + ADD_GANDIVA_TEST(function_signature_test) + ADD_GANDIVA_TEST(function_registry_test) + ADD_GANDIVA_TEST(llvm_types_test) + ADD_GANDIVA_TEST(llvm_generator_test) + ADD_GANDIVA_TEST(annotator_test) + ADD_GANDIVA_TEST(tree_expr_test) + ADD_GANDIVA_TEST(expr_decomposer_test) + ADD_GANDIVA_TEST(expression_registry_test) + ADD_GANDIVA_TEST(selection_vector_test) + ADD_GANDIVA_TEST(lru_cache_test) + ADD_GANDIVA_TEST(to_date_holder_test) + ADD_GANDIVA_TEST(simple_arena_test) endif() if (ARROW_GANDIVA_JAVA) diff --git a/cpp/src/gandiva/jni/CMakeLists.txt b/cpp/src/gandiva/jni/CMakeLists.txt index 8684fe8723de3..9f7bc526dbf5b 100644 --- a/cpp/src/gandiva/jni/CMakeLists.txt +++ b/cpp/src/gandiva/jni/CMakeLists.txt @@ -61,6 +61,7 @@ set(GANDIVA_JNI_SOURCES config_builder.cc # cpp/src ADD_ARROW_LIB(gandiva_jni SOURCES ${GANDIVA_JNI_SOURCES} + OUTPUTS GANDIVA_JNI_LIBRARIES SHARED_PRIVATE_LINK_LIBS ${GANDIVA_LINK_LIBS} STATIC_LINK_LIBS ${GANDIVA_LINK_LIBS} DEPENDENCIES gandiva_java gandiva_jni_proto @@ -69,6 +70,8 @@ ADD_ARROW_LIB(gandiva_jni ${JNI_HEADERS_DIR} PRIVATE_INCLUDES ${JNI_INCLUDE_DIRS} ${CMAKE_CURRENT_BINARY_DIR}) +add_dependencies(gandiva ${GANDIVA_JNI_LIBRARIES}) + # filter out everything that is not needed for the jni bridge # statically linked stdc++ has conflicts with stdc++ loaded by other libraries. 
if (NOT APPLE) diff --git a/cpp/src/gandiva/precompiled/CMakeLists.txt b/cpp/src/gandiva/precompiled/CMakeLists.txt index a4414cae0fc86..886fdced887ff 100644 --- a/cpp/src/gandiva/precompiled/CMakeLists.txt +++ b/cpp/src/gandiva/precompiled/CMakeLists.txt @@ -51,11 +51,30 @@ add_custom_command( add_custom_target(precompiled ALL DEPENDS ${GANDIVA_BC_OUTPUT_PATH}) +# Add a unittest executable for a precompiled file (used to generate IR) +function(add_precompiled_unit_test REL_TEST_NAME) + get_filename_component(TEST_NAME ${REL_TEST_NAME} NAME_WE) + + set(TEST_NAME "gandiva-precompiled-${TEST_NAME}") + + add_executable(${TEST_NAME} ${REL_TEST_NAME} ${ARGN}) + add_dependencies(gandiva ${TEST_NAME}) + target_include_directories(${TEST_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/src) + target_link_libraries(${TEST_NAME} + PRIVATE ${ARROW_TEST_LINK_LIBS} ${RE2_LIBRARY} + ) + target_compile_definitions(${TEST_NAME} PRIVATE GANDIVA_UNIT_TEST=1) + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) + set_property(TEST ${TEST_NAME} PROPERTY LABELS gandiva;unittest ${TEST_NAME}) +endfunction(add_precompiled_unit_test REL_TEST_NAME) + # testing -add_precompiled_unit_test(bitmap_test.cc bitmap.cc) -add_precompiled_unit_test(epoch_time_point_test.cc) -add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc ../context_helper.cc) -add_precompiled_unit_test(hash_test.cc hash.cc) -add_precompiled_unit_test(string_ops_test.cc string_ops.cc ../context_helper.cc) -add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc ../context_helper.cc) -add_precompiled_unit_test(extended_math_ops_test.cc extended_math_ops.cc ../context_helper.cc) +if (ARROW_GANDIVA_BUILD_TESTS) + add_precompiled_unit_test(bitmap_test.cc bitmap.cc) + add_precompiled_unit_test(epoch_time_point_test.cc) + add_precompiled_unit_test(time_test.cc time.cc timestamp_arithmetic.cc ../context_helper.cc) + add_precompiled_unit_test(hash_test.cc hash.cc) + add_precompiled_unit_test(string_ops_test.cc string_ops.cc ../context_helper.cc) + add_precompiled_unit_test(arithmetic_ops_test.cc arithmetic_ops.cc ../context_helper.cc) + add_precompiled_unit_test(extended_math_ops_test.cc extended_math_ops.cc ../context_helper.cc) +endif() diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index ae600634e74a7..1fd30aac495cf 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -15,28 +15,23 @@ # specific language governing permissions and limitations # under the License. 
-project(gandiva) +ADD_GANDIVA_TEST(filter_test) +ADD_GANDIVA_TEST(projector_test) +ADD_GANDIVA_TEST(projector_build_validation_test) +ADD_GANDIVA_TEST(if_expr_test) +ADD_GANDIVA_TEST(literal_test) +ADD_GANDIVA_TEST(boolean_expr_test) +ADD_GANDIVA_TEST(binary_test) +ADD_GANDIVA_TEST(date_time_test) +ADD_GANDIVA_TEST(to_string_test) +ADD_GANDIVA_TEST(hash_test) +ADD_GANDIVA_TEST(in_expr_test) +ADD_GANDIVA_TEST(null_validity_test) -foreach(lib_type "shared" "static") - add_gandiva_integ_test(filter_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(projector_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(if_expr_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(literal_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(projector_build_validation_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(boolean_expr_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(utf8_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(binary_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(date_time_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(to_string_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(hash_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(in_expr_test.cc gandiva_${lib_type}) - add_gandiva_integ_test(null_validity_test.cc gandiva_${lib_type}) -endforeach(lib_type) - -set(GANDIVA_BENCHMARK_LINK_LIBRARIES - gandiva_static -) +ADD_GANDIVA_TEST(projector_test_static + SOURCES projector_test.cc + USE_STATIC_LINKING) ADD_ARROW_BENCHMARK(micro_benchmarks PREFIX "gandiva" - EXTRA_LINK_LIBS ${GANDIVA_BENCHMARK_LINK_LIBRARIES}) + EXTRA_LINK_LIBS gandiva_static) diff --git a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc index 8aedd388d2341..24ec0dd24eec3 100644 --- a/cpp/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/cpp/src/parquet/arrow/arrow-reader-writer-test.cc @@ -29,6 +29,11 @@ #include #include +#include "arrow/api.h" +#include "arrow/test-util.h" +#include "arrow/type_traits.h" +#include "arrow/util/decimal.h" + #include "parquet/api/reader.h" #include "parquet/api/writer.h" @@ -36,16 +41,9 @@ #include "parquet/arrow/schema.h" #include "parquet/arrow/test-util.h" #include "parquet/arrow/writer.h" - #include "parquet/file_writer.h" - #include "parquet/util/test-common.h" -#include "arrow/api.h" -#include "arrow/test-util.h" -#include "arrow/type_traits.h" -#include "arrow/util/decimal.h" - using arrow::Array; using arrow::ArrayVisitor; using arrow::Buffer; @@ -1712,6 +1710,7 @@ TEST(TestArrowReadWrite, ReadColumnSubset) { TEST(TestArrowReadWrite, ListLargeRecords) { // PARQUET-1308: This test passed on Linux when num_rows was smaller const int num_rows = 2000; + const int row_group_size = 100; std::shared_ptr list_array; std::shared_ptr<::DataType> list_type; @@ -1723,8 +1722,8 @@ TEST(TestArrowReadWrite, ListLargeRecords) { std::shared_ptr
table = Table::Make(schema, {list_array}); std::shared_ptr buffer; - ASSERT_NO_FATAL_FAILURE( - WriteTableToBuffer(table, 100, default_arrow_writer_properties(), &buffer)); + ASSERT_NO_FATAL_FAILURE(WriteTableToBuffer(table, row_group_size, + default_arrow_writer_properties(), &buffer)); std::unique_ptr reader; ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), @@ -1736,7 +1735,7 @@ TEST(TestArrowReadWrite, ListLargeRecords) { ASSERT_OK_NO_THROW(reader->ReadTable(&result)); ASSERT_NO_FATAL_FAILURE(::arrow::AssertTablesEqual(*table, *result)); - // Read chunked + // Read 1 record at a time ASSERT_OK_NO_THROW(OpenFile(std::make_shared(buffer), ::arrow::default_memory_pool(), ::parquet::default_reader_properties(), nullptr, &reader)); @@ -2263,7 +2262,7 @@ TEST_P(TestNestedSchemaRead, DeepNestedSchemaRead) { const int num_trees = 3; const int depth = 3; #else - const int num_trees = 10; + const int num_trees = 5; const int depth = 5; #endif const int num_children = 3; diff --git a/cpp/src/parquet/arrow/record_reader.cc b/cpp/src/parquet/arrow/record_reader.cc index ce6fa2a5b91bb..4a3cd526b118a 100644 --- a/cpp/src/parquet/arrow/record_reader.cc +++ b/cpp/src/parquet/arrow/record_reader.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,10 @@ static bool IsDictionaryIndexEncoding(const Encoding::type& e) { return e == Encoding::RLE_DICTIONARY || e == Encoding::PLAIN_DICTIONARY; } +// The minimum number of repetition/definition levels to decode at a time, for +// better vectorized performance when doing many smaller record reads +constexpr int64_t kMinLevelBatchSize = 1024; + class RecordReader::RecordReaderImpl { public: RecordReaderImpl(const ColumnDescriptor* descr, MemoryPool* pool) @@ -94,7 +99,88 @@ class RecordReader::RecordReaderImpl { virtual ~RecordReaderImpl() = default; - virtual int64_t ReadRecords(int64_t num_records) = 0; + virtual int64_t ReadRecordData(const int64_t num_records) = 0; + + // Returns true if there are still values in this column. + bool HasNext() { + // Either there is no data page available yet, or the data page has been + // exhausted + if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) { + if (!ReadNewPage() || num_buffered_values_ == 0) { + return false; + } + } + return true; + } + + int64_t ReadRecords(int64_t num_records) { + // Delimit records, then read values at the end + int64_t records_read = 0; + + if (levels_position_ < levels_written_) { + records_read += ReadRecordData(num_records); + } + + int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records); + + // If we are in the middle of a record, we continue until reaching the + // desired number of records or the end of the current record if we've found + // enough records + while (!at_record_start_ || records_read < num_records) { + // Is there more data to read in this row group? + if (!HasNext()) { + if (!at_record_start_) { + // We ended the row group while inside a record that we haven't seen + // the end of yet. 
So increment the record count for the last record in + // the row group + ++records_read; + at_record_start_ = true; + } + break; + } + + /// We perform multiple batch reads until we either exhaust the row group + /// or observe the desired number of records + int64_t batch_size = std::min(level_batch_size, available_values_current_page()); + + // No more data in column + if (batch_size == 0) { + break; + } + + if (max_def_level_ > 0) { + ReserveLevels(batch_size); + + int16_t* def_levels = this->def_levels() + levels_written_; + int16_t* rep_levels = this->rep_levels() + levels_written_; + + // Not present for non-repeated fields + int64_t levels_read = 0; + if (max_rep_level_ > 0) { + levels_read = ReadDefinitionLevels(batch_size, def_levels); + if (ReadRepetitionLevels(batch_size, rep_levels) != levels_read) { + throw ParquetException("Number of decoded rep / def levels did not match"); + } + } else if (max_def_level_ > 0) { + levels_read = ReadDefinitionLevels(batch_size, def_levels); + } + + // Exhausted column chunk + if (levels_read == 0) { + break; + } + + levels_written_ += levels_read; + records_read += ReadRecordData(num_records - records_read); + } else { + // No repetition or definition levels + batch_size = std::min(num_records - records_read, batch_size); + records_read += ReadRecordData(batch_size); + } + } + + return records_read; + } // Dictionary decoders must be reset when advancing row groups virtual void ResetDecoders() = 0; @@ -303,7 +389,11 @@ class RecordReader::RecordReaderImpl { } } + virtual void DebugPrintState() = 0; + protected: + virtual bool ReadNewPage() = 0; + const ColumnDescriptor* descr_; ::arrow::MemoryPool* pool_; @@ -359,10 +449,6 @@ class RecordReader::RecordReaderImpl { std::shared_ptr<::arrow::ResizableBuffer> rep_levels_; }; -// The minimum number of repetition/definition levels to decode at a time, for -// better vectorized performance when doing many smaller record reads -constexpr int64_t kMinLevelBatchSize = 1024; - template class TypedRecordReader : public RecordReader::RecordReaderImpl { public: @@ -390,7 +476,7 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { } // Return number of logical records read - int64_t ReadRecordData(const int64_t num_records) { + int64_t ReadRecordData(const int64_t num_records) override { // Conservative upper bound const int64_t possible_num_values = std::max(num_records, levels_written_ - levels_position_); @@ -434,85 +520,30 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { return records_read; } - // Returns true if there are still values in this column. 
- bool HasNext() { - // Either there is no data page available yet, or the data page has been - // exhausted - if (num_buffered_values_ == 0 || num_decoded_values_ == num_buffered_values_) { - if (!ReadNewPage() || num_buffered_values_ == 0) { - return false; - } - } - return true; - } + void DebugPrintState() override { + const int16_t* def_levels = this->def_levels(); + const int16_t* rep_levels = this->rep_levels(); + const int64_t total_levels_read = levels_position_; - int64_t ReadRecords(int64_t num_records) override { - // Delimit records, then read values at the end - int64_t records_read = 0; + const T* values = reinterpret_cast(this->values()); - if (levels_position_ < levels_written_) { - records_read += ReadRecordData(num_records); + std::cout << "def levels: "; + for (int64_t i = 0; i < total_levels_read; ++i) { + std::cout << def_levels[i] << " "; } + std::cout << std::endl; - int64_t level_batch_size = std::max(kMinLevelBatchSize, num_records); - - // If we are in the middle of a record, we continue until reaching the - // desired number of records or the end of the current record if we've found - // enough records - while (!at_record_start_ || records_read < num_records) { - // Is there more data to read in this row group? - if (!HasNext()) { - if (!at_record_start_) { - // We ended the row group while inside a record that we haven't seen - // the end of yet. So increment the record count for the last record in - // the row group - ++records_read; - at_record_start_ = true; - } - break; - } - - /// We perform multiple batch reads until we either exhaust the row group - /// or observe the desired number of records - int64_t batch_size = std::min(level_batch_size, available_values_current_page()); - - // No more data in column - if (batch_size == 0) { - break; - } - - if (max_def_level_ > 0) { - ReserveLevels(batch_size); - - int16_t* def_levels = this->def_levels() + levels_written_; - int16_t* rep_levels = this->rep_levels() + levels_written_; - - // Not present for non-repeated fields - int64_t levels_read = 0; - if (max_rep_level_ > 0) { - levels_read = ReadDefinitionLevels(batch_size, def_levels); - if (ReadRepetitionLevels(batch_size, rep_levels) != levels_read) { - throw ParquetException("Number of decoded rep / def levels did not match"); - } - } else if (max_def_level_ > 0) { - levels_read = ReadDefinitionLevels(batch_size, def_levels); - } - - // Exhausted column chunk - if (levels_read == 0) { - break; - } - - levels_written_ += levels_read; - records_read += ReadRecordData(num_records - records_read); - } else { - // No repetition or definition levels - batch_size = std::min(num_records - records_read, batch_size); - records_read += ReadRecordData(batch_size); - } + std::cout << "rep levels: "; + for (int64_t i = 0; i < total_levels_read; ++i) { + std::cout << rep_levels[i] << " "; } + std::cout << std::endl; - return records_read; + std::cout << "values: "; + for (int64_t i = 0; i < this->values_written(); ++i) { + std::cout << values[i] << " "; + } + std::cout << std::endl; } private: @@ -526,11 +557,21 @@ class TypedRecordReader : public RecordReader::RecordReaderImpl { DecoderType* current_decoder_; // Advance to the next data page - bool ReadNewPage(); + bool ReadNewPage() override; void ConfigureDictionary(const DictionaryPage* page); }; +// TODO(wesm): Implement these to some satisfaction +template <> +void TypedRecordReader::DebugPrintState() {} + +template <> +void TypedRecordReader::DebugPrintState() {} + +template <> +void 
TypedRecordReader::DebugPrintState() {} + template <> inline void TypedRecordReader::ReadValuesDense(int64_t values_to_read) { auto values = ValuesHead(); @@ -822,5 +863,7 @@ void RecordReader::SetPageReader(std::unique_ptr reader) { impl_->SetPageReader(std::move(reader)); } +void RecordReader::DebugPrintState() { impl_->DebugPrintState(); } + } // namespace internal } // namespace parquet diff --git a/cpp/src/parquet/arrow/record_reader.h b/cpp/src/parquet/arrow/record_reader.h index 8da0709997026..7efd0d54899fe 100644 --- a/cpp/src/parquet/arrow/record_reader.h +++ b/cpp/src/parquet/arrow/record_reader.h @@ -104,6 +104,8 @@ class RecordReader { /// \param[in] reader obtained from RowGroupReader::GetColumnPageReader void SetPageReader(std::unique_ptr reader); + void DebugPrintState(); + private: std::unique_ptr impl_; explicit RecordReader(RecordReaderImpl* impl); diff --git a/cpp/src/parquet/arrow/test-util.h b/cpp/src/parquet/arrow/test-util.h index d425cb0db7e48..abe4a03364e13 100644 --- a/cpp/src/parquet/arrow/test-util.h +++ b/cpp/src/parquet/arrow/test-util.h @@ -15,8 +15,11 @@ // specific language governing permissions and limitations // under the License. +#pragma once + #include #include +#include #include #include #include @@ -28,14 +31,6 @@ #include "parquet/arrow/record_reader.h" -namespace arrow { -// PARQUET-1382: backwards-compatible shim for arrow::test namespace -namespace test {} - -using namespace ::arrow::test; // NOLINT - -} // namespace arrow - namespace parquet { using internal::RecordReader; @@ -144,9 +139,9 @@ NonNullArray(size_t size, std::shared_ptr* out) { static inline void random_decimals(int64_t n, uint32_t seed, int32_t precision, uint8_t* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution d(0, std::numeric_limits::max()); - const int32_t required_bytes = DecimalSize(precision); + const int32_t required_bytes = ::arrow::DecimalSize(precision); constexpr int32_t byte_width = 16; std::fill(out, out + byte_width * n, '\0'); @@ -433,14 +428,13 @@ Status MakeEmptyListsArray(int64_t size, std::shared_ptr* out_array) { return Status::OK(); } -static std::shared_ptr<::arrow::Column> MakeColumn(const std::string& name, - const std::shared_ptr& array, - bool nullable) { +static inline std::shared_ptr<::arrow::Column> MakeColumn( + const std::string& name, const std::shared_ptr& array, bool nullable) { auto field = ::arrow::field(name, array->type(), nullable); return std::make_shared<::arrow::Column>(field, array); } -static std::shared_ptr<::arrow::Column> MakeColumn( +static inline std::shared_ptr<::arrow::Column> MakeColumn( const std::string& name, const std::vector>& arrays, bool nullable) { auto field = ::arrow::field(name, arrays[0]->type(), nullable); @@ -484,44 +478,6 @@ void ExpectArrayT<::arrow::BooleanType>(void* expected, Array* result) { EXPECT_TRUE(result->Equals(*expected_array)); } -template -void PrintBufferedLevels(const RecordReader& reader) { - using T = typename ::parquet::type_traits::value_type; - - const int16_t* def_levels = reader.def_levels(); - const int16_t* rep_levels = reader.rep_levels(); - const int64_t total_levels_read = reader.levels_position(); - - const T* values = reinterpret_cast(reader.values()); - - std::cout << "def levels: "; - for (int64_t i = 0; i < total_levels_read; ++i) { - std::cout << def_levels[i] << " "; - } - std::cout << std::endl; - - std::cout << "rep levels: "; - for (int64_t i = 0; i < total_levels_read; ++i) { - std::cout << rep_levels[i] << " "; - } - 
std::cout << std::endl; - - std::cout << "values: "; - for (int64_t i = 0; i < reader.values_written(); ++i) { - std::cout << values[i] << " "; - } - std::cout << std::endl; -} - -template <> -void PrintBufferedLevels(const RecordReader& reader) {} - -template <> -void PrintBufferedLevels(const RecordReader& reader) {} - -template <> -void PrintBufferedLevels(const RecordReader& reader) {} - } // namespace arrow } // namespace parquet diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index f5e234d30211e..ef5de07d87f16 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -504,7 +504,7 @@ Status ArrowColumnWriter::WriteNullableBatch( using ParquetCType = typename ParquetType::c_type; ParquetCType* buffer; - RETURN_NOT_OK(ctx_->GetScratchData(num_levels, &buffer)); + RETURN_NOT_OK(ctx_->GetScratchData(num_values, &buffer)); for (int i = 0; i < num_values; i++) { buffer[i] = static_cast(values[i]); } diff --git a/cpp/src/parquet/bloom_filter-test.cc b/cpp/src/parquet/bloom_filter-test.cc index 945f80b7b96f0..e2b0b699b203f 100644 --- a/cpp/src/parquet/bloom_filter-test.cc +++ b/cpp/src/parquet/bloom_filter-test.cc @@ -93,17 +93,13 @@ std::string GetRandomString(uint32_t length) { const std::string charset = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; - // The uuid_seed was generated by "uuidgen -r" - const std::string uuid_seed = "8de406aa-fb59-4195-a81c-5152af26433f"; - std::seed_seq seed(uuid_seed.begin(), uuid_seed.end()); - std::mt19937 generator(seed); + std::default_random_engine gen(42); std::uniform_int_distribution dist(0, static_cast(charset.size() - 1)); - std::string ret = ""; + std::string ret(length, 'x'); for (uint32_t i = 0; i < length; i++) { - ret += charset[dist(generator)]; + ret[i] = charset[dist(gen)]; } - return ret; } @@ -146,7 +142,7 @@ TEST(FPPTest, TestBloomFilter) { } // The exist should be probably less than 1000 according default FPP 0.01. 
- EXPECT_TRUE(exist < total_count * fpp); + EXPECT_LT(exist, total_count * fpp); } // The CompatibilityTest is used to test cross compatibility with parquet-mr, it reads diff --git a/cpp/src/parquet/column_reader-test.cc b/cpp/src/parquet/column_reader-test.cc index 273b3029ba3d1..60f2be2362510 100644 --- a/cpp/src/parquet/column_reader-test.cc +++ b/cpp/src/parquet/column_reader-test.cc @@ -386,5 +386,34 @@ TEST_F(TestPrimitiveReader, TestDictionaryEncodedPages) { pages_.clear(); } +TEST(TestColumnReader, DefinitionLevelsToBitmap) { + // Bugs in this function were exposed in ARROW-3930 + std::vector<int16_t> def_levels = {3, 3, 3, 2, 3, 3, 3, 3, 3}; + std::vector<int16_t> rep_levels = {0, 1, 1, 1, 1, 1, 1, 1, 1}; + + std::vector<uint8_t> valid_bits(2, 0); + + const int max_def_level = 3; + const int max_rep_level = 1; + + int64_t values_read = -1; + int64_t null_count = 0; + internal::DefinitionLevelsToBitmap(def_levels.data(), 9, max_def_level, max_rep_level, + &values_read, &null_count, valid_bits.data(), + 0 /* valid_bits_offset */); + ASSERT_EQ(9, values_read); + ASSERT_EQ(1, null_count); + + // Call again with 0 definition levels, make sure that valid_bits is unmodified + const uint8_t current_byte = valid_bits[1]; + null_count = 0; + internal::DefinitionLevelsToBitmap(def_levels.data(), 0, max_def_level, max_rep_level, + &values_read, &null_count, valid_bits.data(), + 9 /* valid_bits_offset */); + ASSERT_EQ(0, values_read); + ASSERT_EQ(0, null_count); + ASSERT_EQ(current_byte, valid_bits[1]); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/column_reader.h b/cpp/src/parquet/column_reader.h index 960f2107dfa09..42bf900c97932 100644 --- a/cpp/src/parquet/column_reader.h +++ b/cpp/src/parquet/column_reader.h @@ -178,9 +178,11 @@ namespace internal { static inline void DefinitionLevelsToBitmap( const int16_t* def_levels, int64_t num_def_levels, const int16_t max_definition_level, const int16_t max_repetition_level, int64_t* values_read, int64_t* null_count, - uint8_t* valid_bits, const int64_t valid_bits_offset) { + uint8_t* valid_bits, int64_t valid_bits_offset) { + // We assume here that valid_bits is large enough to accommodate the + // additional definition levels and the ones that have already been written ::arrow::internal::BitmapWriter valid_bits_writer(valid_bits, valid_bits_offset, - num_def_levels); + valid_bits_offset + num_def_levels); // TODO(itaiin): As an interim solution we are splitting the code path here // between repeated+flat column reads, and non-repeated+nested reads. 
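For intuition about the level arithmetic exercised by the new DefinitionLevelsToBitmap test above: with an optional list of optional items, max_def_level is 3 and max_rep_level is 1, so a definition level of 3 marks a present leaf value, 2 marks a null item inside a present list, and a repetition level of 0 starts a new record. A small illustrative sketch (the helper is hypothetical, not part of the patch):

#include <cstdint>
#include <vector>

// Level semantics for max_def_level = 3 (optional bag > repeated list >
// optional item) and max_rep_level = 1:
//   def == 3 -> leaf value is present
//   def == 2 -> null item inside a non-null list
//   rep == 0 -> a new record starts at this position
// The single record [0, 1, null] would encode as
//   def_levels = {3, 3, 2}, rep_levels = {0, 1, 1}.
int CountNullItems(const std::vector<int16_t>& def_levels, int16_t max_def_level) {
  int nulls = 0;
  for (int16_t def : def_levels) {
    if (def == max_def_level - 1) ++nulls;  // null leaf inside a present list
  }
  return nulls;
}

Applied to the test's def_levels {3, 3, 3, 2, 3, 3, 3, 3, 3} this returns 1, matching the expected null_count.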
diff --git a/cpp/src/parquet/column_writer-test.cc b/cpp/src/parquet/column_writer-test.cc index b81f3ed8152b6..4416e3d18e9ad 100644 --- a/cpp/src/parquet/column_writer-test.cc +++ b/cpp/src/parquet/column_writer-test.cc @@ -17,6 +17,8 @@ #include +#include + #include "parquet/column_reader.h" #include "parquet/column_writer.h" #include "parquet/test-specialization.h" @@ -28,6 +30,7 @@ namespace parquet { +using schema::GroupNode; using schema::NodePtr; using schema::PrimitiveNode; @@ -581,6 +584,52 @@ TEST_F(TestByteArrayValuesWriter, CheckDefaultStats) { ASSERT_TRUE(this->metadata_is_stats_set()); } +TEST(TestColumnWriter, RepeatedListsUpdateSpacedBug) { + // In ARROW-3930 we discovered a bug when writing from Arrow when we had data + // that looks like this: + // + // [null, [0, 1, null, 2, 3, 4, null]] + + // Create schema + NodePtr item = schema::Int32("item"); // optional item + NodePtr list(GroupNode::Make("b", Repetition::REPEATED, {item}, LogicalType::LIST)); + NodePtr bag(GroupNode::Make("bag", Repetition::OPTIONAL, {list})); // optional list + std::vector fields = {bag}; + NodePtr root = GroupNode::Make("schema", Repetition::REPEATED, fields); + + SchemaDescriptor schema; + schema.Init(root); + + InMemoryOutputStream sink; + auto props = WriterProperties::Builder().build(); + + auto metadata = ColumnChunkMetaDataBuilder::Make(props, schema.Column(0)); + std::unique_ptr pager = + PageWriter::Open(&sink, Compression::UNCOMPRESSED, metadata.get()); + std::shared_ptr writer = + ColumnWriter::Make(metadata.get(), std::move(pager), props.get()); + auto typed_writer = std::static_pointer_cast>(writer); + + std::vector def_levels = {1, 3, 3, 2, 3, 3, 3, 2, 3, 3, 3, 2, 3, 3}; + std::vector rep_levels = {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + std::vector values = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}; + + // Write the values into uninitialized memory + std::shared_ptr values_buffer; + ASSERT_OK(::arrow::AllocateBuffer(64, &values_buffer)); + memcpy(values_buffer->mutable_data(), values.data(), 13 * sizeof(int32_t)); + auto values_data = reinterpret_cast(values_buffer->data()); + + std::shared_ptr valid_bits; + ASSERT_OK(::arrow::BitUtil::BytesToBits({1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1}, + ::arrow::default_memory_pool(), &valid_bits)); + + // valgrind will warn about out of bounds access into def_levels_data + typed_writer->WriteBatchSpaced(14, def_levels.data(), rep_levels.data(), + valid_bits->data(), 0, values_data); + writer->Close(); +} + void GenerateLevels(int min_repeat_factor, int max_repeat_factor, int max_level, std::vector& input_levels) { // for each repetition count upto max_repeat_factor diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index a45613f1b982c..37fce9c036b31 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -353,7 +353,6 @@ ColumnWriter::ColumnWriter(ColumnChunkMetaDataBuilder* metadata, encoding_(encoding), properties_(properties), allocator_(properties->memory_pool()), - pool_(properties->memory_pool()), num_buffered_values_(0), num_buffered_encoded_values_(0), rows_written_(0), @@ -546,8 +545,7 @@ TypedColumnWriter::TypedColumnWriter(ColumnChunkMetaDataBuilder* metadata, break; case Encoding::PLAIN_DICTIONARY: case Encoding::RLE_DICTIONARY: - current_encoder_.reset( - new DictEncoder(descr_, &pool_, properties->memory_pool())); + current_encoder_.reset(new DictEncoder(descr_, properties->memory_pool())); break; default: ParquetException::NYI("Selected encoding is not 
supported"); @@ -582,8 +580,6 @@ void TypedColumnWriter::WriteDictionaryPage() { std::shared_ptr buffer = AllocateBuffer(properties_->memory_pool(), dict_encoder->dict_encoded_size()); dict_encoder->WriteDict(buffer->mutable_data()); - // TODO Get rid of this deep call - dict_encoder->mem_pool()->FreeAll(); DictionaryPage page(buffer, dict_encoder->num_entries(), properties_->dictionary_index_encoding()); @@ -721,7 +717,7 @@ inline int64_t TypedColumnWriter::WriteMiniBatch(int64_t num_values, template inline int64_t TypedColumnWriter::WriteMiniBatchSpaced( - int64_t num_values, const int16_t* def_levels, const int16_t* rep_levels, + int64_t num_levels, const int16_t* def_levels, const int16_t* rep_levels, const uint8_t* valid_bits, int64_t valid_bits_offset, const T* values, int64_t* num_spaced_written) { int64_t values_to_write = 0; @@ -733,7 +729,7 @@ inline int64_t TypedColumnWriter::WriteMiniBatchSpaced( if (descr_->schema_node()->is_optional()) { min_spaced_def_level--; } - for (int64_t i = 0; i < num_values; ++i) { + for (int64_t i = 0; i < num_levels; ++i) { if (def_levels[i] == descr_->max_definition_level()) { ++values_to_write; } @@ -742,27 +738,27 @@ inline int64_t TypedColumnWriter::WriteMiniBatchSpaced( } } - WriteDefinitionLevels(num_values, def_levels); + WriteDefinitionLevels(num_levels, def_levels); } else { // Required field, write all values - values_to_write = num_values; - spaced_values_to_write = num_values; + values_to_write = num_levels; + spaced_values_to_write = num_levels; } // Not present for non-repeated fields if (descr_->max_repetition_level() > 0) { // A row could include more than one value // Count the occasions where we start a new row - for (int64_t i = 0; i < num_values; ++i) { + for (int64_t i = 0; i < num_levels; ++i) { if (rep_levels[i] == 0) { rows_written_++; } } - WriteRepetitionLevels(num_values, rep_levels); + WriteRepetitionLevels(num_levels, rep_levels); } else { // Each value is exactly one row - rows_written_ += static_cast(num_values); + rows_written_ += static_cast(num_levels); } if (descr_->schema_node()->is_optional()) { @@ -774,10 +770,10 @@ inline int64_t TypedColumnWriter::WriteMiniBatchSpaced( if (page_statistics_ != nullptr) { page_statistics_->UpdateSpaced(values, valid_bits, valid_bits_offset, values_to_write, - num_values - values_to_write); + spaced_values_to_write - values_to_write); } - num_buffered_values_ += num_values; + num_buffered_values_ += num_levels; num_buffered_encoded_values_ += values_to_write; if (current_encoder_->EstimatedDataEncodedSize() >= properties_->data_pagesize()) { diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 41bc7bd3bf2fe..e665ca718ffa5 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -15,8 +15,7 @@ // specific language governing permissions and limitations // under the License. -#ifndef PARQUET_COLUMN_WRITER_H -#define PARQUET_COLUMN_WRITER_H +#pragma once #include #include @@ -186,7 +185,6 @@ class PARQUET_EXPORT ColumnWriter { LevelEncoder level_encoder_; ::arrow::MemoryPool* allocator_; - ChunkedAllocator pool_; // The total number of values stored in the data page. This is the maximum of // the number of encoded definition levels or encoded values. 
For @@ -331,5 +329,3 @@ PARQUET_EXTERN_TEMPLATE TypedColumnWriter<ByteArrayType>; PARQUET_EXTERN_TEMPLATE TypedColumnWriter<FLBAType>; } // namespace parquet - -#endif // PARQUET_COLUMN_READER_H diff --git a/cpp/src/parquet/encoding-benchmark.cc b/cpp/src/parquet/encoding-benchmark.cc index 364cdba15a252..f8d2839af7ca7 100644 --- a/cpp/src/parquet/encoding-benchmark.cc +++ b/cpp/src/parquet/encoding-benchmark.cc @@ -104,11 +104,10 @@ static void DecodeDict(std::vector<typename Type::c_type>& values, typedef typename Type::c_type T; int num_values = static_cast<int>(values.size()); - ChunkedAllocator pool; MemoryPool* allocator = default_memory_pool(); std::shared_ptr<ColumnDescriptor> descr = Int64Schema(Repetition::REQUIRED); - DictEncoder<Type> encoder(descr.get(), &pool, allocator); + DictEncoder<Type> encoder(descr.get(), allocator); for (int i = 0; i < num_values; ++i) { encoder.Put(values[i]); } diff --git a/cpp/src/parquet/encoding-internal.h b/cpp/src/parquet/encoding-internal.h index b06ad41cc52c2..e2dfc2380ddcf 100644 --- a/cpp/src/parquet/encoding-internal.h +++ b/cpp/src/parquet/encoding-internal.h @@ -465,12 +465,10 @@ class DictEncoder : public Encoder<DType> { public: typedef typename DType::c_type T; - // XXX pool is unused - explicit DictEncoder(const ColumnDescriptor* desc, ChunkedAllocator* pool = nullptr, + explicit DictEncoder(const ColumnDescriptor* desc, ::arrow::MemoryPool* allocator = ::arrow::default_memory_pool()) : Encoder<DType>(desc, Encoding::PLAIN_DICTIONARY, allocator), allocator_(allocator), - pool_(pool), dict_encoded_size_(0), type_length_(desc->type_length()), memo_table_(INITIAL_HASH_TABLE_SIZE) {} @@ -538,8 +536,6 @@ class DictEncoder : public Encoder<DType> { /// dict_encoded_size() bytes. void WriteDict(uint8_t* buffer); - ChunkedAllocator* mem_pool() { return pool_; } - /// The number of entries in the dictionary. int num_entries() const { return memo_table_.size(); } @@ -549,9 +545,6 @@ ::arrow::MemoryPool* allocator_; - // For ByteArray / FixedLenByteArray data. Not owned - ChunkedAllocator* pool_; - /// Indices that have not yet be written out by WriteIndices(). std::vector<int> buffered_indices_; diff --git a/cpp/src/parquet/encoding-test.cc b/cpp/src/parquet/encoding-test.cc index 50e1394c629d0..90ceb7828b139 100644 --- a/cpp/src/parquet/encoding-test.cc +++ b/cpp/src/parquet/encoding-test.cc @@ -155,7 +155,7 @@ class TestEncodingBase : public ::testing::Test { allocator_ = default_memory_pool(); } - void TearDown() { pool_.FreeAll(); } + void TearDown() {} void InitData(int nvalues, int repeats) { num_values_ = nvalues * repeats; @@ -181,7 +181,6 @@ class TestEncodingBase : public ::testing::Test { } protected: - ChunkedAllocator pool_; MemoryPool* allocator_; int num_values_; @@ -199,7 +198,6 @@ class TestEncodingBase : public ::testing::Test { // Member variables are not visible to templated subclasses.
Possibly figure // out an alternative to this class layering at some point #define USING_BASE_MEMBERS() \ - using TestEncodingBase<Type>::pool_; \ using TestEncodingBase<Type>::allocator_; \ using TestEncodingBase<Type>::descr_; \ using TestEncodingBase<Type>::num_values_; \ @@ -253,14 +251,14 @@ class TestDictionaryEncoding : public TestEncodingBase<Type> { void CheckRoundtrip() { std::vector<uint8_t> valid_bits(BitUtil::BytesForBits(num_values_) + 1, 255); - DictEncoder<Type> encoder(descr_.get(), &pool_); + DictEncoder<Type> encoder(descr_.get()); ASSERT_NO_THROW(encoder.Put(draws_, num_values_)); dict_buffer_ = AllocateBuffer(default_memory_pool(), encoder.dict_encoded_size()); encoder.WriteDict(dict_buffer_->mutable_data()); std::shared_ptr<Buffer> indices = encoder.FlushValues(); - DictEncoder<Type> spaced_encoder(descr_.get(), &pool_); + DictEncoder<Type> spaced_encoder(descr_.get()); // PutSpaced should lead to the same results ASSERT_NO_THROW(spaced_encoder.PutSpaced(draws_, num_values_, valid_bits.data(), 0)); std::shared_ptr<Buffer> indices_from_spaced = spaced_encoder.FlushValues(); diff --git a/cpp/src/parquet/test-specialization.h b/cpp/src/parquet/test-specialization.h index 3d88cfc9e3fb2..55d23748c5cea 100644 --- a/cpp/src/parquet/test-specialization.h +++ b/cpp/src/parquet/test-specialization.h @@ -19,8 +19,7 @@ // Parquet column chunk within a row group. It could be extended in the future // to iterate through all data pages in all chunks in a file. -#ifndef PARQUET_COLUMN_TEST_SPECIALIZATION_H -#define PARQUET_COLUMN_TEST_SPECIALIZATION_H +#pragma once #include #include @@ -179,5 +178,3 @@ void PrimitiveTypedTest<TestType>::GenerateData(int64_t num_values) { } // namespace test } // namespace parquet - -#endif // PARQUET_COLUMN_TEST_SPECIALIZATION_H diff --git a/cpp/src/parquet/test-util.h b/cpp/src/parquet/test-util.h index 3e74398b054ca..ab9c50a392862 100644 --- a/cpp/src/parquet/test-util.h +++ b/cpp/src/parquet/test-util.h @@ -19,8 +19,7 @@ // Parquet column chunk within a row group. It could be extended in the future // to iterate through all data pages in all chunks in a file. -#ifndef PARQUET_COLUMN_TEST_UTIL_H -#define PARQUET_COLUMN_TEST_UTIL_H +#pragma once #include #include @@ -247,10 +246,10 @@ class DictionaryPageBuilder { // This class writes data and metadata to the passed inputs explicit DictionaryPageBuilder(const ColumnDescriptor* d) : num_dict_values_(0), have_values_(false) { - encoder_.reset(new DictEncoder<TYPE>(d, &pool_)); + encoder_.reset(new DictEncoder<TYPE>(d)); } - ~DictionaryPageBuilder() { pool_.FreeAll(); } + ~DictionaryPageBuilder() {} shared_ptr<Buffer> AppendValues(const vector<TC>& values) { int num_values = static_cast<int>(values.size()); @@ -271,7 +270,6 @@ class DictionaryPageBuilder { int32_t num_values() const { return num_dict_values_; } private: - ChunkedAllocator pool_; shared_ptr<DictEncoder<TYPE>> encoder_; int32_t num_dict_values_; bool have_values_; @@ -443,5 +441,3 @@ static int MakePages(const ColumnDescriptor* d, int num_pages, int levels_per_pa } // namespace test } // namespace parquet - -#endif // PARQUET_COLUMN_TEST_UTIL_H diff --git a/cpp/src/parquet/util/memory-test.cc b/cpp/src/parquet/util/memory-test.cc index bfd685db00d2a..cdeb8eef8110c 100644 --- a/cpp/src/parquet/util/memory-test.cc +++ b/cpp/src/parquet/util/memory-test.cc @@ -34,222 +34,6 @@ namespace parquet { class TestBuffer : public ::testing::Test {}; -// Utility class to call private functions on MemPool.
-class ChunkedAllocatorTest { - public: - static bool CheckIntegrity(ChunkedAllocator* pool, bool current_chunk_empty) { - return pool->CheckIntegrity(current_chunk_empty); - } - - static const int INITIAL_CHUNK_SIZE = ChunkedAllocator::INITIAL_CHUNK_SIZE; - static const int MAX_CHUNK_SIZE = ChunkedAllocator::MAX_CHUNK_SIZE; -}; - -const int ChunkedAllocatorTest::INITIAL_CHUNK_SIZE; -const int ChunkedAllocatorTest::MAX_CHUNK_SIZE; - -TEST(ChunkedAllocatorTest, Basic) { - ChunkedAllocator p; - ChunkedAllocator p2; - ChunkedAllocator p3; - - for (int iter = 0; iter < 2; ++iter) { - // allocate a total of 24K in 32-byte pieces (for which we only request 25 bytes) - for (int i = 0; i < 768; ++i) { - // pads to 32 bytes - p.Allocate(25); - } - // we handed back 24K - EXPECT_EQ(24 * 1024, p.total_allocated_bytes()); - // .. and allocated 28K of chunks (4, 8, 16) - EXPECT_EQ(28 * 1024, p.GetTotalChunkSizes()); - - // we're passing on the first two chunks, containing 12K of data; we're left with - // one chunk of 16K containing 12K of data - p2.AcquireData(&p, true); - EXPECT_EQ(12 * 1024, p.total_allocated_bytes()); - EXPECT_EQ(16 * 1024, p.GetTotalChunkSizes()); - - // we allocate 8K, for which there isn't enough room in the current chunk, - // so another one is allocated (32K) - p.Allocate(8 * 1024); - EXPECT_EQ((16 + 32) * 1024, p.GetTotalChunkSizes()); - - // we allocate 65K, which doesn't fit into the current chunk or the default - // size of the next allocated chunk (64K) - p.Allocate(65 * 1024); - EXPECT_EQ((12 + 8 + 65) * 1024, p.total_allocated_bytes()); - if (iter == 0) { - EXPECT_EQ((12 + 8 + 65) * 1024, p.peak_allocated_bytes()); - } else { - EXPECT_EQ((1 + 120 + 33) * 1024, p.peak_allocated_bytes()); - } - EXPECT_EQ((16 + 32 + 65) * 1024, p.GetTotalChunkSizes()); - - // Clear() resets allocated data, but doesn't remove any chunks - p.Clear(); - EXPECT_EQ(0, p.total_allocated_bytes()); - if (iter == 0) { - EXPECT_EQ((12 + 8 + 65) * 1024, p.peak_allocated_bytes()); - } else { - EXPECT_EQ((1 + 120 + 33) * 1024, p.peak_allocated_bytes()); - } - EXPECT_EQ((16 + 32 + 65) * 1024, p.GetTotalChunkSizes()); - - // next allocation reuses existing chunks - p.Allocate(1024); - EXPECT_EQ(1024, p.total_allocated_bytes()); - if (iter == 0) { - EXPECT_EQ((12 + 8 + 65) * 1024, p.peak_allocated_bytes()); - } else { - EXPECT_EQ((1 + 120 + 33) * 1024, p.peak_allocated_bytes()); - } - EXPECT_EQ((16 + 32 + 65) * 1024, p.GetTotalChunkSizes()); - - // ... unless it doesn't fit into any available chunk - p.Allocate(120 * 1024); - EXPECT_EQ((1 + 120) * 1024, p.total_allocated_bytes()); - if (iter == 0) { - EXPECT_EQ((1 + 120) * 1024, p.peak_allocated_bytes()); - } else { - EXPECT_EQ((1 + 120 + 33) * 1024, p.peak_allocated_bytes()); - } - EXPECT_EQ((130 + 16 + 32 + 65) * 1024, p.GetTotalChunkSizes()); - - // ... 
Try another chunk that fits into an existing chunk - p.Allocate(33 * 1024); - EXPECT_EQ((1 + 120 + 33) * 1024, p.total_allocated_bytes()); - EXPECT_EQ((130 + 16 + 32 + 65) * 1024, p.GetTotalChunkSizes()); - - // we're releasing 3 chunks, which get added to p2 - p2.AcquireData(&p, false); - EXPECT_EQ(0, p.total_allocated_bytes()); - EXPECT_EQ((1 + 120 + 33) * 1024, p.peak_allocated_bytes()); - EXPECT_EQ(0, p.GetTotalChunkSizes()); - - p3.AcquireData(&p2, true); // we're keeping the 65k chunk - EXPECT_EQ(33 * 1024, p2.total_allocated_bytes()); - EXPECT_EQ(65 * 1024, p2.GetTotalChunkSizes()); - - p.FreeAll(); - p2.FreeAll(); - p3.FreeAll(); - } -} - -// Test that we can keep an allocated chunk and a free chunk. -// This case verifies that when chunks are acquired by another memory pool the -// remaining chunks are consistent if there were more than one used chunk and some -// free chunks. -TEST(ChunkedAllocatorTest, Keep) { - ChunkedAllocator p; - p.Allocate(4 * 1024); - p.Allocate(8 * 1024); - p.Allocate(16 * 1024); - EXPECT_EQ((4 + 8 + 16) * 1024, p.total_allocated_bytes()); - EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); - p.Clear(); - EXPECT_EQ(0, p.total_allocated_bytes()); - EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); - p.Allocate(1 * 1024); - p.Allocate(4 * 1024); - EXPECT_EQ((1 + 4) * 1024, p.total_allocated_bytes()); - EXPECT_EQ((4 + 8 + 16) * 1024, p.GetTotalChunkSizes()); - - ChunkedAllocator p2; - p2.AcquireData(&p, true); - EXPECT_EQ(4 * 1024, p.total_allocated_bytes()); - EXPECT_EQ((8 + 16) * 1024, p.GetTotalChunkSizes()); - EXPECT_EQ(1 * 1024, p2.total_allocated_bytes()); - EXPECT_EQ(4 * 1024, p2.GetTotalChunkSizes()); - - p.FreeAll(); - p2.FreeAll(); -} - -// Tests that we can return partial allocations. -TEST(ChunkedAllocatorTest, ReturnPartial) { - ChunkedAllocator p; - uint8_t* ptr = p.Allocate(1024); - EXPECT_EQ(1024, p.total_allocated_bytes()); - memset(ptr, 0, 1024); - p.ReturnPartialAllocation(1024); - - uint8_t* ptr2 = p.Allocate(1024); - EXPECT_EQ(1024, p.total_allocated_bytes()); - EXPECT_TRUE(ptr == ptr2); - p.ReturnPartialAllocation(1016); - - ptr2 = p.Allocate(1016); - EXPECT_EQ(1024, p.total_allocated_bytes()); - EXPECT_TRUE(ptr2 == ptr + 8); - p.ReturnPartialAllocation(512); - memset(ptr2, 1, 1016 - 512); - - uint8_t* ptr3 = p.Allocate(512); - EXPECT_EQ(1024, p.total_allocated_bytes()); - EXPECT_TRUE(ptr3 == ptr + 512); - memset(ptr3, 2, 512); - - for (int i = 0; i < 8; ++i) { - EXPECT_EQ(0, ptr[i]); - } - for (int i = 8; i < 512; ++i) { - EXPECT_EQ(1, ptr[i]); - } - for (int i = 512; i < 1024; ++i) { - EXPECT_EQ(2, ptr[i]); - } - - p.FreeAll(); -} - -// Test that the ChunkedAllocator overhead is bounded when we make allocations of -// INITIAL_CHUNK_SIZE. -TEST(ChunkedAllocatorTest, MemoryOverhead) { - ChunkedAllocator p; - const int alloc_size = ChunkedAllocatorTest::INITIAL_CHUNK_SIZE; - const int num_allocs = 1000; - int64_t total_allocated = 0; - - for (int i = 0; i < num_allocs; ++i) { - uint8_t* mem = p.Allocate(alloc_size); - ASSERT_TRUE(mem != nullptr); - total_allocated += alloc_size; - - int64_t wasted_memory = p.GetTotalChunkSizes() - total_allocated; - // The initial chunk fits evenly into MAX_CHUNK_SIZE, so should have at most - // one empty chunk at the end. - EXPECT_LE(wasted_memory, ChunkedAllocatorTest::MAX_CHUNK_SIZE); - // The chunk doubling algorithm should not allocate chunks larger than the total - // amount of memory already allocated. 
- EXPECT_LE(wasted_memory, total_allocated); - } - - p.FreeAll(); -} - -// Test that the ChunkedAllocator overhead is bounded when we make alternating -// large and small allocations. -TEST(ChunkedAllocatorTest, FragmentationOverhead) { - ChunkedAllocator p; - const int num_allocs = 100; - int64_t total_allocated = 0; - - for (int i = 0; i < num_allocs; ++i) { - int alloc_size = i % 2 == 0 ? 1 : ChunkedAllocatorTest::MAX_CHUNK_SIZE; - uint8_t* mem = p.Allocate(alloc_size); - ASSERT_TRUE(mem != nullptr); - total_allocated += alloc_size; - - int64_t wasted_memory = p.GetTotalChunkSizes() - total_allocated; - // Fragmentation should not waste more than half of each completed chunk. - EXPECT_LE(wasted_memory, total_allocated + ChunkedAllocatorTest::MAX_CHUNK_SIZE); - } - - p.FreeAll(); -} - TEST(TestBufferedInputStream, Basics) { int64_t source_size = 256; int64_t stream_offset = 10; @@ -315,9 +99,8 @@ TEST(TestBufferedInputStream, Basics) { TEST(TestArrowInputFile, ReadAt) { std::string data = "this is the data"; - auto data_buffer = reinterpret_cast<const uint8_t*>(data.c_str()); - auto file = std::make_shared<::arrow::io::BufferReader>(data_buffer, data.size()); + auto file = std::make_shared<::arrow::io::BufferReader>(data); auto source = std::make_shared<ArrowInputFile>(file); ASSERT_EQ(0, source->Tell()); @@ -335,7 +118,7 @@ TEST(TestArrowInputFile, Read) { std::string data = "this is the data"; auto data_buffer = reinterpret_cast<const uint8_t*>(data.c_str()); - auto file = std::make_shared<::arrow::io::BufferReader>(data_buffer, data.size()); + auto file = std::make_shared<::arrow::io::BufferReader>(data); auto source = std::make_shared<ArrowInputFile>(file); ASSERT_EQ(0, source->Tell()); diff --git a/cpp/src/parquet/util/memory.cc b/cpp/src/parquet/util/memory.cc index fde424aafe71d..6251f1c85c085 100644 --- a/cpp/src/parquet/util/memory.cc +++ b/cpp/src/parquet/util/memory.cc @@ -115,238 +115,6 @@ template class Vector; template class Vector; template class Vector; -const int ChunkedAllocator::INITIAL_CHUNK_SIZE; -const int ChunkedAllocator::MAX_CHUNK_SIZE; - -ChunkedAllocator::ChunkedAllocator(MemoryPool* pool) - : current_chunk_idx_(-1), - next_chunk_size_(INITIAL_CHUNK_SIZE), - total_allocated_bytes_(0), - peak_allocated_bytes_(0), - total_reserved_bytes_(0), - pool_(pool) {} - -ChunkedAllocator::ChunkInfo::ChunkInfo(int64_t size, uint8_t* buf) - : data(buf), size(size), allocated_bytes(0) {} - -ChunkedAllocator::~ChunkedAllocator() { - int64_t total_bytes_released = 0; - for (size_t i = 0; i < chunks_.size(); ++i) { - total_bytes_released += chunks_[i].size; - pool_->Free(chunks_[i].data, chunks_[i].size); - } - - DCHECK(chunks_.empty()) << "Must call FreeAll() or AcquireData() for this pool"; -} - -void ChunkedAllocator::ReturnPartialAllocation(int byte_size) { - DCHECK_GE(byte_size, 0); - DCHECK_NE(current_chunk_idx_, -1); - ChunkInfo& info = chunks_[current_chunk_idx_]; - DCHECK_GE(info.allocated_bytes, byte_size); - info.allocated_bytes -= byte_size; - total_allocated_bytes_ -= byte_size; -} - -template <bool CHECK_LIMIT_FIRST> -uint8_t* ChunkedAllocator::Allocate(int size) { - if (size == 0) { - return nullptr; - } - - int64_t num_bytes = ::arrow::BitUtil::RoundUp(size, 8); - if (current_chunk_idx_ == -1 || - num_bytes + chunks_[current_chunk_idx_].allocated_bytes > - chunks_[current_chunk_idx_].size) { - // If we couldn't allocate a new chunk, return nullptr.
- if (ARROW_PREDICT_FALSE(!FindChunk(num_bytes))) { - return nullptr; - } - } - ChunkInfo& info = chunks_[current_chunk_idx_]; - uint8_t* result = info.data + info.allocated_bytes; - DCHECK_LE(info.allocated_bytes + num_bytes, info.size); - info.allocated_bytes += num_bytes; - total_allocated_bytes_ += num_bytes; - DCHECK_LE(current_chunk_idx_, static_cast<int>(chunks_.size()) - 1); - peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_); - return result; -} - -uint8_t* ChunkedAllocator::Allocate(int size) { return Allocate<false>(size); } - -void ChunkedAllocator::Clear() { - current_chunk_idx_ = -1; - for (auto chunk = chunks_.begin(); chunk != chunks_.end(); ++chunk) { - chunk->allocated_bytes = 0; - } - total_allocated_bytes_ = 0; - DCHECK(CheckIntegrity(false)); -} - -void ChunkedAllocator::FreeAll() { - int64_t total_bytes_released = 0; - for (size_t i = 0; i < chunks_.size(); ++i) { - total_bytes_released += chunks_[i].size; - pool_->Free(chunks_[i].data, chunks_[i].size); - } - chunks_.clear(); - next_chunk_size_ = INITIAL_CHUNK_SIZE; - current_chunk_idx_ = -1; - total_allocated_bytes_ = 0; - total_reserved_bytes_ = 0; -} - -bool ChunkedAllocator::FindChunk(int64_t min_size) { - // Try to allocate from a free chunk. The first free chunk, if any, will be immediately - // after the current chunk. - int first_free_idx = current_chunk_idx_ + 1; - // (cast size() to signed int in order to avoid everything else being cast to - // unsigned long, in particular -1) - while (++current_chunk_idx_ < static_cast<int>(chunks_.size())) { - // we found a free chunk - DCHECK_EQ(chunks_[current_chunk_idx_].allocated_bytes, 0); - - if (chunks_[current_chunk_idx_].size >= min_size) { - // This chunk is big enough. Move it before the other free chunks. - if (current_chunk_idx_ != first_free_idx) { - std::swap(chunks_[current_chunk_idx_], chunks_[first_free_idx]); - current_chunk_idx_ = first_free_idx; - } - break; - } - } - - if (current_chunk_idx_ == static_cast<int>(chunks_.size())) { - // need to allocate new chunk. - int64_t chunk_size; - DCHECK_GE(next_chunk_size_, INITIAL_CHUNK_SIZE); - DCHECK_LE(next_chunk_size_, MAX_CHUNK_SIZE); - - chunk_size = std::max(min_size, next_chunk_size_); - - // Allocate a new chunk. Return early if malloc fails. - uint8_t* buf = nullptr; - PARQUET_THROW_NOT_OK(pool_->Allocate(chunk_size, &buf)); - if (ARROW_PREDICT_FALSE(buf == nullptr)) { - DCHECK_EQ(current_chunk_idx_, static_cast<int>(chunks_.size())); - current_chunk_idx_ = static_cast<int>(chunks_.size()) - 1; - return false; - } - - // If there are no free chunks put it at the end, otherwise before the first free. - if (first_free_idx == static_cast<int>(chunks_.size())) { - chunks_.push_back(ChunkInfo(chunk_size, buf)); - } else { - current_chunk_idx_ = first_free_idx; - auto insert_chunk = chunks_.begin() + current_chunk_idx_; - chunks_.insert(insert_chunk, ChunkInfo(chunk_size, buf)); - } - total_reserved_bytes_ += chunk_size; - // Don't increment the chunk size until the allocation succeeds: if an attempted - // large allocation fails we don't want to increase the chunk size further.
- next_chunk_size_ = - static_cast<int>(std::min<int64_t>(chunk_size * 2, MAX_CHUNK_SIZE)); - } - - DCHECK_LT(current_chunk_idx_, static_cast<int>(chunks_.size())); - DCHECK(CheckIntegrity(true)); - return true; -} - -void ChunkedAllocator::AcquireData(ChunkedAllocator* src, bool keep_current) { - DCHECK(src->CheckIntegrity(false)); - int num_acquired_chunks; - if (keep_current) { - num_acquired_chunks = src->current_chunk_idx_; - } else if (src->GetFreeOffset() == 0) { - // nothing in the last chunk - num_acquired_chunks = src->current_chunk_idx_; - } else { - num_acquired_chunks = src->current_chunk_idx_ + 1; - } - - if (num_acquired_chunks <= 0) { - if (!keep_current) src->FreeAll(); - return; - } - - auto end_chunk = src->chunks_.begin() + num_acquired_chunks; - int64_t total_transfered_bytes = 0; - for (auto i = src->chunks_.begin(); i != end_chunk; ++i) { - total_transfered_bytes += i->size; - } - src->total_reserved_bytes_ -= total_transfered_bytes; - total_reserved_bytes_ += total_transfered_bytes; - - // insert new chunks after current_chunk_idx_ - auto insert_chunk = chunks_.begin() + (current_chunk_idx_ + 1); - chunks_.insert(insert_chunk, src->chunks_.begin(), end_chunk); - src->chunks_.erase(src->chunks_.begin(), end_chunk); - current_chunk_idx_ += num_acquired_chunks; - - if (keep_current) { - src->current_chunk_idx_ = 0; - DCHECK(src->chunks_.size() == 1 || src->chunks_[1].allocated_bytes == 0); - total_allocated_bytes_ += src->total_allocated_bytes_ - src->GetFreeOffset(); - src->total_allocated_bytes_ = src->GetFreeOffset(); - } else { - src->current_chunk_idx_ = -1; - total_allocated_bytes_ += src->total_allocated_bytes_; - src->total_allocated_bytes_ = 0; - } - peak_allocated_bytes_ = std::max(total_allocated_bytes_, peak_allocated_bytes_); - - if (!keep_current) src->FreeAll(); - DCHECK(CheckIntegrity(false)); -} - -std::string ChunkedAllocator::DebugString() { - std::stringstream out; - char str[16]; - out << "ChunkedAllocator(#chunks=" << chunks_.size() << " ["; - for (size_t i = 0; i < chunks_.size(); ++i) { - sprintf(str, "0x%zx=", reinterpret_cast<size_t>(chunks_[i].data)); // NOLINT - out << (i > 0 ? 
" " : "") << str << chunks_[i].size << "/" - << chunks_[i].allocated_bytes; - } - out << "] current_chunk=" << current_chunk_idx_ - << " total_sizes=" << GetTotalChunkSizes() - << " total_alloc=" << total_allocated_bytes_ << ")"; - return out.str(); -} - -int64_t ChunkedAllocator::GetTotalChunkSizes() const { - int64_t result = 0; - for (size_t i = 0; i < chunks_.size(); ++i) { - result += chunks_[i].size; - } - return result; -} - -bool ChunkedAllocator::CheckIntegrity(bool current_chunk_empty) { - // check that current_chunk_idx_ points to the last chunk with allocated data - DCHECK_LT(current_chunk_idx_, static_cast(chunks_.size())); - int64_t total_allocated = 0; - for (int i = 0; i < static_cast(chunks_.size()); ++i) { - DCHECK_GT(chunks_[i].size, 0); - if (i < current_chunk_idx_) { - DCHECK_GT(chunks_[i].allocated_bytes, 0); - } else if (i == current_chunk_idx_) { - if (current_chunk_empty) { - DCHECK_EQ(chunks_[i].allocated_bytes, 0); - } else { - DCHECK_GT(chunks_[i].allocated_bytes, 0); - } - } else { - DCHECK_EQ(chunks_[i].allocated_bytes, 0); - } - total_allocated += chunks_[i].allocated_bytes; - } - DCHECK_EQ(total_allocated, total_allocated_bytes_); - return true; -} - // ---------------------------------------------------------------------- // Arrow IO wrappers diff --git a/cpp/src/parquet/util/memory.h b/cpp/src/parquet/util/memory.h index cccafe8cb38bb..8677e6b9dacbc 100644 --- a/cpp/src/parquet/util/memory.h +++ b/cpp/src/parquet/util/memory.h @@ -77,149 +77,6 @@ class PARQUET_EXPORT Vector { PARQUET_DISALLOW_COPY_AND_ASSIGN(Vector); }; -/// A ChunkedAllocator maintains a list of memory chunks from which it -/// allocates memory in response to Allocate() calls; Chunks stay around for -/// the lifetime of the allocator or until they are passed on to another -/// allocator. -// -/// An Allocate() call will attempt to allocate memory from the chunk that was most -/// recently added; if that chunk doesn't have enough memory to -/// satisfy the allocation request, the free chunks are searched for one that is -/// big enough otherwise a new chunk is added to the list. -/// The current_chunk_idx_ always points to the last chunk with allocated memory. -/// In order to keep allocation overhead low, chunk sizes double with each new one -/// added, until they hit a maximum size. -// -/// Example: -/// ChunkedAllocator* p = new ChunkedAllocator(); -/// for (int i = 0; i < 1024; ++i) { -/// returns 8-byte aligned memory (effectively 24 bytes): -/// .. = p->Allocate(17); -/// } -/// at this point, 17K have been handed out in response to Allocate() calls and -/// 28K of chunks have been allocated (chunk sizes: 4K, 8K, 16K) -/// We track total and peak allocated bytes. At this point they would be the same: -/// 28k bytes. A call to Clear will return the allocated memory so -/// total_allocate_bytes_ -/// becomes 0 while peak_allocate_bytes_ remains at 28k. -/// p->Clear(); -/// the entire 1st chunk is returned: -/// .. = p->Allocate(4 * 1024); -/// 4K of the 2nd chunk are returned: -/// .. = p->Allocate(4 * 1024); -/// a new 20K chunk is created -/// .. = p->Allocate(20 * 1024); -// -/// ChunkedAllocator* p2 = new ChunkedAllocator(); -/// the new ChunkedAllocator receives all chunks containing data from p -/// p2->AcquireData(p, false); -/// At this point p.total_allocated_bytes_ would be 0 while p.peak_allocated_bytes_ -/// remains unchanged. 
-/// The one remaining (empty) chunk is released: -/// delete p; - -class PARQUET_EXPORT ChunkedAllocator { - public: - explicit ChunkedAllocator(::arrow::MemoryPool* pool = ::arrow::default_memory_pool()); - - /// Frees all chunks of memory and subtracts the total allocated bytes - /// from the registered limits. - ~ChunkedAllocator(); - - /// Allocates 8-byte aligned section of memory of 'size' bytes at the end - /// of the the current chunk. Creates a new chunk if there aren't any chunks - /// with enough capacity. - uint8_t* Allocate(int size); - - /// Returns 'byte_size' to the current chunk back to the mem pool. This can - /// only be used to return either all or part of the previous allocation returned - /// by Allocate(). - void ReturnPartialAllocation(int byte_size); - - /// Makes all allocated chunks available for re-use, but doesn't delete any chunks. - void Clear(); - - /// Deletes all allocated chunks. FreeAll() or AcquireData() must be called for - /// each mem pool - void FreeAll(); - - /// Absorb all chunks that hold data from src. If keep_current is true, let src hold on - /// to its last allocated chunk that contains data. - /// All offsets handed out by calls to GetCurrentOffset() for 'src' become invalid. - void AcquireData(ChunkedAllocator* src, bool keep_current); - - std::string DebugString(); - - int64_t total_allocated_bytes() const { return total_allocated_bytes_; } - int64_t peak_allocated_bytes() const { return peak_allocated_bytes_; } - int64_t total_reserved_bytes() const { return total_reserved_bytes_; } - - /// Return sum of chunk_sizes_. - int64_t GetTotalChunkSizes() const; - - private: - friend class ChunkedAllocatorTest; - static const int INITIAL_CHUNK_SIZE = 4 * 1024; - - /// The maximum size of chunk that should be allocated. Allocations larger than this - /// size will get their own individual chunk. - static const int MAX_CHUNK_SIZE = 1024 * 1024; - - struct ChunkInfo { - uint8_t* data; // Owned by the ChunkInfo. - int64_t size; // in bytes - - /// bytes allocated via Allocate() in this chunk - int64_t allocated_bytes; - - explicit ChunkInfo(int64_t size, uint8_t* buf); - - ChunkInfo() : data(NULLPTR), size(0), allocated_bytes(0) {} - }; - - /// chunk from which we served the last Allocate() call; - /// always points to the last chunk that contains allocated data; - /// chunks 0..current_chunk_idx_ are guaranteed to contain data - /// (chunks_[i].allocated_bytes > 0 for i: 0..current_chunk_idx_); - /// -1 if no chunks present - int current_chunk_idx_; - - /// The size of the next chunk to allocate. - int64_t next_chunk_size_; - - /// sum of allocated_bytes_ - int64_t total_allocated_bytes_; - - /// Maximum number of bytes allocated from this pool at one time. - int64_t peak_allocated_bytes_; - - /// sum of all bytes allocated in chunks_ - int64_t total_reserved_bytes_; - - std::vector<ChunkInfo> chunks_; - - ::arrow::MemoryPool* pool_; - - /// Find or allocated a chunk with at least min_size spare capacity and update - /// current_chunk_idx_. Also updates chunks_, chunk_sizes_ and allocated_bytes_ - /// if a new chunk needs to be created. - bool FindChunk(int64_t min_size); - - /// Check integrity of the supporting data structures; always returns true but DCHECKs - /// all invariants. - /// If 'current_chunk_empty' is false, checks that the current chunk contains data. - bool CheckIntegrity(bool current_chunk_empty); - - /// Return offset to unoccpied space in current chunk.
- int GetFreeOffset() const { - if (current_chunk_idx_ == -1) return 0; - return static_cast<int>(chunks_[current_chunk_idx_].allocated_bytes); - } - - template <bool CHECK_LIMIT_FIRST> - uint8_t* Allocate(int size); -}; - // File input and output interfaces that translate arrow::Status to exceptions class PARQUET_EXPORT FileInterface {
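With ChunkedAllocator deleted, the call sites touched by this patch allocate straight from an ::arrow::MemoryPool, whose Allocate/Free contract is Status-based and requires the caller to hand the original size back. A minimal sketch of that replacement pattern (RoundTripAllocation is a hypothetical helper; error handling is collapsed to a bool for brevity):

#include <cstdint>
#include "arrow/memory_pool.h"

bool RoundTripAllocation(int64_t size) {
  ::arrow::MemoryPool* pool = ::arrow::default_memory_pool();
  uint8_t* data = nullptr;
  if (!pool->Allocate(size, &data).ok()) {
    return false;            // allocation failed; pool state unchanged
  }
  // ... use data[0..size) ...
  pool->Free(data, size);    // the pool needs the original size back
  return true;
}

Unlike the chunked scheme, nothing is retained between calls; reuse and growth policy become the pool implementation's concern rather than the encoder's.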
diff --git a/cpp/src/parquet/util/test-common.h b/cpp/src/parquet/util/test-common.h index 4e95870025cda..cb4eb43be2a80 100644 --- a/cpp/src/parquet/util/test-common.h +++ b/cpp/src/parquet/util/test-common.h @@ -91,40 +91,33 @@ static vector<T> slice(const vector<T>& values, int start, int end) { } static inline vector<bool> flip_coins_seed(int n, double p, uint32_t seed) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::bernoulli_distribution d(p); - vector<bool> draws; + vector<bool> draws(n); for (int i = 0; i < n; ++i) { - draws.push_back(d(gen)); + draws[i] = d(gen); } return draws; } static inline vector<bool> flip_coins(int n, double p) { uint64_t seed = std::chrono::high_resolution_clock::now().time_since_epoch().count(); - std::mt19937 gen(static_cast<uint32_t>(seed)); - - std::bernoulli_distribution d(p); - - vector<bool> draws; - for (int i = 0; i < n; ++i) { - draws.push_back(d(gen)); - } - return draws; + return flip_coins_seed(n, p, static_cast<uint32_t>(seed)); } void random_bytes(int n, uint32_t seed, std::vector<uint8_t>* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution<int> d(0, 255); + out->resize(n); for (int i = 0; i < n; ++i) { - out->push_back(static_cast<uint8_t>(d(gen) & 0xFF)); + (*out)[i] = static_cast<uint8_t>(d(gen)); } } void random_bools(int n, double p, uint32_t seed, bool* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::bernoulli_distribution d(p); for (int i = 0; i < n; ++i) { out[i] = d(gen); @@ -133,7 +126,7 @@ void random_bools(int n, double p, uint32_t seed, bool* out) { template <typename T> void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution<T> d(min_value, max_value); for (int i = 0; i < n; ++i) { out[i] = d(gen); @@ -142,7 +135,7 @@ void random_numbers(int n, uint32_t seed, T min_value, T max_value, T* out) { template <> void random_numbers(int n, uint32_t seed, float min_value, float max_value, float* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_real_distribution<float> d(min_value, max_value); for (int i = 0; i < n; ++i) { out[i] = d(gen); @@ -152,7 +145,7 @@ void random_numbers(int n, uint32_t seed, float min_value, float max_value, floa template <> void random_numbers(int n, uint32_t seed, double min_value, double max_value, double* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_real_distribution<double> d(min_value, max_value); for (int i = 0; i < n; ++i) { out[i] = d(gen); @@ -161,7 +154,7 @@ void random_numbers(int n, uint32_t seed, double min_value, double max_value, void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_value, Int96* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution<int32_t> d(min_value, max_value); for (int i = 0; i < n; ++i) { out[i].value[0] = d(gen); @@ -171,12 +164,12 @@ void random_Int96_numbers(int n, uint32_t seed, int32_t min_value, int32_t max_v } void random_fixed_byte_array(int n, uint32_t seed, uint8_t* buf, int len, FLBA* out) { - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution<int> d(0, 255); for (int i = 0; i < n; ++i) { out[i].ptr = buf; for (int j = 0; j < len; ++j) { - buf[j] = static_cast<uint8_t>(d(gen) & 0xFF); + buf[j] = static_cast<uint8_t>(d(gen)); } buf += len; } @@ -184,7 +177,7 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m - std::mt19937 gen(seed); + std::default_random_engine gen(seed); std::uniform_int_distribution<int> d1(min_size, max_size); std::uniform_int_distribution<int> d2(0, 255); for (int i = 0; i < n; ++i) { @@ -192,7 +185,7 @@ void random_byte_array(int n, uint32_t seed, uint8_t* buf, ByteArray* out, int m out[i].len = len; out[i].ptr = buf; for (int j = 0; j < len; ++j) { - buf[j] = static_cast<uint8_t>(d2(gen) & 0xFF); + buf[j] = static_cast<uint8_t>(d2(gen)); } buf += len; }
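All of these helpers swap std::mt19937 for std::default_random_engine, trading a specified engine for whatever the standard library picks; for test-data generation that is presumably acceptable, and the engine is cheaper to construct on every call. Their shared shape, as a sketch (FlipCoinsSketch is a stand-in name):

#include <random>
#include <vector>

std::vector<bool> FlipCoinsSketch(int n, double p, uint32_t seed) {
  std::default_random_engine gen(seed);  // implementation-defined engine
  std::bernoulli_distribution d(p);      // true with probability p
  std::vector<bool> draws(n);            // pre-sized, filled by index
  for (int i = 0; i < n; ++i) {
    draws[i] = d(gen);
  }
  return draws;
}

The same seed reproduces the same sequence on a given standard library, which is all the deterministic tests here rely on.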
diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt index f9ed4e3d4e3f5..0f8916e6c48aa 100644 --- a/cpp/src/plasma/CMakeLists.txt +++ b/cpp/src/plasma/CMakeLists.txt @@ -83,10 +83,10 @@ set(PLASMA_SRCS set(PLASMA_LINK_LIBS arrow_shared) set(PLASMA_STATIC_LINK_LIBS arrow_static) -if (ARROW_GPU) - set(PLASMA_LINK_LIBS ${PLASMA_LINK_LIBS} arrow_gpu_shared) - set(PLASMA_STATIC_LINK_LIBS arrow_gpu_static ${PLASMA_STATIC_LINK_LIBS}) - add_definitions(-DPLASMA_GPU) +if (ARROW_CUDA) + set(PLASMA_LINK_LIBS ${PLASMA_LINK_LIBS} arrow_cuda_shared) + set(PLASMA_STATIC_LINK_LIBS arrow_cuda_static ${PLASMA_STATIC_LINK_LIBS}) + add_definitions(-DPLASMA_CUDA) endif() ADD_ARROW_LIB(plasma diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc index 0c96be060e1c1..99cf00cab80fd 100644 --- a/cpp/src/plasma/client.cc +++ b/cpp/src/plasma/client.cc @@ -53,13 +53,13 @@ #include "plasma/plasma.h" #include "plasma/protocol.h" -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" -using arrow::gpu::CudaBuffer; -using arrow::gpu::CudaBufferWriter; -using arrow::gpu::CudaContext; -using arrow::gpu::CudaDeviceManager; +using arrow::cuda::CudaBuffer; +using arrow::cuda::CudaBufferWriter; +using arrow::cuda::CudaContext; +using arrow::cuda::CudaDeviceManager; #endif #define XXH_INLINE_ALL 1 @@ -89,7 +89,7 @@ constexpr int64_t kL3CacheSizeBytes = 100000000; // ---------------------------------------------------------------------- // GPU support -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA struct GpuProcessHandle { /// Pointer to CUDA buffer that is backing this GPU object. std::shared_ptr<CudaBuffer> ptr; @@ -202,6 +202,9 @@ class PlasmaClient::Impl : public std::enable_shared_from_this<PlasmaClient::Impl> std::unordered_set<ObjectID> deletion_cache_; -#ifdef PLASMA_GPU /// Cuda Device Manager. - arrow::gpu::CudaDeviceManager* manager_; + arrow::cuda::CudaDeviceManager* manager_; #endif }; PlasmaBuffer::~PlasmaBuffer() { ARROW_UNUSED(client_->Release(object_id_)); } PlasmaClient::Impl::Impl() { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA DCHECK_OK(CudaDeviceManager::GetInstance(&manager_)); #endif } @@ -410,7 +413,7 @@ Status PlasmaClient::Impl::Create(const ObjectID& object_id, int64_t data_size, memcpy((*data)->mutable_data() + object.data_size, metadata, metadata_size); } } else { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA std::lock_guard<std::mutex> lock(gpu_mutex); std::shared_ptr<CudaContext> context; RETURN_NOT_OK(manager_->GetContext(device_num - 1, &context)); @@ -494,7 +497,7 @@ Status PlasmaClient::Impl::GetBuffers( physical_buf = std::make_shared<Buffer>( data + object->data_offset, object->data_size + object->metadata_size); } else { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA physical_buf = gpu_object_map.find(object_ids[i])->second->ptr; #else ARROW_LOG(FATAL) << "Arrow GPU library is not enabled."; @@ -557,7 +560,7 @@ Status PlasmaClient::Impl::GetBuffers( physical_buf = std::make_shared<Buffer>( data + object->data_offset, object->data_size + object->metadata_size); } else { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA std::lock_guard<std::mutex> lock(gpu_mutex); auto handle = gpu_object_map.find(object_ids[i]); if (handle == gpu_object_map.end()) { @@ -943,13 +946,10 @@ Status PlasmaClient::Impl::Subscribe(int* fd) { return Status::OK(); } -Status PlasmaClient::Impl::GetNotification(int fd, ObjectID* object_id, - int64_t* data_size, int64_t* metadata_size) { - auto notification = ReadMessageAsync(fd); - if (notification == NULL) { - return Status::IOError("Failed to read object notification from Plasma socket"); - } - auto object_info = flatbuffers::GetRoot<fb::ObjectInfo>(notification.get()); +Status PlasmaClient::Impl::DecodeNotification(const uint8_t* buffer, ObjectID* object_id, + int64_t* data_size, + int64_t* metadata_size) { + auto object_info = flatbuffers::GetRoot<fb::ObjectInfo>(buffer); ARROW_CHECK(object_info->object_id()->size() == sizeof(ObjectID)); memcpy(object_id, object_info->object_id()->data(), sizeof(ObjectID)); if (object_info->is_deletion()) { @@ -962,6 +962,15 @@ Status PlasmaClient::Impl::GetNotification(int fd, ObjectID* object_id, return Status::OK(); } +Status PlasmaClient::Impl::GetNotification(int fd, ObjectID* object_id, + int64_t* data_size, int64_t* metadata_size) { + auto notification = ReadMessageAsync(fd); + if (notification == NULL) { + return Status::IOError("Failed to read object notification from Plasma socket"); + } + return DecodeNotification(notification.get(), object_id, data_size, metadata_size); +} + Status PlasmaClient::Impl::Connect(const std::string& store_socket_name, const std::string& manager_socket_name, int release_delay, int num_retries) { @@ -1138,6 +1147,11 @@ Status PlasmaClient::GetNotification(int fd, ObjectID* object_id, int64_t* data_ return impl_->GetNotification(fd, object_id, data_size, metadata_size); } +Status PlasmaClient::DecodeNotification(const uint8_t* buffer, ObjectID* object_id, + int64_t* data_size, int64_t* metadata_size) { + return impl_->DecodeNotification(buffer, object_id, data_size, metadata_size); +} + Status PlasmaClient::Disconnect() { return impl_->Disconnect(); } Status PlasmaClient::Fetch(int num_object_ids, const ObjectID* object_ids) { diff --git a/cpp/src/plasma/client.h b/cpp/src/plasma/client.h index 1ad09f5c06738..9e080b7760dc8 100644 --- a/cpp/src/plasma/client.h +++ b/cpp/src/plasma/client.h @@ -246,6 +246,9 @@ class ARROW_EXPORT PlasmaClient { Status GetNotification(int 
fd, ObjectID* object_id, int64_t* data_size, int64_t* metadata_size); + Status DecodeNotification(const uint8_t* buffer, ObjectID* object_id, + int64_t* data_size, int64_t* metadata_size); + /// Disconnect from the local plasma instance, including the local store and /// manager. /// diff --git a/cpp/src/plasma/common.h b/cpp/src/plasma/common.h index f7cdaf5ff51df..7090428ff41c9 100644 --- a/cpp/src/plasma/common.h +++ b/cpp/src/plasma/common.h @@ -34,7 +34,7 @@ #include "arrow/status.h" #include "arrow/util/logging.h" -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" #endif @@ -118,9 +118,9 @@ struct ObjectTableEntry { int64_t data_size; /// Size of the object metadata in bytes. int64_t metadata_size; -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA /// IPC GPU handle to share with clients. - std::shared_ptr<::arrow::gpu::CudaIpcMemHandle> ipc_handle; + std::shared_ptr<::arrow::cuda::CudaIpcMemHandle> ipc_handle; #endif /// Number of clients currently using this object. int ref_count; diff --git a/cpp/src/plasma/fling.cc b/cpp/src/plasma/fling.cc index 26afd87066c2b..f0960aab6bf23 100644 --- a/cpp/src/plasma/fling.cc +++ b/cpp/src/plasma/fling.cc @@ -16,6 +16,8 @@ #include +#include "arrow/util/logging.h" + void init_msg(struct msghdr* msg, struct iovec* iov, char* buf, size_t buf_len) { iov->iov_base = buf; iov->iov_len = 1; @@ -46,11 +48,32 @@ int send_fd(int conn, int fd) { memcpy(CMSG_DATA(header), reinterpret_cast<void*>(&fd), sizeof(int)); // Send file descriptor. - ssize_t r = sendmsg(conn, &msg, 0); - if (r >= 0) { - return 0; - } else { - return static_cast<int>(r); + while (true) { + ssize_t r = sendmsg(conn, &msg, 0); + if (r < 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + continue; + } else if (errno == EMSGSIZE) { + ARROW_LOG(WARNING) << "Failed to send file descriptor" + << " (errno = EMSGSIZE), retrying."; + // If we failed to send the file descriptor, loop until we have sent it + // successfully. TODO(rkn): This is problematic for two reasons. First + // of all, sending the file descriptor should just succeed without any + // errors, but sometimes I see a "Message too long" error number. + // Second, looping like this allows a client to potentially block the + // plasma store event loop which should never happen. + continue; + } else { + ARROW_LOG(INFO) << "Error in send_fd (errno = " << errno << ")"; + return static_cast<int>(r); + } + } else if (r == 0) { + ARROW_LOG(INFO) << "Encountered unexpected EOF"; + return 0; + } else { + ARROW_CHECK(r > 0); + return static_cast<int>(r); + } } } @@ -60,7 +83,19 @@ int recv_fd(int conn) { char buf[CMSG_SPACE(sizeof(int))]; init_msg(&msg, &iov, buf, sizeof(buf)); - if (recvmsg(conn, &msg, 0) == -1) return -1; + while (true) { + ssize_t r = recvmsg(conn, &msg, 0); + if (r == -1) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + continue; + } else { + ARROW_LOG(INFO) << "Error in recv_fd (errno = " << errno << ")"; + return -1; + } + } else { + break; + } + } int found_fd = -1; int oh_noes = 0;
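Both loops added to fling.cc follow the classic restart-on-transient-errno idiom: EINTR, EAGAIN and EWOULDBLOCK mean "try again", anything else is surfaced to the caller. Extracted into a generic shape (SendMsgRetry is a stand-in name; the real send_fd additionally special-cases EMSGSIZE as shown above):

#include <sys/socket.h>
#include <cerrno>

ssize_t SendMsgRetry(int fd, const struct msghdr* msg) {
  while (true) {
    ssize_t r = sendmsg(fd, msg, 0);
    if (r >= 0) {
      return r;             // sent, or zero: done
    }
    if (errno == EINTR || errno == EAGAIN || errno == EWOULDBLOCK) {
      continue;             // transient condition: restart the call
    }
    return r;               // hard error: caller inspects errno
  }
}

Note that looping on EAGAIN turns a non-blocking socket into a busy-wait, which is the same blocking risk the retained TODO comment calls out.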
diff --git a/cpp/src/plasma/plasma.h b/cpp/src/plasma/plasma.h index e63d967676053..83caec7ee4958 100644 --- a/cpp/src/plasma/plasma.h +++ b/cpp/src/plasma/plasma.h @@ -40,8 +40,8 @@ #include "plasma/common.h" #include "plasma/common_generated.h" -#ifdef PLASMA_GPU -using arrow::gpu::CudaIpcMemHandle; +#ifdef PLASMA_CUDA +using arrow::cuda::CudaIpcMemHandle; #endif namespace plasma { @@ -73,7 +73,7 @@ typedef std::unordered_map<ObjectID, ObjectRequest> ObjectRequestMap; // TODO(pcm): Replace this by the flatbuffers message PlasmaObjectSpec. struct PlasmaObject { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA // IPC handle for Cuda. std::shared_ptr<CudaIpcMemHandle> ipc_handle; #endif diff --git a/cpp/src/plasma/protocol.cc b/cpp/src/plasma/protocol.cc index a74db66fded8f..c437840874538 100644 --- a/cpp/src/plasma/protocol.cc +++ b/cpp/src/plasma/protocol.cc @@ -25,7 +25,7 @@ #include "plasma/common.h" #include "plasma/io.h" -#ifdef ARROW_GPU +#ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" #endif @@ -129,7 +129,7 @@ Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, object->metadata_offset, object->metadata_size, object->device_num); auto object_string = fbb.CreateString(object_id.binary()); -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA flatbuffers::Offset<flatbuffers::String> ipc_handle; if (object->device_num != 0) { std::shared_ptr<arrow::Buffer> handle; @@ -145,7 +145,7 @@ Status SendCreateReply(int sock, ObjectID object_id, PlasmaObject* object, crb.add_store_fd(object->store_fd); crb.add_mmap_size(mmap_size); if (object->device_num != 0) { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA crb.add_ipc_handle(ipc_handle); #else ARROW_LOG(FATAL) << "This should be unreachable."; @@ -171,7 +171,7 @@ Status ReadCreateReply(uint8_t* data, size_t size, ObjectID* object_id, *mmap_size = message->mmap_size(); object->device_num = message->plasma_object()->device_num(); -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA if (object->device_num != 0) { RETURN_NOT_OK(CudaIpcMemHandle::FromBuffer(message->ipc_handle()->handle()->data(), &object->ipc_handle)); @@ -588,7 +588,7 @@ Status SendGetReply(int sock, ObjectID object_ids[], objects.push_back(PlasmaObjectSpec(object.store_fd, object.data_offset, object.data_size, object.metadata_offset, object.metadata_size, object.device_num)); -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA if (object.device_num != 0) { std::shared_ptr<arrow::Buffer> handle; RETURN_NOT_OK(object.ipc_handle->Serialize(arrow::default_memory_pool(), &handle)); @@ -609,7 +609,7 @@ Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], std::vector<int>& store_fds, std::vector<int64_t>& mmap_sizes) { DCHECK(data); auto message = flatbuffers::GetRoot<fb::PlasmaGetReply>(data); -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA int handle_pos = 0; #endif DCHECK(VerifyFlatbuffer(message, data, size)); @@ -624,7 +624,7 @@ Status ReadGetReply(uint8_t* data, size_t size, ObjectID object_ids[], 
plasma_objects[i].metadata_offset = object->metadata_offset(); plasma_objects[i].metadata_size = object->metadata_size(); plasma_objects[i].device_num = object->device_num(); -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA if (object->device_num() != 0) { const void* ipc_handle = message->handles()->Get(handle_pos)->handle()->data(); RETURN_NOT_OK( diff --git a/cpp/src/plasma/store.cc b/cpp/src/plasma/store.cc index 28624d0bc16bf..ae658d757c185 100644 --- a/cpp/src/plasma/store.cc +++ b/cpp/src/plasma/store.cc @@ -58,12 +58,12 @@ #include "plasma/io.h" #include "plasma/malloc.h" -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA #include "arrow/gpu/cuda_api.h" -using arrow::gpu::CudaBuffer; -using arrow::gpu::CudaContext; -using arrow::gpu::CudaDeviceManager; +using arrow::cuda::CudaBuffer; +using arrow::cuda::CudaContext; +using arrow::cuda::CudaDeviceManager; #endif using arrow::util::ArrowLog; @@ -117,7 +117,7 @@ PlasmaStore::PlasmaStore(EventLoop* loop, int64_t system_memory, std::string dir store_info_.memory_capacity = system_memory; store_info_.directory = directory; store_info_.hugepages_enabled = hugepages_enabled; -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA DCHECK_OK(CudaDeviceManager::GetInstance(&manager_)); #endif } @@ -162,7 +162,7 @@ PlasmaError PlasmaStore::CreateObject(const ObjectID& object_id, int64_t data_si } // Try to evict objects until there is enough space. uint8_t* pointer = nullptr; -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA std::shared_ptr<CudaBuffer> gpu_handle; std::shared_ptr<CudaContext> context_; if (device_num != 0) { @@ -195,7 +195,7 @@ PlasmaError PlasmaStore::CreateObject(const ObjectID& object_id, int64_t data_si break; } } else { -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA DCHECK_OK(context_->Allocate(data_size + metadata_size, &gpu_handle)); break; #endif @@ -220,7 +220,7 @@ PlasmaError PlasmaStore::CreateObject(const ObjectID& object_id, int64_t data_si entry->device_num = device_num; entry->create_time = std::time(nullptr); entry->construct_duration = -1; -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA if (device_num != 0) { DCHECK_OK(gpu_handle->ExportForIpc(&entry->ipc_handle)); result->ipc_handle = entry->ipc_handle; @@ -246,7 +246,7 @@ void PlasmaObject_init(PlasmaObject* object, ObjectTableEntry* entry) { DCHECK(object != nullptr); DCHECK(entry != nullptr); DCHECK(entry->state == ObjectState::PLASMA_SEALED); -#ifdef PLASMA_GPU +#ifdef PLASMA_CUDA if (entry->device_num != 0) { object->ipc_handle = entry->ipc_handle; } @@ -327,22 +327,7 @@ void PlasmaStore::ReturnFromGet(GetRequest* get_req) { if (s.ok()) { // Send all of the file descriptors for the present objects. for (int store_fd : store_fds) { - int error_code = send_fd(get_req->client->fd, store_fd); - // If we failed to send the file descriptor, loop until we have sent it - // successfully. TODO(rkn): This is problematic for two reasons. First - // of all, sending the file descriptor should just succeed without any - // errors, but sometimes I see a "Message too long" error number. - // Second, looping like this allows a client to potentially block the - // plasma store event loop which should never happen.
- while (error_code < 0) { - if (errno == EMSGSIZE) { - ARROW_LOG(WARNING) << "Failed to send file descriptor, retrying."; - error_code = send_fd(get_req->client->fd, store_fd); - continue; - } - WarnIfSigpipe(error_code, get_req->client->fd); - break; - } + WarnIfSigpipe(send_fd(get_req->client->fd, store_fd), get_req->client->fd); } } diff --git a/cpp/src/plasma/store.h b/cpp/src/plasma/store.h index 44fdf603f7f44..8d3facd733f1c 100644 --- a/cpp/src/plasma/store.h +++ b/cpp/src/plasma/store.h @@ -223,8 +223,8 @@ class PlasmaStore { std::unordered_map<int, std::unique_ptr<Client>> connected_clients_; std::unordered_set<ObjectID> deletion_cache_; -#ifdef PLASMA_GPU - arrow::gpu::CudaDeviceManager* manager_; +#ifdef PLASMA_CUDA + arrow::cuda::CudaDeviceManager* manager_; #endif }; diff --git a/cpp/src/plasma/test/client_tests.cc b/cpp/src/plasma/test/client_tests.cc index 1ad60396af9ac..f820303aba42b 100644 --- a/cpp/src/plasma/test/client_tests.cc +++ b/cpp/src/plasma/test/client_tests.cc @@ -487,10 +487,10 @@ TEST_F(TestPlasmaStore, ManyObjectTest) { } } -#ifdef PLASMA_GPU -using arrow::gpu::CudaBuffer; -using arrow::gpu::CudaBufferReader; -using arrow::gpu::CudaBufferWriter; +#ifdef PLASMA_CUDA +using arrow::cuda::CudaBuffer; +using arrow::cuda::CudaBufferReader; +using arrow::cuda::CudaBufferWriter; namespace { @@ -590,7 +590,7 @@ TEST_F(TestPlasmaStore, MultipleClientGPUTest) { AssertCudaRead(object_buffers[0].metadata, {5}); } -#endif // PLASMA_GPU +#endif // PLASMA_CUDA } // namespace plasma diff --git a/dev/gen_apidocs/create_documents.sh b/dev/gen_apidocs/create_documents.sh index 6a3b06578829a..ee8f8c864d225 100755 --- a/dev/gen_apidocs/create_documents.sh +++ b/dev/gen_apidocs/create_documents.sh @@ -87,15 +87,6 @@ rsync -r doc/parquet-glib/html/ ../../site/asf-site/docs/c_glib/parquet-glib popd popd -# Now Python documentation can be built -pushd arrow/python -python setup.py build_ext --build-type=$ARROW_BUILD_TYPE \ - --with-plasma --with-parquet --inplace -python setup.py build_sphinx -s doc/source -mkdir -p ../site/asf-site/docs/python -rsync -r doc/_build/html/ ../site/asf-site/docs/python -popd - # Make C++ documentation pushd arrow/cpp/apidoc rm -rf html/* diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 13918d55fca87..0baf29edd83e4 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -48,6 +48,7 @@ dev/tasks/linux-packages/debian.ubuntu-trusty/compat dev/tasks/linux-packages/debian.ubuntu-trusty/control dev/tasks/linux-packages/debian.ubuntu-trusty/gir1.2-arrow-1.0.install dev/tasks/linux-packages/debian.ubuntu-trusty/gir1.2-parquet-1.0.install +dev/tasks/linux-packages/debian.ubuntu-trusty/gir1.2-plasma-1.0.install dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-dev.install dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib-dev.install dev/tasks/linux-packages/debian.ubuntu-trusty/libarrow-glib-doc.doc-base @@ -63,25 +64,34 @@ dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib-doc.install dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib-doc.links dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet-glib12.install dev/tasks/linux-packages/debian.ubuntu-trusty/libparquet12.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-dev.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-dev.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.doc-base +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.install
+dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.links +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install +dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install dev/tasks/linux-packages/debian.ubuntu-trusty/patches/series +dev/tasks/linux-packages/debian.ubuntu-trusty/plasma-store-server.install dev/tasks/linux-packages/debian.ubuntu-trusty/rules dev/tasks/linux-packages/debian.ubuntu-trusty/source/format dev/tasks/linux-packages/debian.ubuntu-trusty/watch dev/tasks/linux-packages/debian/compat dev/tasks/linux-packages/debian/control dev/tasks/linux-packages/debian/gir1.2-arrow-1.0.install -dev/tasks/linux-packages/debian/gir1.2-arrow-gpu-1.0.install +dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install dev/tasks/linux-packages/debian/gir1.2-parquet-1.0.install +dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install dev/tasks/linux-packages/debian/libarrow-dev.install dev/tasks/linux-packages/debian/libarrow-glib-dev.install dev/tasks/linux-packages/debian/libarrow-glib-doc.doc-base dev/tasks/linux-packages/debian/libarrow-glib-doc.install dev/tasks/linux-packages/debian/libarrow-glib-doc.links dev/tasks/linux-packages/debian/libarrow-glib12.install -dev/tasks/linux-packages/debian/libarrow-gpu-dev.install -dev/tasks/linux-packages/debian/libarrow-gpu-glib-dev.install -dev/tasks/linux-packages/debian/libarrow-gpu-glib12.install -dev/tasks/linux-packages/debian/libarrow-gpu12.install +dev/tasks/linux-packages/debian/libarrow-cuda-dev.install +dev/tasks/linux-packages/debian/libarrow-cuda-glib-dev.install +dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install +dev/tasks/linux-packages/debian/libarrow-cuda12.install dev/tasks/linux-packages/debian/libarrow-python-dev.install dev/tasks/linux-packages/debian/libarrow-python12.install dev/tasks/linux-packages/debian/libarrow12.install @@ -92,10 +102,19 @@ dev/tasks/linux-packages/debian/libparquet-glib-doc.install dev/tasks/linux-packages/debian/libparquet-glib-doc.links dev/tasks/linux-packages/debian/libparquet-glib12.install dev/tasks/linux-packages/debian/libparquet12.install +dev/tasks/linux-packages/debian/libplasma-dev.install +dev/tasks/linux-packages/debian/libplasma-glib-dev.install +dev/tasks/linux-packages/debian/libplasma-glib-doc.doc-base +dev/tasks/linux-packages/debian/libplasma-glib-doc.install +dev/tasks/linux-packages/debian/libplasma-glib-doc.links +dev/tasks/linux-packages/debian/libplasma-glib12.install +dev/tasks/linux-packages/debian/libplasma12.install dev/tasks/linux-packages/debian/patches/series +dev/tasks/linux-packages/debian/plasma-store-server.install dev/tasks/linux-packages/debian/rules dev/tasks/linux-packages/debian/source/format dev/tasks/linux-packages/debian/watch +docs/requirements.txt go/arrow/go.sum go/arrow/Gopkg.lock go/arrow/internal/cpu/* @@ -106,7 +125,6 @@ js/.npmignore js/closure-compiler-scripts/* python/cmake_modules python/cmake_modules/* -python/doc/requirements.txt python/MANIFEST.in python/pyarrow/includes/__init__.pxd python/pyarrow/tests/__init__.py @@ -130,9 +148,15 @@ c_glib/config/ltmain.sh c_glib/doc/arrow-glib/arrow-glib.types c_glib/doc/arrow-glib/arrow-glib-sections.txt c_glib/doc/arrow-glib/arrow-glib-overrides.txt +c_glib/doc/gandiva-glib/gandiva-glib.types +c_glib/doc/gandiva-glib/gandiva-glib-sections.txt +c_glib/doc/gandiva-glib/gandiva-glib-overrides.txt c_glib/doc/parquet-glib/parquet-glib.types c_glib/doc/parquet-glib/parquet-glib-sections.txt c_glib/doc/parquet-glib/parquet-glib-overrides.txt 
+c_glib/doc/plasma-glib/plasma-glib.types
+c_glib/doc/plasma-glib/plasma-glib-sections.txt
+c_glib/doc/plasma-glib/plasma-glib-overrides.txt
 c_glib/gtk-doc.make
 csharp/.gitattributes
 csharp/src/Apache.Arrow/Flatbuf/*
@@ -160,4 +184,4 @@ r/README.md
 r/README.Rmd
 r/man/*.Rd
 .gitattributes
-rust/test/data/*.csv
\ No newline at end of file
+rust/test/data/*.csv
diff --git a/dev/release/run-rat.sh b/dev/release/run-rat.sh
index 53a322a969718..587e93af4622d 100755
--- a/dev/release/run-rat.sh
+++ b/dev/release/run-rat.sh
@@ -18,10 +18,14 @@
 # under the License.
 #
+RAT_VERSION=0.12
+
 # download apache rat
-curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/0.12/apache-rat-0.12.jar > apache-rat-0.12.jar
+if [ ! -f apache-rat-${RAT_VERSION}.jar ]; then
+  curl -s https://repo1.maven.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar > apache-rat-${RAT_VERSION}.jar
+fi
 
-RAT="java -jar apache-rat-0.12.jar -x "
+RAT="java -jar apache-rat-${RAT_VERSION}.jar -x "
 
 RELEASE_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
diff --git a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile
index 7840e02e54b5b..519d058d4b2e3 100644
--- a/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile
+++ b/dev/tasks/linux-packages/apt/ubuntu-cosmic/Dockerfile
@@ -26,10 +26,12 @@ RUN \
   apt update ${quiet} && \
   apt install -y -V ${quiet} \
     autoconf-archive \
+    bison \
     build-essential \
     cmake \
     debhelper\
     devscripts \
+    flex \
     git \
     gtk-doc-tools \
     libboost-filesystem-dev \
diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/control b/dev/tasks/linux-packages/debian.ubuntu-trusty/control
index 696f2c4b696bb..eb1f74b8d4553 100644
--- a/dev/tasks/linux-packages/debian.ubuntu-trusty/control
+++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/control
@@ -30,7 +30,7 @@ Depends:
   ${shlibs:Depends}
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides library files.
+ This package provides C++ library files.
 
 Package: libarrow-dev
 Section: libdevel
@@ -41,7 +41,44 @@ Depends:
   libarrow12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files.
+ This package provides C++ header files.
+
+Package: libplasma12
+Section: libs
+Architecture: any
+Multi-Arch: same
+Pre-Depends: ${misc:Pre-Depends}
+Depends:
+ ${misc:Depends},
+ ${shlibs:Depends},
+ libarrow12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides C++ library files to connect to plasma_store_server.
+
+Package: plasma-store-server
+Section: utils
+Architecture: any
+Pre-Depends: ${misc:Pre-Depends}
+Depends:
+ ${misc:Depends},
+ ${shlibs:Depends},
+ libplasma12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides plasma_store_server.
+
+Package: libplasma-dev
+Section: libdevel
+Architecture: any
+Multi-Arch: same
+Depends:
+ ${misc:Depends},
+ libarrow-dev (= ${binary:Version}),
+ libplasma12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides C++ header files.
 
 Package: libparquet12
 Section: libs
@@ -78,7 +115,7 @@ Depends:
   libarrow12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides library files.
+ This package provides GLib based library files.
 Package: gir1.2-arrow-1.0
 Section: introspection
@@ -104,7 +141,7 @@ Depends:
 Suggests: libarrow-glib-doc
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files.
+ This package provides GLib based header files.
 
 Package: libarrow-glib-doc
 Section: doc
@@ -117,6 +154,56 @@ Description: Apache Arrow is a data processing library for analysis
  .
  This package provides documentations.
 
+Package: libplasma-glib12
+Section: libs
+Architecture: any
+Multi-Arch: same
+Pre-Depends: ${misc:Pre-Depends}
+Depends:
+ ${misc:Depends},
+ ${shlibs:Depends},
+ libarrow-glib12 (= ${binary:Version}),
+ libplasma12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides GLib based library files to connect to plasma_store_server.
+
+Package: gir1.2-plasma-1.0
+Section: introspection
+Architecture: any
+Multi-Arch: same
+Depends:
+ ${gir:Depends},
+ ${misc:Depends}
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides GObject Introspection typelib files.
+
+Package: libplasma-glib-dev
+Section: libdevel
+Architecture: any
+Multi-Arch: same
+Depends:
+ ${misc:Depends},
+ libplasma-dev (= ${binary:Version}),
+ libarrow-glib-dev (= ${binary:Version}),
+ libplasma-glib12 (= ${binary:Version}),
+ gir1.2-plasma-1.0 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides GLib based header files.
+
+Package: libplasma-glib-doc
+Section: doc
+Architecture: all
+Multi-Arch: foreign
+Depends:
+ ${misc:Depends}
+Recommends: libglib2.0-doc
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides documentation.
+
 Package: libparquet-glib12
 Section: libs
 Architecture: any
diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/gir1.2-plasma-1.0.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/gir1.2-plasma-1.0.install
new file mode 100644
index 0000000000000..4366f4f1f5a25
--- /dev/null
+++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/gir1.2-plasma-1.0.install
@@ -0,0 +1 @@
+usr/lib/girepository-1.0/Plasma-1.0.typelib
diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-dev.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-dev.install
new file mode 100644
index 0000000000000..d3538d2210af3
--- /dev/null
+++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-dev.install
@@ -0,0 +1,3 @@
+usr/lib/*/libplasma.a
+usr/lib/*/libplasma.so
+usr/lib/*/pkgconfig/plasma.pc
diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-dev.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-dev.install
new file mode 100644
index 0000000000000..f21a9aa8a8f9c
--- /dev/null
+++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-dev.install
@@ -0,0 +1,5 @@
+usr/include/plasma-glib/
+usr/lib/*/libplasma-glib.a
+usr/lib/*/libplasma-glib.so
+usr/lib/*/pkgconfig/plasma-glib.pc
+usr/share/gir-1.0/Plasma-1.0.gir
diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.doc-base b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.doc-base
new file mode 100644
index 0000000000000..7863d7d07a36c
--- /dev/null
+++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.doc-base
@@ -0,0 +1,9 @@
+Document: plasma-glib
+Title: Plasma GLib Reference Manual
+Author: The Apache Software Foundation
+Abstract: Plasma GLib is an in-memory object store and
cache for big data that uses GLib. +Section: Programming + +Format: HTML +Index: /usr/share/doc/libarrow-glib-doc/plasma-glib/index.html +Files: /usr/share/doc/libarrow-glib-doc/plasma-glib/*.html diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.install new file mode 100644 index 0000000000000..ef5a63b340c4e --- /dev/null +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.install @@ -0,0 +1 @@ +usr/share/doc/libarrow-glib-doc/plasma-glib/ diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.links b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.links new file mode 100644 index 0000000000000..baea0ef4f4b78 --- /dev/null +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib-doc.links @@ -0,0 +1,3 @@ +usr/share/doc/libglib2.0-doc/glib usr/share/doc/libplasma-glib-doc/glib +usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libplasma-glib-doc/gobject +usr/share/doc/libarrow-glib-doc/plasma-glib usr/share/gtk-doc/html/plasma-glib diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install new file mode 100644 index 0000000000000..339bcca3e7278 --- /dev/null +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma-glib12.install @@ -0,0 +1 @@ +usr/lib/*/libplasma-glib.so.* diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install new file mode 100644 index 0000000000000..f8a744b65975d --- /dev/null +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/libplasma12.install @@ -0,0 +1 @@ +usr/lib/*/libplasma.so.* diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/plasma-store-server.install b/dev/tasks/linux-packages/debian.ubuntu-trusty/plasma-store-server.install new file mode 100644 index 0000000000000..9c38179c17dc1 --- /dev/null +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/plasma-store-server.install @@ -0,0 +1 @@ +usr/bin/plasma_store_server diff --git a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules index 01956fec40a9d..6f2ffdc416906 100755 --- a/dev/tasks/linux-packages/debian.ubuntu-trusty/rules +++ b/dev/tasks/linux-packages/debian.ubuntu-trusty/rules @@ -24,7 +24,8 @@ override_dh_auto_configure: -DCMAKE_BUILD_TYPE=$(BUILD_TYPE) \ -DARROW_BUILD_TESTS=OFF \ -DARROW_ORC=ON \ - -DARROW_PARQUET=ON + -DARROW_PARQUET=ON \ + -DARROW_PLASMA=ON dh_auto_configure \ --sourcedirectory=c_glib \ --builddirectory=c_glib_build \ diff --git a/dev/tasks/linux-packages/debian/control b/dev/tasks/linux-packages/debian/control index d497a31d1443a..b5c696363798f 100644 --- a/dev/tasks/linux-packages/debian/control +++ b/dev/tasks/linux-packages/debian/control @@ -37,7 +37,7 @@ Depends: ${shlibs:Depends} Description: Apache Arrow is a data processing library for analysis . - This package provides library files. + This package provides C++ library files. Package: libarrow-python12 Section: libs @@ -52,9 +52,9 @@ Depends: python3-numpy Description: Apache Arrow is a data processing library for analysis . - This package provides library files for Python support. + This package provides C++ library files for Python support. 
-Package: libarrow-gpu12
+Package: libarrow-cuda12
 Section: libs
 Architecture: any
 Multi-Arch: same
@@ -65,7 +65,7 @@ Depends:
   libarrow12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides library files for GPU support.
+ This package provides C++ library files for CUDA support.
 
 Package: libarrow-dev
 Section: libdevel
@@ -76,7 +76,7 @@ Depends:
   libarrow12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files.
+ This package provides C++ header files.
 
 Package: libarrow-python-dev
 Section: libdevel
@@ -88,19 +88,56 @@ Depends:
   libarrow-python12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files for Python support.
+ This package provides C++ header files for Python support.
 
-Package: libarrow-gpu-dev
+Package: libarrow-cuda-dev
 Section: libdevel
 Architecture: any
 Multi-Arch: same
 Depends:
   ${misc:Depends},
   libarrow-dev (= ${binary:Version}),
-  libarrow-gpu12 (= ${binary:Version})
+  libarrow-cuda12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files for GPU support.
+ This package provides C++ header files for CUDA support.
+
+Package: libplasma12
+Section: libs
+Architecture: any
+Multi-Arch: same
+Pre-Depends: ${misc:Pre-Depends}
+Depends:
+ ${misc:Depends},
+ ${shlibs:Depends},
+ libarrow-cuda12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides C++ library files to connect to plasma_store_server.
+
+Package: plasma-store-server
+Section: utils
+Architecture: any
+Pre-Depends: ${misc:Pre-Depends}
+Depends:
+ ${misc:Depends},
+ ${shlibs:Depends},
+ libplasma12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides plasma_store_server.
+
+Package: libplasma-dev
+Section: libdevel
+Architecture: any
+Multi-Arch: same
+Depends:
+ ${misc:Depends},
+ libarrow-cuda-dev (= ${binary:Version}),
+ libplasma12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides C++ header files.
 
 Package: libparquet12
 Section: libs
@@ -137,7 +174,7 @@ Depends:
   libarrow12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides library files.
+ This package provides GLib based library files.
 
 Package: gir1.2-arrow-1.0
 Section: introspection
@@ -163,7 +200,7 @@ Depends:
 Suggests: libarrow-glib-doc
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files.
+ This package provides GLib based header files.
 
 Package: libarrow-glib-doc
 Section: doc
@@ -176,7 +213,7 @@ Description: Apache Arrow is a data processing library for analysis
  .
  This package provides documentations.
 
-Package: libarrow-gpu-glib12
+Package: libarrow-cuda-glib12
 Section: libs
 Architecture: any
 Multi-Arch: same
@@ -185,12 +222,12 @@ Depends:
   ${misc:Depends},
   ${shlibs:Depends},
   libarrow-glib12 (= ${binary:Version}),
-  libarrow-gpu12 (= ${binary:Version})
+  libarrow-cuda12 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides library files for GPU support.
+ This package provides GLib based library files for CUDA support.
-Package: gir1.2-arrow-gpu-1.0
+Package: gir1.2-arrow-cuda-1.0
 Section: introspection
 Architecture: any
 Multi-Arch: same
@@ -199,22 +236,71 @@ Depends:
   ${misc:Depends}
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides GObject Introspection typelib files for GPU support.
+ This package provides GObject Introspection typelib files for CUDA support.
 
-Package: libarrow-gpu-glib-dev
+Package: libarrow-cuda-glib-dev
 Section: libdevel
 Architecture: any
 Multi-Arch: same
 Depends:
   ${misc:Depends},
-  libarrow-dev (= ${binary:Version}),
+  libarrow-cuda-dev (= ${binary:Version}),
   libarrow-glib-dev (= ${binary:Version}),
-  libarrow-gpu-dev (= ${binary:Version}),
-  libarrow-gpu-glib12 (= ${binary:Version}),
-  gir1.2-arrow-gpu-1.0 (= ${binary:Version})
+  libarrow-cuda-glib12 (= ${binary:Version}),
+  gir1.2-arrow-cuda-1.0 (= ${binary:Version})
 Description: Apache Arrow is a data processing library for analysis
  .
- This package provides header files for GPU support.
+ This package provides GLib based header files for CUDA support.
+
+Package: libplasma-glib12
+Section: libs
+Architecture: any
+Multi-Arch: same
+Pre-Depends: ${misc:Pre-Depends}
+Depends:
+ ${misc:Depends},
+ ${shlibs:Depends},
+ libarrow-cuda-glib12 (= ${binary:Version}),
+ libplasma12 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides GLib based library files to connect to plasma_store_server.
+
+Package: gir1.2-plasma-1.0
+Section: introspection
+Architecture: any
+Multi-Arch: same
+Depends:
+ ${gir:Depends},
+ ${misc:Depends}
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides GObject Introspection typelib files.
+
+Package: libplasma-glib-dev
+Section: libdevel
+Architecture: any
+Multi-Arch: same
+Depends:
+ ${misc:Depends},
+ libplasma-dev (= ${binary:Version}),
+ libarrow-cuda-glib-dev (= ${binary:Version}),
+ libplasma-glib12 (= ${binary:Version}),
+ gir1.2-plasma-1.0 (= ${binary:Version})
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides GLib based header files.
+
+Package: libplasma-glib-doc
+Section: doc
+Architecture: all
+Multi-Arch: foreign
+Depends:
+ ${misc:Depends}
+Recommends: libglib2.0-doc
+Description: Plasma is an in-memory object store and cache for big data.
+ .
+ This package provides documentation.
Package: libparquet-glib12 Section: libs diff --git a/dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install b/dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install new file mode 100644 index 0000000000000..ef0d9f56f9dbc --- /dev/null +++ b/dev/tasks/linux-packages/debian/gir1.2-arrow-cuda-1.0.install @@ -0,0 +1 @@ +usr/lib/*/girepository-1.0/ArrowCUDA-1.0.typelib diff --git a/dev/tasks/linux-packages/debian/gir1.2-arrow-gpu-1.0.install b/dev/tasks/linux-packages/debian/gir1.2-arrow-gpu-1.0.install deleted file mode 100644 index 10e0ca983be1a..0000000000000 --- a/dev/tasks/linux-packages/debian/gir1.2-arrow-gpu-1.0.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/girepository-1.0/ArrowGPU-1.0.typelib diff --git a/dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install b/dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install new file mode 100644 index 0000000000000..7b7ce21581dfd --- /dev/null +++ b/dev/tasks/linux-packages/debian/gir1.2-plasma-1.0.install @@ -0,0 +1 @@ +usr/lib/*/girepository-1.0/Plasma-1.0.typelib diff --git a/dev/tasks/linux-packages/debian/libarrow-cuda-dev.install b/dev/tasks/linux-packages/debian/libarrow-cuda-dev.install new file mode 100644 index 0000000000000..2270d9258668d --- /dev/null +++ b/dev/tasks/linux-packages/debian/libarrow-cuda-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/libarrow_cuda.a +usr/lib/*/libarrow_cuda.so +usr/lib/*/pkgconfig/arrow-cuda.pc diff --git a/dev/tasks/linux-packages/debian/libarrow-cuda-glib-dev.install b/dev/tasks/linux-packages/debian/libarrow-cuda-glib-dev.install new file mode 100644 index 0000000000000..7025fd202850e --- /dev/null +++ b/dev/tasks/linux-packages/debian/libarrow-cuda-glib-dev.install @@ -0,0 +1,5 @@ +usr/include/arrow-cuda-glib/ +usr/lib/*/libarrow-cuda-glib.a +usr/lib/*/libarrow-cuda-glib.so +usr/lib/*/pkgconfig/arrow-cuda-glib.pc +usr/share/gir-1.0/ArrowCUDA-1.0.gir diff --git a/dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install b/dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install new file mode 100644 index 0000000000000..a6d6375268d34 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libarrow-cuda-glib12.install @@ -0,0 +1 @@ +usr/lib/*/libarrow-cuda-glib.so.* diff --git a/dev/tasks/linux-packages/debian/libarrow-cuda12.install b/dev/tasks/linux-packages/debian/libarrow-cuda12.install new file mode 100644 index 0000000000000..5ae46468764f2 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libarrow-cuda12.install @@ -0,0 +1 @@ +usr/lib/*/libarrow_cuda.so.* diff --git a/dev/tasks/linux-packages/debian/libarrow-gpu-dev.install b/dev/tasks/linux-packages/debian/libarrow-gpu-dev.install deleted file mode 100644 index 1892fb851535c..0000000000000 --- a/dev/tasks/linux-packages/debian/libarrow-gpu-dev.install +++ /dev/null @@ -1,3 +0,0 @@ -usr/lib/*/libarrow_gpu.a -usr/lib/*/libarrow_gpu.so -usr/lib/*/pkgconfig/arrow-gpu.pc diff --git a/dev/tasks/linux-packages/debian/libarrow-gpu-glib-dev.install b/dev/tasks/linux-packages/debian/libarrow-gpu-glib-dev.install deleted file mode 100644 index 9b3ef8fb25b35..0000000000000 --- a/dev/tasks/linux-packages/debian/libarrow-gpu-glib-dev.install +++ /dev/null @@ -1,5 +0,0 @@ -usr/include/arrow-gpu-glib/ -usr/lib/*/libarrow-gpu-glib.a -usr/lib/*/libarrow-gpu-glib.so -usr/lib/*/pkgconfig/arrow-gpu-glib.pc -usr/share/gir-1.0/ArrowGPU-1.0.gir diff --git a/dev/tasks/linux-packages/debian/libarrow-gpu-glib12.install b/dev/tasks/linux-packages/debian/libarrow-gpu-glib12.install deleted file mode 100644 index 4d97e5a60eb09..0000000000000 --- 
a/dev/tasks/linux-packages/debian/libarrow-gpu-glib12.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow-gpu-glib.so.* diff --git a/dev/tasks/linux-packages/debian/libarrow-gpu12.install b/dev/tasks/linux-packages/debian/libarrow-gpu12.install deleted file mode 100644 index cabd7e47d1e9a..0000000000000 --- a/dev/tasks/linux-packages/debian/libarrow-gpu12.install +++ /dev/null @@ -1 +0,0 @@ -usr/lib/*/libarrow_gpu.so.* diff --git a/dev/tasks/linux-packages/debian/libplasma-dev.install b/dev/tasks/linux-packages/debian/libplasma-dev.install new file mode 100644 index 0000000000000..d3538d2210af3 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/libplasma.a +usr/lib/*/libplasma.so +usr/lib/*/pkgconfig/plasma.pc diff --git a/dev/tasks/linux-packages/debian/libplasma-glib-dev.install b/dev/tasks/linux-packages/debian/libplasma-glib-dev.install new file mode 100644 index 0000000000000..f21a9aa8a8f9c --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma-glib-dev.install @@ -0,0 +1,5 @@ +usr/include/plasma-glib/ +usr/lib/*/libplasma-glib.a +usr/lib/*/libplasma-glib.so +usr/lib/*/pkgconfig/plasma-glib.pc +usr/share/gir-1.0/Plasma-1.0.gir diff --git a/dev/tasks/linux-packages/debian/libplasma-glib-doc.doc-base b/dev/tasks/linux-packages/debian/libplasma-glib-doc.doc-base new file mode 100644 index 0000000000000..7863d7d07a36c --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma-glib-doc.doc-base @@ -0,0 +1,9 @@ +Document: plasma-glib +Title: Plasma GLib Reference Manual +Author: The Apache Software Foundation +Abstract: Plasma GLib is an in-memory object store and cache for big data that uses GLib. +Section: Programming + +Format: HTML +Index: /usr/share/doc/libarrow-glib-doc/plasma-glib/index.html +Files: /usr/share/doc/libarrow-glib-doc/plasma-glib/*.html diff --git a/dev/tasks/linux-packages/debian/libplasma-glib-doc.install b/dev/tasks/linux-packages/debian/libplasma-glib-doc.install new file mode 100644 index 0000000000000..ef5a63b340c4e --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma-glib-doc.install @@ -0,0 +1 @@ +usr/share/doc/libarrow-glib-doc/plasma-glib/ diff --git a/dev/tasks/linux-packages/debian/libplasma-glib-doc.links b/dev/tasks/linux-packages/debian/libplasma-glib-doc.links new file mode 100644 index 0000000000000..baea0ef4f4b78 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma-glib-doc.links @@ -0,0 +1,3 @@ +usr/share/doc/libglib2.0-doc/glib usr/share/doc/libplasma-glib-doc/glib +usr/share/doc/libglib2.0-doc/gobject usr/share/doc/libplasma-glib-doc/gobject +usr/share/doc/libarrow-glib-doc/plasma-glib usr/share/gtk-doc/html/plasma-glib diff --git a/dev/tasks/linux-packages/debian/libplasma-glib12.install b/dev/tasks/linux-packages/debian/libplasma-glib12.install new file mode 100644 index 0000000000000..339bcca3e7278 --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma-glib12.install @@ -0,0 +1 @@ +usr/lib/*/libplasma-glib.so.* diff --git a/dev/tasks/linux-packages/debian/libplasma12.install b/dev/tasks/linux-packages/debian/libplasma12.install new file mode 100644 index 0000000000000..f8a744b65975d --- /dev/null +++ b/dev/tasks/linux-packages/debian/libplasma12.install @@ -0,0 +1 @@ +usr/lib/*/libplasma.so.* diff --git a/dev/tasks/linux-packages/debian/plasma-store-server.install b/dev/tasks/linux-packages/debian/plasma-store-server.install new file mode 100644 index 0000000000000..9c38179c17dc1 --- /dev/null +++ b/dev/tasks/linux-packages/debian/plasma-store-server.install 
@@ -0,0 +1 @@ +usr/bin/plasma_store_server diff --git a/dev/tasks/linux-packages/debian/rules b/dev/tasks/linux-packages/debian/rules index ce39fde6ebd23..f3cc2a045c1ee 100755 --- a/dev/tasks/linux-packages/debian/rules +++ b/dev/tasks/linux-packages/debian/rules @@ -29,11 +29,12 @@ override_dh_auto_configure: -DARROW_BOOST_USE_SHARED=ON \ -DARROW_ORC=ON \ -DARROW_PARQUET=ON \ + -DARROW_PLASMA=ON \ -DPROTOBUF_HOME=/usr \ -DARROW_PROTOBUF_USE_SHARED=ON \ -DPythonInterp_FIND_VERSION=ON \ -DPythonInterp_FIND_VERSION_MAJOR=3 \ - -DARROW_GPU=ON + -DARROW_CUDA=ON dh_auto_configure \ --sourcedirectory=c_glib \ --builddirectory=c_glib_build \ diff --git a/dev/tasks/linux-packages/yum/arrow.spec.in b/dev/tasks/linux-packages/yum/arrow.spec.in index 9db9d43e1e66e..ad60dfbdde18e 100644 --- a/dev/tasks/linux-packages/yum/arrow.spec.in +++ b/dev/tasks/linux-packages/yum/arrow.spec.in @@ -75,6 +75,7 @@ cd cpp/build %if %{use_parquet} -DARROW_PARQUET=ON \ %endif + -DARROW_PLASMA=ON \ -DARROW_BUILD_TESTS=OFF make %{?_smp_mflags} cd - @@ -88,8 +89,11 @@ cd c_glib --enable-gtk-doc sed -i 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' libtool sed -i 's|^runpath_var=LD_RUN_PATH|runpath_var=DIE_RPATH_DIE|g' libtool -LD_LIBRARY_PATH=$PWD/arrow-glib/.libs/:$PWD/parquet-glib/.libs/:$PWD/../cpp/build/$build_type \ - make %{?_smp_mflags} +ld_library_path=$PWD/arrow-glib/.libs/ +ld_library_path=$ld_library_path:$PWD/plasma-glib/.libs/ +ld_library_path=$ld_library_path:$PWD/parquet-glib/.libs/ +ld_library_path=$ld_library_path:$PWD/../cpp/build/$build_type +LD_LIBRARY_PATH=$ld_library_path make %{?_smp_mflags} cd - %endif @@ -172,6 +176,48 @@ Libraries and header files for Apache Arrow CPython extensions. %{_libdir}/pkgconfig/arrow-python.pc %endif +%package -n plasma-libs +Summary: Runtime libraries for Plasma in-memory object store +License: Apache-2.0 +Requires: %{name}-libs = %{version}-%{release} + +%description -n plasma-libs +This package contains the libraries for Plasma in-memory object store. + +%files -n plasma-libs +%defattr(-,root,root,-) +%doc README.md LICENSE.txt NOTICE.txt +%{_libdir}/libplasma.so.* + +%package -n plasma-store-server +Summary: Server for Plasma in-memory object store +License: Apache-2.0 +Requires: plasma-libs = %{version}-%{release} + +%description -n plasma-store-server +This package contains the server for Plasma in-memory object store. + +%files -n plasma-store-server +%defattr(-,root,root,-) +%doc README.md LICENSE.txt NOTICE.txt +%{_bindir}/plasma_store_server + +%package -n plasma-devel +Summary: Libraries and header files for Plasma in-memory object store +License: Apache-2.0 +Requires: plasma-libs = %{version}-%{release} + +%description -n plasma-devel +Libraries and header files for Plasma in-memory object store. + +%files -n plasma-devel +%defattr(-,root,root,-) +%doc README.md LICENSE.txt NOTICE.txt +%{_includedir}/plasma/ +%{_libdir}/libplasma.a +%{_libdir}/libplasma.so +%{_libdir}/pkgconfig/plasma*.pc + %if %{use_parquet} %package -n parquet-libs Summary: Runtime libraries for Apache Parquet C++ @@ -183,7 +229,7 @@ Requires: boost-regex Requires: %{name}-libs = %{version}-%{release} %description -n parquet-libs -This package contains the libraries for Apache Parquet +This package contains the libraries for Apache Parquet C++. %files -n parquet-libs %defattr(-,root,root,-) @@ -197,7 +243,7 @@ Requires: parquet-libs = %{version}-%{release} Requires: zlib-devel %description -n parquet-devel -Libraries and header files for Apache Parquet. 
+Libraries and header files for Apache Parquet C++. %files -n parquet-devel %defattr(-,root,root,-) @@ -257,6 +303,51 @@ Documentation for Apache Arrow GLib. %doc README.md LICENSE.txt NOTICE.txt %{_docdir}/arrow-glib/ %{_datadir}/gtk-doc/html/arrow-glib/ + +%package -n plasma-glib-libs +Summary: Runtime libraries for Plasma GLib +License: Apache-2.0 +Requires: plasma-libs = %{version}-%{release} +Requires: %{name}-glib-libs = %{version}-%{release} + +%description -n plasma-glib-libs +This package contains the libraries for Plasma GLib. + +%files -n plasma-glib-libs +%defattr(-,root,root,-) +%doc README.md LICENSE.txt NOTICE.txt +%{_libdir}/libplasma-glib.so.* +%{_datadir}/gir-1.0/Plasma-1.0.gir + +%package -n plasma-glib-devel +Summary: Libraries and header files for Plasma GLib +License: Apache-2.0 +Requires: plasma-devel = %{version}-%{release} +Requires: %{name}-glib-devel = %{version}-%{release} + +%description -n plasma-glib-devel +Libraries and header files for Plasma GLib. + +%files -n plasma-glib-devel +%defattr(-,root,root,-) +%doc README.md LICENSE.txt NOTICE.txt +%{_includedir}/plasma-glib/ +%{_libdir}/libplasma-glib.a +%{_libdir}/libplasma-glib.so +%{_libdir}/pkgconfig/plasma-glib.pc +%{_libdir}/girepository-1.0/Plasma-1.0.typelib + +%package -n plasma-glib-doc +Summary: Documentation for Plasma GLib +License: Apache-2.0 + +%description -n plasma-glib-doc +Documentation for Plasma GLib. + +%files -n plasma-glib-doc +%defattr(-,root,root,-) +%doc README.md LICENSE.txt NOTICE.txt +%{_datadir}/gtk-doc/html/plasma-glib/ %endif %if %{use_parquet} && %{use_glib} diff --git a/dev/tasks/tasks.yml b/dev/tasks/tasks.yml index 514942df93e1a..bd49616f6bd3e 100644 --- a/dev/tasks/tasks.yml +++ b/dev/tasks/tasks.yml @@ -293,19 +293,20 @@ tasks: - apache-arrow_{no_rc_version}-1.dsc - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - - gir1.2-arrow-gpu-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - libarrow-glib12-dbgsym_{no_rc_version}-1_amd64.deb - libarrow-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu12-dbgsym_{no_rc_version}-1_amd64.deb - - libarrow-gpu12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib12-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb + - libarrow-cuda12-dbgsym_{no_rc_version}-1_amd64.deb + - libarrow-cuda12_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12-dbgsym_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb @@ -318,6 +319,13 @@ tasks: - libparquet-glib12_{no_rc_version}-1_amd64.deb - libparquet12-dbgsym_{no_rc_version}-1_amd64.deb - libparquet12_{no_rc_version}-1_amd64.deb + - libplasma-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-doc_{no_rc_version}-1_all.deb + - libplasma-glib12-dbgsym_{no_rc_version}-1_amd64.deb + - libplasma-glib12_{no_rc_version}-1_amd64.deb + 
- libplasma12-dbgsym_{no_rc_version}-1_amd64.deb + - libplasma12_{no_rc_version}-1_amd64.deb ubuntu-trusty: platform: linux @@ -335,6 +343,7 @@ tasks: - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb @@ -345,6 +354,11 @@ tasks: - libparquet-glib-doc_{no_rc_version}-1_all.deb - libparquet-glib12_{no_rc_version}-1_amd64.deb - libparquet12_{no_rc_version}-1_amd64.deb + - libplasma-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-doc_{no_rc_version}-1_all.deb + - libplasma-glib12_{no_rc_version}-1_amd64.deb + - libplasma12_{no_rc_version}-1_amd64.deb ubuntu-xenial: platform: linux @@ -361,16 +375,17 @@ tasks: - apache-arrow_{no_rc_version}-1.dsc - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - - gir1.2-arrow-gpu-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - libarrow-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb + - libarrow-cuda12_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb @@ -379,6 +394,11 @@ tasks: - libparquet-glib-doc_{no_rc_version}-1_all.deb - libparquet-glib12_{no_rc_version}-1_amd64.deb - libparquet12_{no_rc_version}-1_amd64.deb + - libplasma-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-doc_{no_rc_version}-1_all.deb + - libplasma-glib12_{no_rc_version}-1_amd64.deb + - libplasma12_{no_rc_version}-1_amd64.deb ubuntu-bionic: platform: linux @@ -395,16 +415,17 @@ tasks: - apache-arrow_{no_rc_version}-1.dsc - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - - gir1.2-arrow-gpu-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - libarrow-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb + - libarrow-cuda12_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb @@ -413,6 +434,11 
@@ tasks: - libparquet-glib-doc_{no_rc_version}-1_all.deb - libparquet-glib12_{no_rc_version}-1_amd64.deb - libparquet12_{no_rc_version}-1_amd64.deb + - libplasma-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-doc_{no_rc_version}-1_all.deb + - libplasma-glib12_{no_rc_version}-1_amd64.deb + - libplasma12_{no_rc_version}-1_amd64.deb ubuntu-cosmic: platform: linux @@ -429,16 +455,17 @@ tasks: - apache-arrow_{no_rc_version}-1.dsc - apache-arrow_{no_rc_version}.orig.tar.gz - gir1.2-arrow-1.0_{no_rc_version}-1_amd64.deb - - gir1.2-arrow-gpu-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-arrow-cuda-1.0_{no_rc_version}-1_amd64.deb - gir1.2-parquet-1.0_{no_rc_version}-1_amd64.deb + - gir1.2-plasma-1.0_{no_rc_version}-1_amd64.deb - libarrow-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-dev_{no_rc_version}-1_amd64.deb - libarrow-glib-doc_{no_rc_version}-1_all.deb - libarrow-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib-dev_{no_rc_version}-1_amd64.deb - - libarrow-gpu-glib12_{no_rc_version}-1_amd64.deb - - libarrow-gpu12_{no_rc_version}-1_amd64.deb + - libarrow-cuda-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib-dev_{no_rc_version}-1_amd64.deb + - libarrow-cuda-glib12_{no_rc_version}-1_amd64.deb + - libarrow-cuda12_{no_rc_version}-1_amd64.deb - libarrow-python-dev_{no_rc_version}-1_amd64.deb - libarrow-python12_{no_rc_version}-1_amd64.deb - libarrow12_{no_rc_version}-1_amd64.deb @@ -447,6 +474,11 @@ tasks: - libparquet-glib-doc_{no_rc_version}-1_all.deb - libparquet-glib12_{no_rc_version}-1_amd64.deb - libparquet12_{no_rc_version}-1_amd64.deb + - libplasma-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-dev_{no_rc_version}-1_amd64.deb + - libplasma-glib-doc_{no_rc_version}-1_all.deb + - libplasma-glib12_{no_rc_version}-1_amd64.deb + - libplasma12_{no_rc_version}-1_amd64.deb centos-6: platform: linux @@ -462,6 +494,8 @@ tasks: - arrow-libs-{no_rc_version}-1.el6.x86_64.rpm - arrow-python-devel-{no_rc_version}-1.el6.x86_64.rpm - arrow-python-libs-{no_rc_version}-1.el6.x86_64.rpm + - plasma-devel-{no_rc_version}-1.el6.x86_64.rpm + - plasma-libs-{no_rc_version}-1.el6.x86_64.rpm centos-7: platform: linux @@ -485,6 +519,11 @@ tasks: - parquet-glib-doc-{no_rc_version}-1.el7.x86_64.rpm - parquet-glib-libs-{no_rc_version}-1.el7.x86_64.rpm - parquet-libs-{no_rc_version}-1.el7.x86_64.rpm + - plasma-devel-{no_rc_version}-1.el7.x86_64.rpm + - plasma-glib-devel-{no_rc_version}-1.el7.x86_64.rpm + - plasma-glib-doc-{no_rc_version}-1.el7.x86_64.rpm + - plasma-glib-libs-{no_rc_version}-1.el7.x86_64.rpm + - plasma-libs-{no_rc_version}-1.el7.x86_64.rpm ############################## Gandiva Tasks ################################ diff --git a/dev/tasks/tests.yml b/dev/tasks/tests.yml index 2365455a8a7cb..c158481de461e 100644 --- a/dev/tasks/tests.yml +++ b/dev/tasks/tests.yml @@ -21,11 +21,14 @@ groups: docker: - docker-rust - docker-cpp + - docker-cpp-alpine - docker-c_glib - docker-go - docker-python-2.7 - docker-python-3.6 - docker-python-3.7 + - docker-python-2.7-alpine + - docker-python-3.6-alpine - docker-java - docker-js - docker-lint @@ -34,6 +37,19 @@ groups: - docker-hdfs-integration - docker-pandas-master + integration: + - docker-hdfs-integration + - docker-pandas-master + + cpp-python: + - docker-cpp + - docker-cpp-alpine + - docker-python-2.7 + - docker-python-2.7-alpine + - docker-python-3.6 + - docker-python-3.6-alpine + - docker-python-3.7 + tasks: # arbitrary_task_name: # platform: 
osx|linux|win @@ -62,6 +78,14 @@ tasks: - docker-compose build cpp - docker-compose run cpp + docker-cpp-alpine: + platform: linux + template: docker-tests/travis.linux.yml + params: + commands: + - docker-compose build cpp-alpine + - docker-compose run cpp-alpine + docker-c_glib: platform: linux template: docker-tests/travis.linux.yml @@ -128,6 +152,28 @@ tasks: - docker-compose build python - docker-compose run python + docker-python-2.7-alpine: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 2.7 + commands: + - docker-compose build cpp-alpine + - docker-compose build python-alpine + - docker-compose run python-alpine + + docker-python-3.6-alpine: + platform: linux + template: docker-tests/travis.linux.yml + params: + environment: + PYTHON_VERSION: 3.6 + commands: + - docker-compose build cpp-alpine + - docker-compose build python-alpine + - docker-compose run python-alpine + ############################## Linter tests ################################# docker-lint: diff --git a/docker-compose.yml b/docker-compose.yml index 50e4dded6146f..d6f11004233e5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -19,10 +19,15 @@ version: '3.5' -x-volumes: - &volumes +x-ubuntu-volumes: + &ubuntu-volumes - .:/arrow:delegated - - ${ARROW_DOCKER_CACHE_DIR:-./docker_cache}:/build:delegated + - ${ARROW_DOCKER_CACHE_DIR:-./docker_cache}/ubuntu:/build:delegated + +x-alpine-volumes: + &alpine-volumes + - .:/arrow:delegated + - ${ARROW_DOCKER_CACHE_DIR:-./docker_cache}/alpine:/build:delegated services: @@ -37,7 +42,7 @@ services: build: context: . dockerfile: c_glib/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes cpp: # Usage: @@ -50,7 +55,20 @@ services: dockerfile: cpp/Dockerfile environment: PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data - volumes: *volumes + volumes: *ubuntu-volumes + + cpp-alpine: + # Usage: + # docker-compose build cpp-alpine + # docker-compose run cpp-alpine + image: arrow:cpp-alpine + shm_size: 2G + build: + context: . + dockerfile: cpp/Dockerfile.alpine + environment: + PARQUET_TEST_DATA: /arrow/cpp/submodules/parquet-testing/data + volumes: *alpine-volumes go: # Usage: @@ -60,7 +78,7 @@ services: build: context: . dockerfile: go/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes java: # Usage: @@ -93,7 +111,22 @@ services: dockerfile: python/Dockerfile args: PYTHON_VERSION: ${PYTHON_VERSION:-3.6} - volumes: *volumes + volumes: *ubuntu-volumes + + python-alpine: + # Usage: + # export PYTHON_VERSION=2.7|3.6 (minor version is ignored) + # docker-compose build cpp-alpine + # docker-compose build python-alpine + # docker-compose run python-alpine + image: arrow:python-${PYTHON_VERSION:-3.6}-alpine + shm_size: 2G + build: + context: . + dockerfile: python/Dockerfile.alpine + args: + PYTHON_VERSION: ${PYTHON_VERSION:-3.6} + volumes: *alpine-volumes rust: # Usage: @@ -103,7 +136,7 @@ services: build: context: . dockerfile: rust/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes r: # Usage: @@ -114,12 +147,12 @@ services: build: context: . dockerfile: r/Dockerfile - volumes: *volumes + volumes: *ubuntu-volumes ######################### Tools and Linters ################################# # TODO(kszucs): site - # TODO(kszucs): apidoc + # TODO(kszucs): {cpp,java,glib,js}-apidoc lint: # Usage: @@ -130,7 +163,7 @@ services: context: . 
dockerfile: dev/lint/Dockerfile
     command: arrow/dev/lint/run_linters.sh
-    volumes: *volumes
+    volumes: *ubuntu-volumes
 
   iwyu:
     # Usage:
@@ -141,14 +174,28 @@ services:
       CC: clang
       CXX: clang++
     command: arrow/dev/lint/run_iwyu.sh
-    volumes: *volumes
+    volumes: *ubuntu-volumes
 
   clang-format:
     # Usage:
+    #   docker-compose build cpp
+    #   docker-compose build python
     #   docker-compose build lint
     #   docker-compose run clang-format
     image: arrow:lint
     command: arrow/dev/lint/run_clang_format.sh
+    volumes: *ubuntu-volumes
+
+  docs:
+    # Usage:
+    #   docker-compose build cpp
+    #   docker-compose build python
+    #   docker-compose build docs
+    #   docker-compose run docs
+    image: arrow:docs
+    build:
+      context: .
+      dockerfile: docs/Dockerfile
+    volumes: *ubuntu-volumes
 
 ######################### Integration Tests #################################
 
@@ -208,7 +255,7 @@ services:
       context: .
       dockerfile: integration/pandas/Dockerfile
     shm_size: 2G
-    volumes: *volumes
+    volumes: *ubuntu-volumes
 
   # TODO(kszucs): dask-integration
diff --git a/python/doc/.gitignore b/docs/.gitignore
similarity index 97%
rename from python/doc/.gitignore
rename to docs/.gitignore
index 3bee39fa36fe4..d2e9f6ccc8f79 100644
--- a/python/doc/.gitignore
+++ b/docs/.gitignore
@@ -16,4 +16,4 @@
 # under the License.
 
 _build
-source/generated
\ No newline at end of file
+source/python/generated
diff --git a/python/doc/Benchmarks.md b/docs/Benchmarks.md
similarity index 100%
rename from python/doc/Benchmarks.md
rename to docs/Benchmarks.md
diff --git a/docs/Dockerfile b/docs/Dockerfile
new file mode 100644
index 0000000000000..4908110b7fb56
--- /dev/null
+++ b/docs/Dockerfile
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+ +FROM arrow:python-3.6 + +ADD ci/conda_env_sphinx.yml /arrow/ci/ +RUN conda install -c conda-forge \ + --file arrow/ci/conda_env_sphinx.yml && \ + conda clean --all +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + arrow/ci/docker_build_sphinx.sh diff --git a/python/doc/Makefile b/docs/Makefile similarity index 100% rename from python/doc/Makefile rename to docs/Makefile diff --git a/python/doc/environment.yml b/docs/environment.yml similarity index 100% rename from python/doc/environment.yml rename to docs/environment.yml diff --git a/python/doc/requirements.txt b/docs/requirements.txt similarity index 86% rename from python/doc/requirements.txt rename to docs/requirements.txt index f3c3414a4be9a..7e33455de0e9b 100644 --- a/python/doc/requirements.txt +++ b/docs/requirements.txt @@ -1,3 +1,4 @@ +breathe ipython matplotlib numpydoc diff --git a/python/doc/source/_static/stub b/docs/source/_static/stub similarity index 100% rename from python/doc/source/_static/stub rename to docs/source/_static/stub diff --git a/python/doc/source/_templates/layout.html b/docs/source/_templates/layout.html similarity index 100% rename from python/doc/source/_templates/layout.html rename to docs/source/_templates/layout.html diff --git a/python/doc/source/conf.py b/docs/source/conf.py similarity index 93% rename from python/doc/source/conf.py rename to docs/source/conf.py index f8327902f218a..1cadef18b64f2 100644 --- a/python/doc/source/conf.py +++ b/docs/source/conf.py @@ -30,7 +30,7 @@ # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. # -import glob +import pyarrow import os import sys @@ -57,16 +57,17 @@ 'sphinx.ext.viewcode', 'sphinx.ext.napoleon', 'IPython.sphinxext.ipython_directive', - 'IPython.sphinxext.ipython_console_highlighting' + 'IPython.sphinxext.ipython_console_highlighting', + 'breathe' ] # Show members for classes in .. autosummary -autodoc_default_flags = [ - 'members', - 'undoc-members', - 'show-inheritance', - 'inherited-members' -] +autodoc_default_options = { + 'members': None, + 'undoc-members': None, + 'show-inheritance': None, + 'inherited-members': None +} # ipython directive options ipython_mplbackend = '' @@ -77,13 +78,16 @@ # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] +breathe_projects = {"arrow_cpp": "../../cpp/apidoc/xml"} +breathe_default_project = "arrow_cpp" + # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' -autosummary_generate = glob.glob("*.rst") +source_suffix = ['.rst'] + +autosummary_generate = True # The encoding of source files. # @@ -93,7 +97,7 @@ master_doc = 'index' # General information about the project. -project = u'pyarrow' +project = u'Apache Arrow' copyright = u'2016-2018 Apache Software Foundation' author = u'Apache Software Foundation' @@ -102,9 +106,9 @@ # built documents. # # The short X.Y version. -version = u'' +version = pyarrow.__version__ # The full version, including alpha/beta/rc tags. -release = u'' +release = pyarrow.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -180,7 +184,7 @@ # The name for this set of Sphinx documents. # " v documentation" by default. 
# -# html_title = u'pyarrow v0.1.0' +html_title = u'Apache Arrow v{}'.format(version) # A shorter title for the navigation bar. Default is the same as html_title. # @@ -280,7 +284,7 @@ # html_search_scorer = 'scorer.js' # Output file base name for HTML help builder. -htmlhelp_basename = 'pyarrowdoc' +htmlhelp_basename = 'arrowdoc' # -- Options for LaTeX output --------------------------------------------- @@ -306,7 +310,7 @@ # (source start file, target name, title, # author, documentclass [howto, manual, or own class]). latex_documents = [ - (master_doc, 'pyarrow.tex', u'pyarrow Documentation', + (master_doc, 'arrow.tex', u'Apache Arrow Documentation', u'Apache Arrow Team', 'manual'), ] @@ -348,7 +352,7 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - (master_doc, 'pyarrow', u'pyarrow Documentation', + (master_doc, 'arrow', u'Apache Arrow Documentation', [author], 1) ] @@ -363,8 +367,8 @@ # (source start file, target name, title, author, # dir menu entry, description, category) texinfo_documents = [ - (master_doc, 'pyarrow', u'pyarrow Documentation', - author, 'pyarrow', 'One line description of project.', + (master_doc, 'arrow', u'Apache Arrow Documentation', + author, 'Apache Arrow', 'One line description of project.', 'Miscellaneous'), ] diff --git a/docs/source/cpp/api.rst b/docs/source/cpp/api.rst new file mode 100644 index 0000000000000..894ed1f907f6d --- /dev/null +++ b/docs/source/cpp/api.rst @@ -0,0 +1,27 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +************* +API Reference +************* + +.. toctree:: + :maxdepth: 2 + :caption: Getting Started + + api/array + api/memory diff --git a/docs/source/cpp/api/array.rst b/docs/source/cpp/api/array.rst new file mode 100644 index 0000000000000..aed18763b6ce7 --- /dev/null +++ b/docs/source/cpp/api/array.rst @@ -0,0 +1,81 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Array types +============= + +.. 
doxygenclass:: arrow::Array + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::DictionaryArray + :project: arrow_cpp + :members: + +non-nested array types +---------------------- + +.. doxygenclass:: arrow::FlatArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::NullArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::BinaryArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::StringArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::PrimitiveArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::BooleanArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::FixedSizeBinaryArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::Decimal128Array + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::NumericArray + :project: arrow_cpp + :members: + +nested array types +------------------ + +.. doxygenclass:: arrow::UnionArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ListArray + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::StructArray + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/api/memory.rst b/docs/source/cpp/api/memory.rst new file mode 100644 index 0000000000000..fbb5dc818628c --- /dev/null +++ b/docs/source/cpp/api/memory.rst @@ -0,0 +1,57 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Memory (management) +=================== + +Basic containers +---------------- + +.. doxygenclass:: arrow::Buffer + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::MutableBuffer + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ResizableBuffer + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::BufferBuilder + :project: arrow_cpp + :members: + +Memory Pools +------------ + +.. doxygenfunction:: arrow::default_memory_pool + :project: arrow_cpp + :outline: + +.. doxygenclass:: arrow::MemoryPool + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::LoggingMemoryPool + :project: arrow_cpp + :members: + +.. doxygenclass:: arrow::ProxyMemoryPool + :project: arrow_cpp + :members: diff --git a/docs/source/cpp/index.rst b/docs/source/cpp/index.rst new file mode 100644 index 0000000000000..4f874bac4fd1e --- /dev/null +++ b/docs/source/cpp/index.rst @@ -0,0 +1,88 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. 
Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+C++ Implementation
+==================
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Getting Started
+
+   api
+
+Getting Started
+---------------
+
+The most basic structure in Arrow is an :cpp:class:`arrow::Array`. It holds a
+sequence of values with known length, all having the same type. It consists of
+the data itself and an additional bitmap that indicates whether the
+corresponding entry of the array is null. Note that for arrays with zero null
+entries, we can omit this bitmap.
+
+As Arrow objects are immutable, there are builder classes provided that help
+you construct these objects. To build an array of ``int64_t`` elements, we can
+use the :cpp:class:`arrow::Int64Builder`. In the following example, we build an
+array of the range 1 to 8 where the element that should hold the number 4 is
+nulled.
+
+.. code::
+
+   Int64Builder builder;
+   builder.Append(1);
+   builder.Append(2);
+   builder.Append(3);
+   builder.AppendNull();
+   builder.Append(5);
+   builder.Append(6);
+   builder.Append(7);
+   builder.Append(8);
+
+   std::shared_ptr<arrow::Array> array;
+   builder.Finish(&array);
+
+The resulting Array (which can be cast to :cpp:class:`arrow::Int64Array` if you
+want to access its values) then consists of two :cpp:class:`arrow::Buffer`
+objects. The first one is the validity (null) bitmap, holding a single byte
+with the bits ``1|1|1|1|0|1|1|1``; a set bit marks a valid entry. As we use
+`least-significant bit (LSB) numbering`_, the 0 in the fourth bit position
+indicates that the fourth entry in the array is null. The second buffer is
+simply an ``int64_t`` array containing all the above values. As the fourth
+entry is null, the value at that position in the buffer is undefined.
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
+
+.. code::
+
+   // Cast the Array to its actual type to access its data
+   std::shared_ptr<arrow::Int64Array> int64_array =
+       std::static_pointer_cast<arrow::Int64Array>(array);
+
+   // Get the pointer to the null bitmap.
+   const uint8_t* null_bitmap = int64_array->null_bitmap_data();
+
+   // Get the pointer to the actual data
+   const int64_t* data = int64_array->raw_values();
+
+So far we have skipped explaining two things in the code. On constructing the
+builder, the resulting array is annotated with the type information
+:cpp:func:`arrow::int64()`. In this simple form, that type information is
+solely a ``std::shared_ptr<arrow::DataType>`` instantiation.
+
+Furthermore, the builder allocates from :cpp:func:`arrow::default_memory_pool()`
+unless another :cpp:class:`arrow::MemoryPool` is passed to its constructor. This
+pool is used for the allocations of heap memory. Besides tracking the amount of
+memory allocated, the allocator also ensures that the allocated memory regions
+are 64-byte aligned (as required by the Arrow specification).
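+The validity bitmap can also be inspected entry by entry. The following sketch
+is an illustration added for clarity rather than part of the original example;
+it reuses ``int64_array``, ``null_bitmap`` and ``data`` from above and assumes
+``<iostream>`` is included. :cpp:func:`arrow::Array::IsNull` performs the same
+bit test for you.
+
+.. code::
+
+   for (int64_t i = 0; i < int64_array->length(); ++i) {
+     // With LSB numbering, entry i maps to bit (i % 8) of byte (i / 8).
+     bool is_valid = (null_bitmap[i / 8] >> (i % 8)) & 1;
+     if (is_valid) {
+       std::cout << data[i] << std::endl;
+     } else {
+       // The value in the data buffer at a null position is undefined.
+       std::cout << "null" << std::endl;
+     }
+   }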
+
+.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering
diff --git a/format/Arrow.graffle b/docs/source/format/Arrow.graffle
similarity index 100%
rename from format/Arrow.graffle
rename to docs/source/format/Arrow.graffle
diff --git a/format/Arrow.png b/docs/source/format/Arrow.png
similarity index 100%
rename from format/Arrow.png
rename to docs/source/format/Arrow.png
diff --git a/docs/source/format/Guidelines.rst b/docs/source/format/Guidelines.rst
new file mode 100644
index 0000000000000..5b032206c2611
--- /dev/null
+++ b/docs/source/format/Guidelines.rst
@@ -0,0 +1,43 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Implementation guidelines
+=========================
+
+An execution engine (or framework, or UDF executor, or storage engine, etc.)
+can implement only a subset of the Arrow spec and/or extend it given the
+following constraints:
+
+Implementing a subset of the spec
+---------------------------------
+
+If only producing (and not consuming) Arrow vectors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Any subset of the vector spec and the corresponding metadata can be implemented.
+
+If consuming and producing vectors
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There is a minimal subset of vectors to be supported.
+Production of a subset of vectors and their corresponding metadata is always fine.
+Consumption of vectors should at least convert the unsupported input vectors to
+the supported subset (for example Timestamp.millis to Timestamp.micros, or Int32
+to Int64).
+
+Extensibility
+-------------
+
+An execution engine implementor can also extend their memory representation
+with their own vectors internally as long as they are never exposed. Before
+sending data to another system expecting Arrow data, these custom vectors
+should be converted to a type that exists in the Arrow spec.
+An example of this is operating on compressed data.
+These custom vectors are not exchanged externally and there is no support for
+custom metadata.
diff --git a/docs/source/format/IPC.rst b/docs/source/format/IPC.rst
new file mode 100644
index 0000000000000..8cb74b87afcdc
--- /dev/null
+++ b/docs/source/format/IPC.rst
@@ -0,0 +1,237 @@
+.. Licensed to the Apache Software Foundation (ASF) under one
+.. or more contributor license agreements. See the NOTICE file
+.. distributed with this work for additional information
+.. regarding copyright ownership. The ASF licenses this file
+.. to you under the Apache License, Version 2.0 (the
+.. "License"); you may not use this file except in compliance
+.. with the License. You may obtain a copy of the License at
+
+.. http://www.apache.org/licenses/LICENSE-2.0
+
+.. Unless required by applicable law or agreed to in writing,
+.. software distributed under the License is distributed on an
+.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+.. KIND, either express or implied. See the License for the
+.. specific language governing permissions and limitations
+.. under the License.
+
+Interprocess messaging / communication (IPC)
+============================================
+
+Encapsulated message format
+---------------------------
+
+Data components in the stream and file formats are represented as encapsulated
+*messages* consisting of:
+
+* A length prefix indicating the metadata size
+* The message metadata as a `Flatbuffer`_
+* Padding bytes to an 8-byte boundary
+* The message body, which must be a multiple of 8 bytes
+
+Schematically, we have: ::
+
+    <metadata_size: int32>
+    <metadata_flatbuffer: bytes>
+    <padding>
+    <message body>
+
+The complete serialized message must be a multiple of 8 bytes so that messages
+can be relocated between streams. Otherwise the amount of padding between the
+metadata and the message body could be non-deterministic.
+
+The ``metadata_size`` includes the size of the flatbuffer plus padding. The
+``Message`` flatbuffer includes a version number, the particular message (as a
+flatbuffer union), and the size of the message body: ::
+
+    table Message {
+      version: org.apache.arrow.flatbuf.MetadataVersion;
+      header: MessageHeader;
+      bodyLength: long;
+    }
+
+Currently, we support 4 types of messages:
+
+* Schema
+* RecordBatch
+* DictionaryBatch
+* Tensor
+
+Streaming format
+----------------
+
+We provide a streaming format for record batches. It is presented as a sequence
+of encapsulated messages, each of which follows the format above. The schema
+comes first in the stream, and it is the same for all of the record batches
+that follow. If any fields in the schema are dictionary-encoded, one or more
+``DictionaryBatch`` messages will be included. ``DictionaryBatch`` and
+``RecordBatch`` messages may be interleaved, but before any dictionary key is used
+in a ``RecordBatch`` it should be defined in a ``DictionaryBatch``. ::
+
+    <SCHEMA>
+    <DICTIONARY 0>
+    ...
+    <DICTIONARY k - 1>
+    <RECORD BATCH 0>
+    ...
+    <DICTIONARY x DELTA>
+    ...
+    <DICTIONARY y DELTA>
+    ...
+    <RECORD BATCH n - 1>
+    <EOS [optional]: int32>
+
+When a stream reader implementation is reading a stream, after each message, it
+may read the next 4 bytes to determine how large the message metadata that
+follows is. Once the message flatbuffer is read, you can then read the message
+body.
+
+The stream writer can signal end-of-stream (EOS) either by writing a 0 length
+as an ``int32`` or simply closing the stream interface.
+
+File format
+-----------
+
+We define a "file format" supporting random access in a very similar format to
+the streaming format. The file starts and ends with a magic string ``ARROW1``
+(plus padding). What follows in the file is identical to the stream format. At
+the end of the file, we write a *footer* containing a redundant copy of the
+schema (which is a part of the streaming format) plus memory offsets and sizes
+for each of the data blocks in the file. This enables random access to any
+record batch in the file. See ``File.fbs`` for the precise details of the file
+footer.
+
+Schematically we have: ::
+
+    <magic number "ARROW1">
+    <empty padding bytes [to 8 byte boundary]>
+    <STREAMING FORMAT>
+    <FOOTER>
+    <FOOTER SIZE: int32>
+    <magic number "ARROW1">
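+
+As a non-normative illustration, the trailing footer size and magic number let
+a reader locate the footer with simple offset arithmetic once the file is in
+memory: ::
+
+    #include <cstdint>
+    #include <cstring>
+
+    // Start of the footer in a file of file_size bytes, per the layout above:
+    // the last 6 bytes are the magic "ARROW1", preceded by the footer size.
+    int64_t FooterOffset(const uint8_t* data, int64_t file_size) {
+      const int64_t magic_size = 6;  // strlen("ARROW1")
+      int32_t footer_size = 0;
+      std::memcpy(&footer_size, data + file_size - magic_size - sizeof(int32_t),
+                  sizeof(int32_t));
+      return file_size - magic_size - sizeof(int32_t) - footer_size;
+    }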
+ + +In the file format, there is no requirement that dictionary keys should be +defined in a ``DictionaryBatch`` before they are used in a ``RecordBatch``, as long +as the keys are defined somewhere in the file. + +RecordBatch body structure +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``RecordBatch`` metadata contains a depth-first (pre-order) flattened set of +field metadata and physical memory buffers (some comments from ``Message.fbs`` +have been shortened / removed): :: + + table RecordBatch { + length: long; + nodes: [FieldNode]; + buffers: [Buffer]; + } + + struct FieldNode { + length: long; + null_count: long; + } + + struct Buffer { + /// The relative offset into the shared memory page where the bytes for this + /// buffer starts + offset: long; + + /// The absolute length (in bytes) of the memory buffer. The memory is found + /// from offset (inclusive) to offset + length (non-inclusive). + length: long; + } + +In the context of a file, the ``page`` is not used, and the ``Buffer`` offsets use +as a frame of reference the start of the message body. So, while in a general +IPC setting these offsets may be anyplace in one or more shared memory regions, +in the file format the offsets start from 0. + +The location of a record batch and the size of the metadata block as well as +the body of buffers is stored in the file footer: :: + + struct Block { + offset: long; + metaDataLength: int; + bodyLength: long; + } + +The ``metaDataLength`` here includes the metadata length prefix, serialized +metadata, and any additional padding bytes, and by construction must be a +multiple of 8 bytes. + +Some notes about this + +* The ``Block`` offset indicates the starting byte of the record batch. +* The metadata length includes the flatbuffer size, the record batch metadata + flatbuffer, and any padding bytes + +Dictionary Batches +~~~~~~~~~~~~~~~~~~ + +Dictionaries are written in the stream and file formats as a sequence of record +batches, each having a single field. The complete semantic schema for a +sequence of record batches, therefore, consists of the schema along with all of +the dictionaries. The dictionary types are found in the schema, so it is +necessary to read the schema to first determine the dictionary types so that +the dictionaries can be properly interpreted. :: + + table DictionaryBatch { + id: long; + data: RecordBatch; + isDelta: boolean = false; + } + +The dictionary ``id`` in the message metadata can be referenced one or more times +in the schema, so that dictionaries can even be used for multiple fields. See +the :doc:`Layout` document for more about the semantics of +dictionary-encoded data. + +The dictionary ``isDelta`` flag allows dictionary batches to be modified +mid-stream. A dictionary batch with ``isDelta`` set indicates that its vector +should be concatenated with those of any previous batches with the same ``id``. A +stream which encodes one column, the list of strings +``["A", "B", "C", "B", "D", "C", "E", "A"]``, with a delta dictionary batch could +take the form: :: + + + + (0) "A" + (1) "B" + (2) "C" + + + 0 + 1 + 2 + 1 + + + (3) "D" + (4) "E" + + + 3 + 2 + 4 + 0 + EOS + +Tensor (Multi-dimensional Array) Message Format +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``Tensor`` message types provides a way to write a multidimensional array of +fixed-size values (such as a NumPy ndarray) using Arrow's shared memory +tools. Arrow implementations in general are not required to implement this data +format, though we provide a reference implementation in C++. 
+ +When writing a standalone encapsulated tensor message, we use the format as +indicated above, but additionally align the starting offset of the metadata as +well as the starting offset of the tensor body (if writing to a shared memory +region) to be multiples of 64 bytes: :: + + + + + + +.. _Flatbuffer: https://github.com/google/flatbuffers diff --git a/format/Layout.md b/docs/source/format/Layout.rst similarity index 53% rename from format/Layout.md rename to docs/source/format/Layout.rst index 80af1d3d37a3b..868a99b34f8d0 100644 --- a/format/Layout.md +++ b/docs/source/format/Layout.rst @@ -1,25 +1,25 @@ - +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. -# Arrow: Physical memory layout +Physical memory layout +====================== -## Definitions / Terminology +Definitions / Terminology +------------------------- Since different projects have used different words to describe various concepts, here is a small glossary to help disambiguate. @@ -35,21 +35,22 @@ concepts, here is a small glossary to help disambiguate. in bit width or byte width * Nested or parametric type: a data type whose full structure depends on one or more other child relative types. Two fully-specified nested types are equal - if and only if their child types are equal. For example, `List` is distinct - from `List` iff U and V are different relative types. + if and only if their child types are equal. For example, ``List`` is distinct + from ``List`` iff U and V are different relative types. * Relative type or simply type (unqualified): either a specific primitive type or a fully-specified nested type. When we say slot we mean a relative type value, not necessarily any physical storage region. * Logical type: A data type that is implemented using some relative (physical) type. For example, Decimal values are stored as 16 bytes in a fixed byte - size array. Similarly, strings can be stored as `List<1-byte>`. + size array. Similarly, strings can be stored as ``List<1-byte>``. * Parent and child arrays: names to express relationships between physical - value arrays in a nested type structure. For example, a `List`-type parent + value arrays in a nested type structure. For example, a ``List``-type parent array has a T-type array as its child (see more on lists below). * Leaf node or leaf: A primitive value array that may or may not be a child array of some array with a nested type. -## Requirements, goals, and non-goals +Requirements, goals, and non-goals +---------------------------------- Base requirements @@ -59,7 +60,7 @@ Base requirements proprietary systems that utilize the open source components. * All array slots are accessible in constant time, with complexity growing linearly in the nesting level -* Capable of representing fully-materialized and decoded / decompressed [Parquet][5] +* Capable of representing fully-materialized and decoded / decompressed `Parquet`_ data * It is required to have all the contiguous memory buffers in an IPC payload aligned at 8-byte boundaries. In other words, each buffer must start at @@ -75,14 +76,16 @@ Base requirements be migrated to a different address space (e.g. via a memcpy-type of operation) without altering their contents. 
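+
+As a small non-normative illustration of these alignment requirements, the
+padded length of a buffer can be computed with integer arithmetic: ::
+
+    #include <cstdint>
+
+    // Round length up to a multiple of 8 (required) or 64 (recommended) bytes.
+    int64_t PaddedLength(int64_t length, int64_t alignment) {
+      return ((length + alignment - 1) / alignment) * alignment;
+    }
+
+    // PaddedLength(5, 8) == 8, PaddedLength(100, 64) == 128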
-## Goals (for this document) +Goals (for this document) +------------------------- * To describe relative types (physical value types and a preliminary set of nested types) sufficient for an unambiguous implementation * Memory layout and random access patterns for each relative type * Null value representation -## Non-goals (for this document) +Non-goals (for this document) +----------------------------- * To enumerate or specify logical types that can be implemented as primitive (fixed-width) value types. For example: signed and unsigned integers, @@ -98,7 +101,8 @@ Base requirements * Any memory management or reference counting subsystem * To enumerate or specify types of encodings or compression support -## Byte Order ([Endianness][3]) +Byte Order (`Endianness`_) +--------------------------- The Arrow format is little endian by default. The Schema metadata has an endianness field indicating endianness of RecordBatches. @@ -109,7 +113,8 @@ that does not match the underlying system. The reference implementation is focus Little Endian and provides tests for it. Eventually we may provide automatic conversion via byte swapping. -## Alignment and Padding +Alignment and Padding +--------------------- As noted above, all buffers must be aligned in memory at 8-byte boundaries and padded to a length that is a multiple of 8 bytes. The alignment requirement follows best @@ -117,10 +122,10 @@ practices for optimized memory access: * Elements in numeric arrays will be guaranteed to be retrieved via aligned access. * On some architectures alignment can help limit partially used cache lines. -* 64 byte alignment is recommended by the [Intel performance guide][2] for +* 64 byte alignment is recommended by the `Intel performance guide`_ for data-structures over 64 bytes (which will be a common case for Arrow Arrays). -Recommending padding to a multiple of 64 bytes allows for using [SIMD][4] instructions +Recommending padding to a multiple of 64 bytes allows for using `SIMD`_ instructions consistently in loops without additional conditional checks. This should allow for simpler, efficient and CPU cache-friendly code. The specific padding length was chosen because it matches the largest known @@ -129,27 +134,30 @@ words, we can load the entire 64-byte buffer into a 512-bit wide SIMD register and get data-level parallelism on all the columnar values packed into the 64-byte buffer. Guaranteed padding can also allow certain compilers to generate more optimized code directly (e.g. One can safely use Intel's -`-qopt-assume-safe-padding`). +``-qopt-assume-safe-padding``). Unless otherwise noted, padded bytes do not need to have a specific value. -## Array lengths +Array lengths +------------- Array lengths are represented in the Arrow metadata as a 64-bit signed integer. An implementation of Arrow is considered valid even if it only supports lengths up to the maximum 32-bit signed integer, though. If using Arrow in a multi-language environment, we recommend limiting lengths to -231 - 1 elements or less. Larger data sets can be represented using +2 :sup:`31` - 1 elements or less. Larger data sets can be represented using multiple array chunks. -## Null count +Null count +---------- The number of null value slots is a property of the physical array and considered part of the data structure. The null count is represented in the Arrow metadata as a 64-bit signed integer, as it may be as large as the array length. 
-## Null bitmaps +Null bitmaps +------------ Any relative type can have null value slots, whether primitive or nested type. @@ -159,25 +167,21 @@ and large enough to have at least 1 bit for each array slot. Whether any array slot is valid (non-null) is encoded in the respective bits of -this bitmap. A 1 (set bit) for index `j` indicates that the value is not null, +this bitmap. A 1 (set bit) for index ``j`` indicates that the value is not null, while a 0 (bit not set) indicates that it is null. Bitmaps are to be -initialized to be all unset at allocation time (this includes padding). +initialized to be all unset at allocation time (this includes padding).:: -``` -is_valid[j] -> bitmap[j / 8] & (1 << (j % 8)) -``` + is_valid[j] -> bitmap[j / 8] & (1 << (j % 8)) -We use [least-significant bit (LSB) numbering][1] (also known as +We use `least-significant bit (LSB) numbering`_ (also known as bit-endianness). This means that within a group of 8 bits, we read -right-to-left: +right-to-left: :: -``` -values = [0, 1, null, 2, null, 3] + values = [0, 1, null, 2, null, 3] -bitmap -j mod 8 7 6 5 4 3 2 1 0 - 0 0 1 0 1 0 1 1 -``` + bitmap + j mod 8 7 6 5 4 3 2 1 0 + 0 0 1 0 1 0 1 1 Arrays having a 0 null count may choose to not allocate the null bitmap. Implementations may choose to always allocate one anyway as a matter of @@ -186,7 +190,8 @@ convenience, but this should be noted when memory is being shared. Nested type arrays have their own null bitmap and null count regardless of the null count and null bits of their child arrays. -## Primitive value arrays +Primitive value arrays +---------------------- A primitive value array represents a fixed-length array of values each having the same physical slot width typically measured in bytes, though the spec also @@ -200,66 +205,64 @@ The associated null bitmap is contiguously allocated (as described above) but does not need to be adjacent in memory to the values buffer. 
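+
+As a non-normative illustration, the validity test given in the null bitmap
+section above translates directly into C++: ::
+
+    #include <cstdint>
+
+    // True if slot j is valid (not null), using LSB bit numbering.
+    bool IsValid(const uint8_t* bitmap, int64_t j) {
+      return (bitmap[j / 8] >> (j % 8)) & 1;
+    }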
-### Example Layout: Int32 Array -For example a primitive array of int32s: +Example Layout: Int32 Array +~~~~~~~~~~~~~~~~~~~~~~~~~~~ -[1, null, 2, 4, 8] +For example a primitive array of int32s: :: -Would look like: + [1, null, 2, 4, 8] -``` -* Length: 5, Null count: 1 -* Null bitmap buffer: +Would look like: :: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00011101 | 0 (padding) | + * Length: 5, Null count: 1 + * Null bitmap buffer: -* Value Buffer: + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + | 00011101 | 0 (padding) | + + * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | unspecified | 2 | 4 | 8 | unspecified | -``` + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | unspecified | 2 | 4 | 8 | unspecified | -### Example Layout: Non-null int32 Array +Example Layout: Non-null int32 Array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -[1, 2, 3, 4, 8] has two possible layouts: +``[1, 2, 3, 4, 8]`` has two possible layouts: :: -``` -* Length: 5, Null count: 0 -* Null bitmap buffer: + * Length: 5, Null count: 0 + * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-63 | - |--------------------------|-----------------------| - | 00011111 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00011111 | 0 (padding) | -* Value Buffer: + * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | unspecified | -``` + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | 3 | 4 | 8 | unspecified | -or with the bitmap elided: +or with the bitmap elided: :: -``` -* Length 5, Null count: 0 -* Null bitmap buffer: Not required -* Value Buffer: + * Length 5, Null count: 0 + * Null bitmap buffer: Not required + * Value Buffer: - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 1 | 2 | 3 | 4 | 8 | unspecified | -``` + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | bytes 12-15 | bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 1 | 2 | 3 | 4 | 8 | unspecified | -## List type +List type +--------- List is a nested type in which each array slot contains a variable-size sequence of values all having the same relative type (heterogeneity can be achieved through unions, described later). -A list type is specified like `List`, where `T` is any relative type +A list type is specified like ``List``, where ``T`` is any relative type (primitive or nested). A list-array is represented by the combination of the following: @@ -267,93 +270,92 @@ A list-array is represented by the combination of the following: * A values array, a child array of type T. T may also be a nested type. * An offsets buffer containing 32-bit signed integers with length equal to the length of the top-level array plus one. 
Note that this limits the size of the - values array to 231-1. + values array to 2 :sup:`31` -1. The offsets array encodes a start position in the values array, and the length of the value in each slot is computed using the first difference with the next element in the offsets array. For example, the position and length of slot j is -computed as: +computed as: :: -``` -slot_position = offsets[j] -slot_length = offsets[j + 1] - offsets[j] // (for 0 <= j < length) -``` + slot_position = offsets[j] + slot_length = offsets[j + 1] - offsets[j] // (for 0 <= j < length) The first value in the offsets array is 0, and the last element is the length of the values array. -### Example Layout: `List` Array -Let's consider an example, the type `List`, where Char is a 1-byte +Example Layout: ``List`` Array +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Let's consider an example, the type ``List``, where Char is a 1-byte logical type. -For an array of length 4 with respective values: +For an array of length 4 with respective values: :: -[['j', 'o', 'e'], null, ['m', 'a', 'r', 'k'], []] + [['j', 'o', 'e'], null, ['m', 'a', 'r', 'k'], []] -will have the following representation: +will have the following representation: :: -``` -* Length: 4, Null count: 1 -* Null bitmap buffer: + * Length: 4, Null count: 1 + * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-63 | - |--------------------------|-----------------------| - | 00001101 | 0 (padding) | + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001101 | 0 (padding) | -* Offsets buffer (int32) + * Offsets buffer (int32) - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | - |------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 3 | 3 | 7 | 7 | unspecified | + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-63 | + |------------|-------------|-------------|-------------|-------------|-------------| + | 0 | 3 | 3 | 7 | 7 | unspecified | -* Values array (char array): - * Length: 7, Null count: 0 - * Null bitmap buffer: Not required + * Values array (char array): + * Length: 7, Null count: 0 + * Null bitmap buffer: Not required - | Bytes 0-6 | Bytes 7-63 | - |------------|-------------| - | joemark | unspecified | -``` + | Bytes 0-6 | Bytes 7-63 | + |------------|-------------| + | joemark | unspecified | -### Example Layout: `List>` -[[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], [[9, 10]]] +Example Layout: ``List>`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -will be be represented as follows: +``[[[1, 2], [3, 4]], [[5, 6, 7], null, [8]], [[9, 10]]]`` -``` -* Length 3 -* Nulls count: 0 -* Null bitmap buffer: Not required -* Offsets buffer (int32) +will be be represented as follows: :: - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - |------------|------------|------------|-------------|-------------| - | 0 | 2 | 5 | 6 | unspecified | + * Length 3 + * Nulls count: 0 + * Null bitmap buffer: Not required + * Offsets buffer (int32) -* Values array (`List`) - * Length: 6, Null count: 1 - * Null bitmap buffer: + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|------------|------------|-------------|-------------| + | 0 | 2 | 5 | 6 | unspecified | - | Byte 0 (validity bitmap) | Bytes 1-63 | - |--------------------------|-------------| - | 00110111 | 0 (padding) | + * Values array (`List`) + * Length: 6, Null count: 1 + * Null bitmap buffer: - * Offsets buffer (int32) + 
| Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-------------| + | 00110111 | 0 (padding) | - | Bytes 0-27 | Bytes 28-63 | - |----------------------|-------------| - | 0, 2, 4, 7, 7, 8, 10 | unspecified | + * Offsets buffer (int32) - * Values array (bytes): - * Length: 10, Null count: 0 - * Null bitmap buffer: Not required + | Bytes 0-27 | Bytes 28-63 | + |----------------------|-------------| + | 0, 2, 4, 7, 7, 8, 10 | unspecified | + + * Values array (bytes): + * Length: 10, Null count: 0 + * Null bitmap buffer: Not required - | Bytes 0-9 | Bytes 10-63 | - |-------------------------------|-------------| - | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified | -``` + | Bytes 0-9 | Bytes 10-63 | + |-------------------------------|-------------| + | 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 | unspecified | -## Struct type +Struct type +----------- A struct is a nested type parameterized by an ordered sequence of relative types (which can all be distinct), called its fields. @@ -367,69 +369,66 @@ A struct array must still have an allocated null bitmap, if it has one or more n Physically, a struct type has one child array for each field. The child arrays are independent and need not be adjacent to each other in memory. For example, the struct (field names shown here as strings for illustration -purposes) +purposes):: -``` -Struct < - name: String (= List), - age: Int32 -> -``` + Struct < + name: String (= List), + age: Int32 + > -has two child arrays, one List array (layout as above) and one 4-byte -primitive value array having Int32 logical type. +has two child arrays, one ``List`` array (layout as above) and one 4-byte +primitive value array having ``Int32`` logical type. -### Example Layout: `Struct, Int32>`: -The layout for [{'joe', 1}, {null, 2}, null, {'mark', 4}] would be: +Example Layout: ``Struct, Int32>`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``` -* Length: 4, Null count: 1 -* Null bitmap buffer: +The layout for ``[{'joe', 1}, {null, 2}, null, {'mark', 4}]`` would be: :: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00001011 | 0 (padding) | - -* Children arrays: - * field-0 array (`List`): - * Length: 4, Null count: 2 + * Length: 4, Null count: 1 * Null bitmap buffer: - | Byte 0 (validity bitmap) | Bytes 1-63 | - |--------------------------|-----------------------| - | 00001001 | 0 (padding) | - - * Offsets buffer: + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + | 00001011 | 0 (padding) | - | Bytes 0-19 | - |----------------| - | 0, 3, 3, 3, 7 | + * Children arrays: + * field-0 array (`List`): + * Length: 4, Null count: 2 + * Null bitmap buffer: - * Values array: - * Length: 7, Null count: 0 - * Null bitmap buffer: Not required + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001001 | 0 (padding) | - * Value buffer: + * Offsets buffer: - | Bytes 0-6 | + | Bytes 0-19 | |----------------| - | joemark | + | 0, 3, 3, 3, 7 | - * field-1 array (int32 array): - * Length: 4, Null count: 1 - * Null bitmap buffer: + * Values array: + * Length: 7, Null count: 0 + * Null bitmap buffer: Not required - | Byte 0 (validity bitmap) | Bytes 1-63 | - |--------------------------|-----------------------| - | 00001011 | 0 (padding) | + * Value buffer: - * Value Buffer: + | Bytes 0-6 | + |----------------| + | joemark | - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | - 
|------------|-------------|-------------|-------------|-------------| - | 1 | 2 | unspecified | 4 | unspecified | + * field-1 array (int32 array): + * Length: 4, Null count: 1 + * Null bitmap buffer: -``` + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00001011 | 0 (padding) | + + * Value Buffer: + + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-63 | + |------------|-------------|-------------|-------------|-------------| + | 1 | 2 | unspecified | 4 | unspecified | While a struct does not have physical storage for each of its semantic slots (i.e. each scalar C-like struct), an entire struct slot can be set to null via @@ -444,7 +443,8 @@ for the null struct but are 'hidden' from the consumer by the parent array's null bitmap. However, when treated independently corresponding values of the children array will be non-null. -## Dense union type +Dense union type +---------------- A dense union is semantically similar to a struct, and contains an ordered sequence of relative types. While a struct contains multiple arrays, a union is @@ -466,58 +466,58 @@ of overhead for each value. Its physical layout is as follows: offsets for each child value array must be in order / increasing. Critically, the dense union allows for minimal overhead in the ubiquitous -union-of-structs with non-overlapping-fields use case (`Union`) +union-of-structs with non-overlapping-fields use case (``Union``) -### Example Layout: Dense union +Example Layout: Dense union +~~~~~~~~~~~~~~~~~~~~~~~~~~~ An example layout for logical union of: -`Union` having the values: -[{f=1.2}, null, {f=3.4}, {i=5}] +``Union`` having the values: +``[{f=1.2}, null, {f=3.4}, {i=5}]``:: -``` -* Length: 4, Null count: 1 -* Null bitmap buffer: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - |00001101 | 0 (padding) | + * Length: 4, Null count: 1 + * Null bitmap buffer: + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + |00001101 | 0 (padding) | -* Types buffer: + * Types buffer: - |Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | - |---------|-------------|----------|----------|-------------| - | 0 | unspecified | 0 | 1 | unspecified | + |Byte 0 | Byte 1 | Byte 2 | Byte 3 | Bytes 4-63 | + |---------|-------------|----------|----------|-------------| + | 0 | unspecified | 0 | 1 | unspecified | -* Offset buffer: + * Offset buffer: - |Byte 0-3 | Byte 4-7 | Byte 8-11 | Byte 12-15 | Bytes 16-63 | - |---------|-------------|-----------|------------|-------------| - | 0 | unspecified | 1 | 0 | unspecified | + |Byte 0-3 | Byte 4-7 | Byte 8-11 | Byte 12-15 | Bytes 16-63 | + |---------|-------------|-----------|------------|-------------| + | 0 | unspecified | 1 | 0 | unspecified | -* Children arrays: - * Field-0 array (f: float): - * Length: 2, nulls: 0 - * Null bitmap buffer: Not required + * Children arrays: + * Field-0 array (f: float): + * Length: 2, nulls: 0 + * Null bitmap buffer: Not required - * Value Buffer: + * Value Buffer: - | Bytes 0-7 | Bytes 8-63 | - |-----------|-------------| - | 1.2, 3.4 | unspecified | + | Bytes 0-7 | Bytes 8-63 | + |-----------|-------------| + | 1.2, 3.4 | unspecified | - * Field-1 array (i: int32): - * Length: 1, nulls: 0 - * Null bitmap buffer: Not required + * Field-1 array (i: int32): + * Length: 1, nulls: 0 + * Null bitmap buffer: Not required - * Value Buffer: + * Value Buffer: - | Bytes 0-3 | Bytes 4-63 | - |-----------|-------------| - | 5 | 
unspecified | -``` + | Bytes 0-3 | Bytes 4-63 | + |-----------|-------------| + | 5 | unspecified | -## Sparse union type +Sparse union type +----------------- A sparse union has the same structure as a dense union, with the omission of the offsets array. In this case, the child arrays are each equal in length to @@ -529,75 +529,75 @@ union, it has some advantages that may be desirable in certain use cases: * A sparse union is more amenable to vectorized expression evaluation in some use cases. * Equal-length arrays can be interpreted as a union by only defining the types array. -### Example layout: `SparseUnion>` +Example layout: ``SparseUnion>`` +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -For the union array: +For the union array: :: -[{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, {u2='mark'}] + [{u0=5}, {u1=1.2}, {u2='joe'}, {u1=3.4}, {u0=4}, {u2='mark'}] -will have the following layout: -``` -* Length: 6, Null count: 0 -* Null bitmap buffer: Not required +will have the following layout: :: -* Types buffer: + * Length: 6, Null count: 0 + * Null bitmap buffer: Not required - | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Bytes 6-63 | - |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | 0 | 1 | 2 | 1 | 0 | 2 | unspecified (padding) | + * Types buffer: -* Children arrays: + | Byte 0 | Byte 1 | Byte 2 | Byte 3 | Byte 4 | Byte 5 | Bytes 6-63 | + |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 0 | 1 | 2 | 1 | 0 | 2 | unspecified (padding) | - * u0 (Int32): - * Length: 6, Null count: 4 - * Null bitmap buffer: + * Children arrays: - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - |00010001 | 0 (padding) | + * u0 (Int32): + * Length: 6, Null count: 4 + * Null bitmap buffer: - * Value buffer: + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + |00010001 | 0 (padding) | - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | - |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | + * Value buffer: - * u1 (float): - * Length: 6, Null count: 4 - * Null bitmap buffer: + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | + |------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | 5 | unspecified | unspecified | unspecified | 4 | unspecified | unspecified (padding) | - |Byte 0 (validity bitmap) | Bytes 1-63 | - |-------------------------|-----------------------| - | 00001010 | 0 (padding) | + * u1 (float): + * Length: 6, Null count: 4 + * Null bitmap buffer: - * Value buffer: + |Byte 0 (validity bitmap) | Bytes 1-63 | + |-------------------------|-----------------------| + | 00001010 | 0 (padding) | - |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-63 | - |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| - | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | + * Value buffer: - * u2 (`List`) - * Length: 6, Null count: 4 - * Null bitmap buffer: + |Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 
| Bytes 24-63 | + |-------------|-------------|-------------|-------------|-------------|--------------|-----------------------| + | unspecified | 1.2 | unspecified | 3.4 | unspecified | unspecified | unspecified (padding) | - | Byte 0 (validity bitmap) | Bytes 1-63 | - |--------------------------|-----------------------| - | 00100100 | 0 (padding) | + * u2 (`List`) + * Length: 6, Null count: 4 + * Null bitmap buffer: - * Offsets buffer (int32) + | Byte 0 (validity bitmap) | Bytes 1-63 | + |--------------------------|-----------------------| + | 00100100 | 0 (padding) | - | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | - |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| - | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified | + * Offsets buffer (int32) - * Values array (char array): - * Length: 7, Null count: 0 - * Null bitmap buffer: Not required + | Bytes 0-3 | Bytes 4-7 | Bytes 8-11 | Bytes 12-15 | Bytes 16-19 | Bytes 20-23 | Bytes 24-27 | Bytes 28-63 | + |------------|-------------|-------------|-------------|-------------|-------------|-------------|-------------| + | 0 | 0 | 0 | 3 | 3 | 3 | 7 | unspecified | - | Bytes 0-7 | Bytes 8-63 | - |------------|-----------------------| - | joemark | unspecified (padding) | -``` + * Values array (char array): + * Length: 7, Null count: 0 + * Null bitmap buffer: Not required + + | Bytes 0-7 | Bytes 8-63 | + |------------|-----------------------| + | joemark | unspecified (padding) | Note that nested types in a sparse union must be internally consistent (e.g. see the List in the diagram), i.e. random access at any index j @@ -610,55 +610,55 @@ even if the null bitmap of the parent union array indicates the slot is null. Additionally, a child array may have a non-null slot even if the types array indicates that a slot contains a different type at the index. -## Dictionary encoding +Dictionary encoding +------------------- When a field is dictionary encoded, the values are represented by an array of Int32 representing the index of the value in the dictionary. The Dictionary is received as one or more DictionaryBatches with the id referenced by a -dictionary attribute defined in the metadata ([Message.fbs][7]) in the Field +dictionary attribute defined in the metadata (Message.fbs) in the Field table. The dictionary has the same layout as the type of the field would dictate. Each entry in the dictionary can be accessed by its index in the DictionaryBatches. When a Schema references a Dictionary id, it must send at least one DictionaryBatch for this id. 
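+
+Logically, reading a dictionary-encoded field amounts to an index lookup into
+the dictionary. A minimal non-normative sketch (real implementations keep the
+data in Arrow buffers rather than materializing values): ::
+
+    #include <cstdint>
+    #include <string>
+    #include <vector>
+
+    // Resolve dictionary indices to their values.
+    std::vector<std::string> Decode(const std::vector<int32_t>& indices,
+                                    const std::vector<std::string>& dictionary) {
+      std::vector<std::string> out;
+      out.reserve(indices.size());
+      for (int32_t i : indices) out.push_back(dictionary[i]);
+      return out;
+    }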
-As an example, you could have the following data: -``` -type: List - -[ - ['a', 'b'], - ['a', 'b'], - ['a', 'b'], - ['c', 'd', 'e'], - ['c', 'd', 'e'], - ['c', 'd', 'e'], - ['c', 'd', 'e'], - ['a', 'b'] -] -``` -In dictionary-encoded form, this could appear as: -``` -data List (dictionary-encoded, dictionary id i) -indices: [0, 0, 0, 1, 1, 1, 0] - -dictionary i - -type: List - -[ - ['a', 'b'], - ['c', 'd', 'e'], -] -``` - -## References - -Apache Drill Documentation - [Value Vectors][6] - -[1]: https://en.wikipedia.org/wiki/Bit_numbering -[2]: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors -[3]: https://en.wikipedia.org/wiki/Endianness -[4]: https://software.intel.com/en-us/node/600110 -[5]: https://parquet.apache.org/documentation/latest/ -[6]: https://drill.apache.org/docs/value-vectors/ -[7]: https://github.com/apache/arrow/blob/master/format/Message.fbs +As an example, you could have the following data: :: + + type: List + + [ + ['a', 'b'], + ['a', 'b'], + ['a', 'b'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['c', 'd', 'e'], + ['a', 'b'] + ] + +In dictionary-encoded form, this could appear as: :: + + data List (dictionary-encoded, dictionary id i) + indices: [0, 0, 0, 1, 1, 1, 0] + + dictionary i + + type: List + + [ + ['a', 'b'], + ['c', 'd', 'e'], + ] + +References +---------- + +Apache Drill Documentation - `Value Vectors`_ + +.. _least-significant bit (LSB) numbering: https://en.wikipedia.org/wiki/Bit_numbering +.. _Intel performance guide: https://software.intel.com/en-us/articles/practical-intel-avx-optimization-on-2nd-generation-intel-core-processors +.. _Endianness: https://en.wikipedia.org/wiki/Endianness +.. _SIMD: https://software.intel.com/en-us/node/600110 +.. _Parquet: https://parquet.apache.org/documentation/latest/ +.. _Value Vectors: https://drill.apache.org/docs/value-vectors/ diff --git a/docs/source/format/Metadata.rst b/docs/source/format/Metadata.rst new file mode 100644 index 0000000000000..4ed82e0078e2c --- /dev/null +++ b/docs/source/format/Metadata.rst @@ -0,0 +1,394 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Metadata: Logical types, schemas, data headers +============================================== + +This is documentation for the Arrow metadata specification, which enables +systems to communicate the + +* Logical array types (which are implemented using the physical memory layouts + specified in :doc:`Layout`) + +* Schemas for table-like collections of Arrow data structures + +* "Data headers" indicating the physical locations of memory buffers sufficient + to reconstruct a Arrow data structures without copying memory. 
+ +Canonical implementation +------------------------ + +We are using `Flatbuffers`_ for low-overhead reading and writing of the Arrow +metadata. See ``Message.fbs``. + +Schemas +------- + +The ``Schema`` type describes a table-like structure consisting of any number of +Arrow arrays, each of which can be interpreted as a column in the table. A +schema by itself does not describe the physical structure of any particular set +of data. + +A schema consists of a sequence of **fields**, which are metadata describing +the columns. The Flatbuffers IDL for a field is: :: + + table Field { + // Name is not required, in i.e. a List + name: string; + nullable: bool; + type: Type; + + // Present only if the field is dictionary encoded + dictionary: DictionaryEncoding; + + // children apply only to Nested data types like Struct, List and Union + children: [Field]; + + // User-defined metadata + custom_metadata: [ KeyValue ]; + } + +The ``type`` is the logical type of the field. Nested types, such as List, +Struct, and Union, have a sequence of child fields. + +A JSON representation of the schema is also provided: + +Field: :: + + { + "name" : "name_of_the_field", + "nullable" : false, + "type" : /* Type */, + "children" : [ /* Field */ ], + } + +Type: :: + + { + "name" : "null|struct|list|union|int|floatingpoint|utf8|binary|fixedsizebinary|bool|decimal|date|time|timestamp|interval" + // fields as defined in the Flatbuffer depending on the type name + } + +Union: :: + + { + "name" : "union", + "mode" : "Sparse|Dense", + "typeIds" : [ /* integer */ ] + } + +The ``typeIds`` field in the Union are the codes used to denote each type, which +may be different from the index of the child array. This is so that the union +type ids do not have to be enumerated from 0. + +Int: :: + + { + "name" : "int", + "bitWidth" : /* integer */, + "isSigned" : /* boolean */ + } + +FloatingPoint: :: + + { + "name" : "floatingpoint", + "precision" : "HALF|SINGLE|DOUBLE" + } + +Decimal: :: + + { + "name" : "decimal", + "precision" : /* integer */, + "scale" : /* integer */ + } + +Timestamp: :: + + { + "name" : "timestamp", + "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND" + } + +Date: :: + + { + "name" : "date", + "unit" : "DAY|MILLISECOND" + } + +Time: :: + + { + "name" : "time", + "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND", + "bitWidth": /* integer: 32 or 64 */ + } + +Interval: :: + + { + "name" : "interval", + "unit" : "YEAR_MONTH|DAY_TIME" + } + +Schema: :: + + { + "fields" : [ + /* Field */ + ] + } + +Record data headers +------------------- + +A record batch is a collection of top-level named, equal length Arrow arrays +(or vectors). If one of the arrays contains nested data, its child arrays are +not required to be the same length as the top-level arrays. + +One can be thought of as a realization of a particular schema. The metadata +describing a particular record batch is called a "data header". Here is the +Flatbuffers IDL for a record batch data header: :: + + table RecordBatch { + length: long; + nodes: [FieldNode]; + buffers: [Buffer]; + } + +The ``RecordBatch`` metadata provides for record batches with length exceeding +2 :sup:`31` - 1, but Arrow implementations are not required to implement support +beyond this size. + +The ``nodes`` and ``buffers`` fields are produced by a depth-first traversal / +flattening of a schema (possibly containing nested types) for a given in-memory +data set. 
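+
+As a non-normative illustration, the C++ implementation realizes a schema as a
+record batch by pairing it with equal-length arrays via
+``arrow::RecordBatch::Make`` (the field name ``f0`` below is arbitrary): ::
+
+    #include <arrow/api.h>
+
+    // Pair a schema with one equal-length column to form a record batch.
+    std::shared_ptr<arrow::RecordBatch> MakeBatch(
+        std::shared_ptr<arrow::Array> array) {
+      auto schema = arrow::schema({arrow::field("f0", arrow::int32())});
+      return arrow::RecordBatch::Make(schema, array->length(), {array});
+    }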
+
+Buffers
+~~~~~~~
+
+A buffer is metadata describing a contiguous memory region relative to some
+virtual address space. This may include:
+
+* Shared memory, e.g. a memory-mapped file
+* An RPC message received in-memory
+* Data in a file
+
+The key form of the Buffer type is: ::
+
+    struct Buffer {
+      offset: long;
+      length: long;
+    }
+
+In the context of a record batch, each field has some number of buffers
+associated with it, which are derived from their physical memory layout.
+
+Each logical type (separate from its children, if it is a nested type) has a
+deterministic number of buffers associated with it. These will be specified in
+the logical types section.
+
+Field metadata
+~~~~~~~~~~~~~~
+
+The ``FieldNode`` values contain metadata about each level in a nested type
+hierarchy. ::
+
+    struct FieldNode {
+      /// The number of value slots in the Arrow array at this level of a nested
+      /// tree
+      length: long;
+
+      /// The number of observed nulls.
+      null_count: long;
+    }
+
+The ``FieldNode`` metadata provides for fields with length exceeding 2 :sup:`31` - 1,
+but Arrow implementations are not required to implement support for large
+arrays.
+
+Flattening of nested data
+-------------------------
+
+Nested types are flattened in the record batch in depth-first order. When
+visiting each field in the nested type tree, the metadata is appended to the
+top-level ``fields`` array and the buffers associated with that field (but not
+its children) are appended to the ``buffers`` array.
+
+For example, let's consider the schema ::
+
+    col1: Struct<a: Int32, b: List<Int64>, c: Float64>
+    col2: Utf8
+
+The flattened version of this is: ::
+
+    FieldNode 0: Struct name='col1'
+    FieldNode 1: Int32 name='a'
+    FieldNode 2: List name='b'
+    FieldNode 3: Int64 name='item'  # arbitrary
+    FieldNode 4: Float64 name='c'
+    FieldNode 5: Utf8 name='col2'
+
+For the buffers produced, we would have the following (as described in more
+detail for each type below): ::
+
+    buffer 0: field 0 validity bitmap
+
+    buffer 1: field 1 validity bitmap
+    buffer 2: field 1 values
+
+    buffer 3: field 2 validity bitmap
+    buffer 4: field 2 list offsets
+
+    buffer 5: field 3 validity bitmap
+    buffer 6: field 3 values
+
+    buffer 7: field 4 validity bitmap
+    buffer 8: field 4 values
+
+    buffer 9: field 5 validity bitmap
+    buffer 10: field 5 offsets
+    buffer 11: field 5 data
+
+Logical types
+-------------
+
+A logical type consists of a type name and metadata along with an explicit
+mapping to a physical memory representation. These may fall into some different
+categories:
+
+* Types represented as fixed-width primitive arrays (for example: C-style
+  integers and floating point numbers)
+* Types having equivalent memory layout to a physical nested type (e.g. strings
+  use the list representation, but logically are not nested types)
+
+Integers
+~~~~~~~~
+
+In the first version of Arrow we provide the standard 8-bit through 64-bit
+C integer types, both signed and unsigned:
+
+* Signed types: Int8, Int16, Int32, Int64
+* Unsigned types: UInt8, UInt16, UInt32, UInt64
+
+The IDL looks like: ::
+
+    table Int {
+      bitWidth: int;
+      is_signed: bool;
+    }
+
+The integer endianness is currently set globally at the schema level. If a
+schema is set to be little-endian, then all integer types occurring within must
+be little-endian. Integers that are part of other data representations, such as
+list offsets and union types, must have the same endianness as the entire
+record batch.
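+
+For reference, each ``Int`` combination corresponds to a type factory function
+in the C++ implementation (an illustration of the mapping, not part of the
+metadata specification): ::
+
+    #include <arrow/api.h>
+
+    // bitWidth=32, is_signed=true
+    std::shared_ptr<arrow::DataType> signed32 = arrow::int32();
+    // bitWidth=8, is_signed=false
+    std::shared_ptr<arrow::DataType> unsigned8 = arrow::uint8();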
+ +Floating point numbers +~~~~~~~~~~~~~~~~~~~~~~ + +We provide 3 types of floating point numbers as fixed bit-width primitive array + +- Half precision, 16-bit width +- Single precision, 32-bit width +- Double precision, 64-bit width + +The IDL looks like: :: + + enum Precision:int {HALF, SINGLE, DOUBLE} + + table FloatingPoint { + precision: Precision; + } + +Boolean +~~~~~~~ + +The Boolean logical type is represented as a 1-bit wide primitive physical +type. The bits are numbered using least-significant bit (LSB) ordering. + +Like other fixed bit-width primitive types, boolean data appears as 2 buffers +in the data header (one bitmap for the validity vector and one for the values). + +List +~~~~ + +The ``List`` logical type is the logical (and identically-named) counterpart to +the List physical type. + +In data header form, the list field node contains 2 buffers: + +* Validity bitmap +* List offsets + +The buffers associated with a list's child field are handled recursively +according to the child logical type (e.g. ``List`` vs. ``List``). + +Utf8 and Binary +~~~~~~~~~~~~~~~ + +We specify two logical types for variable length bytes: + +* ``Utf8`` data is Unicode values with UTF-8 encoding +* ``Binary`` is any other variable length bytes + +These types both have the same memory layout as the nested type ``List``, +with the constraint that the inner bytes can contain no null values. From a +logical type perspective they are primitive, not nested types. + +In data header form, while ``List`` would appear as 2 field nodes (``List`` +and ``UInt8``) and 4 buffers (2 for each of the nodes, as per above), these types +have a simplified representation single field node (of ``Utf8`` or ``Binary`` +logical type, which have no children) and 3 buffers: + +* Validity bitmap +* List offsets +* Byte data + +Decimal +~~~~~~~ + +Decimals are represented as a 2's complement 128-bit (16 byte) signed integer +in little-endian byte order. + +Timestamp +~~~~~~~~~ + +All timestamps are stored as a 64-bit integer, with one of four unit +resolutions: second, millisecond, microsecond, and nanosecond. + +Date +~~~~ + +We support two different date types: + +* Days since the UNIX epoch as a 32-bit integer +* Milliseconds since the UNIX epoch as a 64-bit integer + +Time +~~~~ + +Time supports the same unit resolutions: second, millisecond, microsecond, and +nanosecond. We represent time as the smallest integer accommodating the +indicated unit. For second and millisecond: 32-bit, for the others 64-bit. + +Dictionary encoding +------------------- + +.. _Flatbuffers: http://github.com/google/flatbuffers diff --git a/docs/source/format/README.rst b/docs/source/format/README.rst new file mode 100644 index 0000000000000..f2f770bdc95c1 --- /dev/null +++ b/docs/source/format/README.rst @@ -0,0 +1,53 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. 
specific language governing permissions and limitations +.. under the License. + +Arrow specification documents +============================= + +Currently, the Arrow specification consists of these pieces: + +- Metadata specification (see :doc:`Metadata`) +- Physical memory layout specification (see :doc:`Layout`) +- Logical Types, Schemas, and Record Batch Metadata (see Schema.fbs) +- Encapsulated Messages (see Message.fbs) +- Mechanics of messaging between Arrow systems (IPC, RPC, etc.) (see :doc:`IPC`) +- Tensor (Multi-dimensional array) Metadata (see Tensor.fbs) + +The metadata currently uses Google's `flatbuffers library`_ for serializing a +couple related pieces of information: + +- Schemas for tables or record (row) batches. This contains the logical types, + field names, and other metadata. Schemas do not contain any information about + actual data. +- *Data headers* for record (row) batches. These must correspond to a known + schema, and enable a system to send and receive Arrow row batches in a form + that can be precisely disassembled or reconstructed. + +Arrow Format Maturity and Stability +----------------------------------- + +We have made significant progress hardening the Arrow in-memory format and +Flatbuffer metadata since the project started in February 2016. We have +integration tests which verify binary compatibility between the Java and C++ +implementations, for example. + +Major versions may still include breaking changes to the memory format or +metadata, so it is recommended to use the same released version of all +libraries in your applications for maximum compatibility. Data stored in the +Arrow IPC formats should not be used for long term storage. + +.. _flatbuffers library: http://github.com/google/flatbuffers diff --git a/python/doc/source/index.rst b/docs/source/index.rst similarity index 62% rename from python/doc/source/index.rst rename to docs/source/index.rst index 712b105a5bfbb..fa6c683d14ecb 100644 --- a/python/doc/source/index.rst +++ b/docs/source/index.rst @@ -15,8 +15,8 @@ .. specific language governing permissions and limitations .. under the License. -Python bindings for Apache Arrow -================================ +Apache Arrow +============ Apache Arrow is a cross-language development platform for in-memory data. It specifies a standardized language-independent columnar memory format for flat @@ -24,31 +24,19 @@ and hierarchical data, organized for efficient analytic operations on modern hardware. It also provides computational libraries and zero-copy streaming messaging and interprocess communication. -The Arrow Python bindings have first-class integration with NumPy, pandas, and -built-in Python objects. +.. toctree:: + :maxdepth: 1 + :caption: Memory Format -This is the documentation of the Python API of Apache Arrow. For more details -on the format and other language bindings see -`the main page for Arrow `_. Here will we only -detail the usage of the Python API for Arrow and the leaf libraries that add -additional functionality such as reading Apache Parquet files into Arrow -structures. + format/README + format/Guidelines + format/Layout + format/Metadata + format/IPC .. 
toctree:: :maxdepth: 2 - :caption: Getting Started - - install - development - memory - data - ipc - filesystems - plasma - numpy - pandas - csv - parquet - extending - api - getting_involved + :caption: Languages + + cpp/index + python/index diff --git a/python/doc/source/api.rst b/docs/source/python/api.rst similarity index 99% rename from python/doc/source/api.rst rename to docs/source/python/api.rst index 4ecd7d66cec1d..06863964978b3 100644 --- a/python/doc/source/api.rst +++ b/docs/source/python/api.rst @@ -372,6 +372,7 @@ Apache Parquet read_schema write_metadata write_table + write_to_dataset .. currentmodule:: pyarrow diff --git a/python/doc/source/csv.rst b/docs/source/python/csv.rst similarity index 100% rename from python/doc/source/csv.rst rename to docs/source/python/csv.rst diff --git a/python/doc/source/data.rst b/docs/source/python/data.rst similarity index 100% rename from python/doc/source/data.rst rename to docs/source/python/data.rst diff --git a/python/doc/source/development.rst b/docs/source/python/development.rst similarity index 95% rename from python/doc/source/development.rst rename to docs/source/python/development.rst index 3bd66893aff3d..e86a0be0d04a4 100644 --- a/python/doc/source/development.rst +++ b/docs/source/python/development.rst @@ -76,13 +76,24 @@ Using Conda Let's create a conda environment with all the C++ build and Python dependencies from conda-forge: +On Linux and OSX: + +.. code-block:: shell + + conda create -y -n pyarrow-dev -c conda-forge \ + --file arrow/ci/conda_env_unix.yml \ + --file arrow/ci/conda_env_cpp.yml \ + --file arrow/ci/conda_env_python.yml \ + python=3.6 + +On Windows: + .. code-block:: shell - conda create -y -q -n pyarrow-dev \ - python=3.6 numpy six setuptools cython pandas pytest \ - cmake flatbuffers rapidjson boost-cpp thrift-cpp snappy zlib \ - gflags brotli jemalloc lz4-c zstd -c conda-forge - conda activate pyarrow-dev + conda create -y -n pyarrow-dev -c conda-forge ^ + --file arrow\ci\conda_env_cpp.yml ^ + --file arrow\ci\conda_env_python.yml ^ + python=3.6 We need to set some environment variables to let Arrow's build system know about our build toolchain: diff --git a/python/doc/source/extending.rst b/docs/source/python/extending.rst similarity index 100% rename from python/doc/source/extending.rst rename to docs/source/python/extending.rst diff --git a/python/doc/source/filesystems.rst b/docs/source/python/filesystems.rst similarity index 100% rename from python/doc/source/filesystems.rst rename to docs/source/python/filesystems.rst diff --git a/python/doc/source/getting_involved.rst b/docs/source/python/getting_involved.rst similarity index 100% rename from python/doc/source/getting_involved.rst rename to docs/source/python/getting_involved.rst diff --git a/docs/source/python/index.rst b/docs/source/python/index.rst new file mode 100644 index 0000000000000..56282192b170b --- /dev/null +++ b/docs/source/python/index.rst @@ -0,0 +1,48 @@ +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +.. 
software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Python bindings +=============== + +The Arrow Python bindings have first-class integration with NumPy, pandas, and +built-in Python objects. They are based on the C++ implementation of Arrow. + +This is the documentation of the Python API of Apache Arrow. For more details +on the format and other language bindings see the parent documentation. +Here we will only detail the usage of the Python API for Arrow and the leaf +libraries that add additional functionality such as reading Apache Parquet +files into Arrow structures. + +.. toctree:: + :maxdepth: 2 + :caption: Getting Started + + install + development + memory + data + ipc + filesystems + plasma + numpy + pandas + csv + parquet + extending + api + getting_involved + diff --git a/python/doc/source/install.rst b/docs/source/python/install.rst similarity index 100% rename from python/doc/source/install.rst rename to docs/source/python/install.rst diff --git a/python/doc/source/ipc.rst b/docs/source/python/ipc.rst similarity index 100% rename from python/doc/source/ipc.rst rename to docs/source/python/ipc.rst diff --git a/python/doc/source/memory.rst b/docs/source/python/memory.rst similarity index 100% rename from python/doc/source/memory.rst rename to docs/source/python/memory.rst diff --git a/python/doc/source/numpy.rst b/docs/source/python/numpy.rst similarity index 97% rename from python/doc/source/numpy.rst rename to docs/source/python/numpy.rst index 303e1823851fd..870f9cb734792 100644 --- a/python/doc/source/numpy.rst +++ b/docs/source/python/numpy.rst @@ -17,8 +17,8 @@ .. _numpy_interop: -Using PyArrow with NumPy -======================== +NumPy Integration +================= PyArrow allows converting back and forth from `NumPy <http://www.numpy.org/>`_ arrays to Arrow :ref:`Arrays `. diff --git a/python/doc/source/pandas.rst b/docs/source/python/pandas.rst similarity index 99% rename from python/doc/source/pandas.rst rename to docs/source/python/pandas.rst index 6ade17185a2c9..16b4ff6926809 100644 --- a/python/doc/source/pandas.rst +++ b/docs/source/python/pandas.rst @@ -17,8 +17,8 @@ .. _pandas_interop: -Using PyArrow with pandas -========================= +Pandas Integration +================== To interface with `pandas <http://pandas.pydata.org/>`_, PyArrow provides various conversion routines to consume pandas structures and convert back diff --git a/python/doc/source/parquet.rst b/docs/source/python/parquet.rst similarity index 100% rename from python/doc/source/parquet.rst rename to docs/source/python/parquet.rst diff --git a/python/doc/source/plasma.rst b/docs/source/python/plasma.rst similarity index 100% rename from python/doc/source/plasma.rst rename to docs/source/python/plasma.rst diff --git a/format/Guidelines.md b/format/Guidelines.md deleted file mode 100644 index 7b5f3a11bfc48..0000000000000 --- a/format/Guidelines.md +++ /dev/null @@ -1,35 +0,0 @@ - -# Implementation guidelines - -An execution engine (or framework, or UDF executor, or storage engine, etc) can implement only a subset of the Arrow spec and/or extend it given the following constraints: - -## Implementing a subset of the spec -### If only producing (and not consuming) arrow vectors. -Any subset of the vector spec and the corresponding metadata can be implemented.
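To make the producing-only case concrete, here is a minimal Go sketch of such a producer, using the same `arrow`, `array`, and `memory` packages that appear in the Go changes later in this patch. The schema and values are invented for illustration; the point is only that a producer may restrict itself to a tiny type subset (here, a single Int64 column) and still emit valid Arrow records.

```go
package main

import (
	"fmt"

	"github.com/apache/arrow/go/arrow"
	"github.com/apache/arrow/go/arrow/array"
	"github.com/apache/arrow/go/arrow/memory"
)

func main() {
	mem := memory.NewGoAllocator()

	// A producer-only engine may implement any subset of the spec;
	// this one only ever emits Int64 columns.
	schema := arrow.NewSchema(
		[]arrow.Field{{Name: "id", Type: arrow.PrimitiveTypes.Int64}},
		nil,
	)

	bld := array.NewRecordBuilder(mem, schema)
	defer bld.Release()

	bld.Field(0).(*array.Int64Builder).AppendValues([]int64{1, 2, 3}, nil)

	rec := bld.NewRecord()
	defer rec.Release()

	fmt.Println(rec.Columns()[0]) // [1 2 3]
}
```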
- -### If consuming and producing vectors -There is a minimal subset of vectors to be supported. -Production of a subset of vectors and their corresponding metadata is always fine. -Consumption of vectors should at least convert the unsupported input vectors to the supported subset (for example Timestamp.millis to Timestamp.micros or Int32 to Int64). - -## Extensibility -An execution engine implementor can also extend their memory representation with their own vectors internally as long as they are never exposed. Before sending data to another system expecting Arrow data, these custom vectors should be converted to a type that exists in the Arrow spec. -An example of this is operating on compressed data. -These custom vectors are not exchanged externally and there is no support for custom metadata. diff --git a/format/IPC.md b/format/IPC.md deleted file mode 100644 index 97c1790e67ea2..0000000000000 --- a/format/IPC.md +++ /dev/null @@ -1,253 +0,0 @@ - - -# Interprocess messaging / communication (IPC) - -## Encapsulated message format - -Data components in the stream and file formats are represented as encapsulated -*messages* consisting of: - -* A length prefix indicating the metadata size -* The message metadata as a [Flatbuffer][3] -* Padding bytes to an 8-byte boundary -* The message body, which must be a multiple of 8 bytes - -Schematically, we have: - -``` -<metadata_size: int32> -<metadata_flatbuffer: bytes> -<padding> -<message body> -``` - -The complete serialized message must be a multiple of 8 bytes so that messages -can be relocated between streams. Otherwise the amount of padding between the -metadata and the message body could be non-deterministic. - -The `metadata_size` includes the size of the flatbuffer plus padding. The -`Message` flatbuffer includes a version number, the particular message (as a -flatbuffer union), and the size of the message body: - -``` -table Message { - version: org.apache.arrow.flatbuf.MetadataVersion; - header: MessageHeader; - bodyLength: long; -} -``` - -Currently, we support 4 types of messages: - -* Schema -* RecordBatch -* DictionaryBatch -* Tensor - -## Streaming format - -We provide a streaming format for record batches. It is presented as a sequence -of encapsulated messages, each of which follows the format above. The schema -comes first in the stream, and it is the same for all of the record batches -that follow. If any fields in the schema are dictionary-encoded, one or more -`DictionaryBatch` messages will be included. `DictionaryBatch` and -`RecordBatch` messages may be interleaved, but before any dictionary key is used -in a `RecordBatch` it should be defined in a `DictionaryBatch`. - -``` -<SCHEMA> -<DICTIONARY 0> -... -<DICTIONARY k - 1> -<RECORD BATCH 0> -... -<DICTIONARY x DELTA> -... -<DICTIONARY y DELTA> -... -<RECORD BATCH n - 1> -<EOS [optional]: int32> -``` - -When a stream reader implementation is reading a stream, after each message it -may read the next 4 bytes to learn how large the message metadata that follows -is. Once the message flatbuffer is read, it can then read the message body. - -The stream writer can signal end-of-stream (EOS) either by writing a 0 length -as an `int32` or simply closing the stream interface.
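As a concrete illustration of the framing and EOS rules above, and before turning to the file layout below, here is a minimal hand-rolled Go sketch of a stream reader loop. It uses only the standard library, treats the metadata as opaque bytes, and is not the Arrow libraries' actual reader; in particular, the body length of each message would come from the decoded `Message` Flatbuffer, which this sketch does not parse.

```go
package main

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
)

// readMessage reads one encapsulated message prefix and its metadata.
// The int32 length prefix already accounts for the padding to an
// 8-byte boundary; a zero length is the explicit end-of-stream marker.
func readMessage(r io.Reader) (meta []byte, eos bool, err error) {
	var metaLen int32
	if err = binary.Read(r, binary.LittleEndian, &metaLen); err != nil {
		return nil, false, err
	}
	if metaLen == 0 {
		return nil, true, nil
	}
	meta = make([]byte, metaLen)
	if _, err = io.ReadFull(r, meta); err != nil {
		return nil, false, err
	}
	// A real reader would now decode the Message flatbuffer from meta,
	// read bodyLength from it, and consume that many body bytes too.
	return meta, false, nil
}

func main() {
	// Fake stream: one message with 8 bytes of "metadata", then EOS.
	var buf bytes.Buffer
	binary.Write(&buf, binary.LittleEndian, int32(8))
	buf.Write(make([]byte, 8))
	binary.Write(&buf, binary.LittleEndian, int32(0))

	for {
		meta, eos, err := readMessage(&buf)
		if eos || err == io.EOF { // explicit EOS marker or closed stream
			fmt.Println("end of stream")
			return
		}
		if err != nil {
			panic(err)
		}
		fmt.Printf("read %d bytes of message metadata\n", len(meta))
	}
}
```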
-## File format - -We define a "file format" supporting random access in a very similar format to -the streaming format. The file starts and ends with a magic string `ARROW1` -(plus padding). What follows in the file is identical to the stream format. At -the end of the file, we write a *footer* containing a redundant copy of the -schema (which is a part of the streaming format) plus memory offsets and sizes -for each of the data blocks in the file. This enables random access to any -record batch in the file. See [format/File.fbs][1] for the precise details of the file -footer. - -Schematically we have: - -``` -<magic number "ARROW1"> -<empty padding bytes [to 8 byte boundary]> -<STREAMING FORMAT> -<FOOTER> -<FOOTER SIZE: int32> -<magic number "ARROW1"> -``` - -In the file format, there is no requirement that dictionary keys should be -defined in a `DictionaryBatch` before they are used in a `RecordBatch`, as long -as the keys are defined somewhere in the file. - -### RecordBatch body structure - -The `RecordBatch` metadata contains a depth-first (pre-order) flattened set of -field metadata and physical memory buffers (some comments from [Message.fbs][2] -have been shortened / removed): - -``` -table RecordBatch { - length: long; - nodes: [FieldNode]; - buffers: [Buffer]; -} - -struct FieldNode { - length: long; - null_count: long; -} - -struct Buffer { - /// The relative offset into the shared memory page where the bytes for this - /// buffer starts - offset: long; - - /// The absolute length (in bytes) of the memory buffer. The memory is found - /// from offset (inclusive) to offset + length (non-inclusive). - length: long; -} -``` - -In the context of a file, the `page` is not used, and the `Buffer` offsets use -as a frame of reference the start of the message body. So, while in a general -IPC setting these offsets may be anyplace in one or more shared memory regions, -in the file format the offsets start from 0. - -The location of a record batch and the size of the metadata block as well as -the body of buffers are stored in the file footer: - -``` -struct Block { - offset: long; - metaDataLength: int; - bodyLength: long; -} -``` - -The `metaDataLength` here includes the metadata length prefix, serialized -metadata, and any additional padding bytes, and by construction must be a -multiple of 8 bytes. - -Some notes about this: - -* The `Block` offset indicates the starting byte of the record batch. -* The metadata length includes the flatbuffer size, the record batch metadata - flatbuffer, and any padding bytes - -### Dictionary Batches - -Dictionaries are written in the stream and file formats as a sequence of record -batches, each having a single field. The complete semantic schema for a -sequence of record batches, therefore, consists of the schema along with all of -the dictionaries. The dictionary types are found in the schema, so it is -necessary to read the schema first to determine the dictionary types so that -the dictionaries can be properly interpreted. - -``` -table DictionaryBatch { - id: long; - data: RecordBatch; - isDelta: boolean = false; -} -``` - -The dictionary `id` in the message metadata can be referenced one or more times -in the schema, so that dictionaries can even be used for multiple fields. See -the [Physical Layout][4] document for more about the semantics of -dictionary-encoded data. - -The dictionary `isDelta` flag allows dictionary batches to be modified -mid-stream. A dictionary batch with `isDelta` set indicates that its vector -should be concatenated with those of any previous batches with the same `id`. A -stream which encodes one column, the list of strings -`["A", "B", "C", "B", "D", "C", "E", "A"]`, with a delta dictionary batch could -take the form: - -``` -<SCHEMA> -<DICTIONARY 0> -(0) "A" -(1) "B" -(2) "C" - -<RECORD BATCH 0> -0 -1 -2 -1 - -<DICTIONARY 0 DELTA> -(3) "D" -(4) "E" - -<RECORD BATCH 1> -3 -2 -4 -0 -EOS -```
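To spell out the bookkeeping implied by the example above, here is a small hand-rolled Go sketch (not the Arrow libraries' actual decoding path) of a consumer applying a delta dictionary batch: a batch with `isDelta` set appends its values to the dictionary accumulated for that `id`, and record-batch indices are then resolved against the combined dictionary.

```go
package main

import "fmt"

// dicts holds the accumulated dictionary values per dictionary id.
var dicts = map[int64][]string{}

// apply processes one DictionaryBatch: isDelta means "concatenate with
// any previous batches having the same id"; otherwise it (re)defines it.
func apply(id int64, values []string, isDelta bool) {
	if isDelta {
		dicts[id] = append(dicts[id], values...)
	} else {
		dicts[id] = values
	}
}

// decode resolves record-batch indices against dictionary id.
func decode(id int64, indices []int) []string {
	out := make([]string, len(indices))
	for i, ix := range indices {
		out[i] = dicts[id][ix]
	}
	return out
}

func main() {
	apply(0, []string{"A", "B", "C"}, false)  // <DICTIONARY 0>
	fmt.Println(decode(0, []int{0, 1, 2, 1})) // [A B C B]
	apply(0, []string{"D", "E"}, true)        // <DICTIONARY 0 DELTA>
	fmt.Println(decode(0, []int{3, 2, 4, 0})) // [D C E A]
}
```

Together the two decoded batches yield `["A", "B", "C", "B", "D", "C", "E", "A"]`, matching the stream sketched above.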
-### Tensor (Multi-dimensional Array) Message Format - -The `Tensor` message type provides a way to write a multidimensional array of -fixed-size values (such as a NumPy ndarray) using Arrow's shared memory -tools. Arrow implementations in general are not required to implement this data -format, though we provide a reference implementation in C++. - -When writing a standalone encapsulated tensor message, we use the format as -indicated above, but additionally align the starting offset of the metadata as -well as the starting offset of the tensor body (if writing to a shared memory -region) to be multiples of 64 bytes: - -``` -<PADDING> -<metadata size: int32> -<metadata> -<tensor body> -``` - -[1]: https://github.com/apache/arrow/blob/master/format/File.fbs -[2]: https://github.com/apache/arrow/blob/master/format/Message.fbs -[3]: https://github.com/google/flatbuffers -[4]: https://github.com/apache/arrow/blob/master/format/Layout.md diff --git a/format/Metadata.md b/format/Metadata.md deleted file mode 100644 index 33d5065f89e23..0000000000000 --- a/format/Metadata.md +++ /dev/null @@ -1,409 +0,0 @@ - - -# Metadata: Logical types, schemas, data headers - -This is documentation for the Arrow metadata specification, which enables -systems to communicate the - -* Logical array types (which are implemented using the physical memory layouts - specified in [Layout.md][1]) - -* Schemas for table-like collections of Arrow data structures - -* "Data headers" indicating the physical locations of memory buffers sufficient - to reconstruct Arrow data structures without copying memory. - -## Canonical implementation - -We are using [Flatbuffers][2] for low-overhead reading and writing of the Arrow -metadata. See [Message.fbs][3]. - -## Schemas - -The `Schema` type describes a table-like structure consisting of any number of -Arrow arrays, each of which can be interpreted as a column in the table. A -schema by itself does not describe the physical structure of any particular set -of data. - -A schema consists of a sequence of **fields**, which are metadata describing -the columns. The Flatbuffers IDL for a field is: - -``` -table Field { - // Name is not required, e.g. in a List - name: string; - nullable: bool; - type: Type; - - // Present only if the field is dictionary encoded - dictionary: DictionaryEncoding; - - // children apply only to Nested data types like Struct, List and Union - children: [Field]; - - // User-defined metadata - custom_metadata: [ KeyValue ]; -} -``` - -The `type` is the logical type of the field. Nested types, such as List, -Struct, and Union, have a sequence of child fields. - -A JSON representation of the schema is also provided: -Field: -``` -{ - "name" : "name_of_the_field", - "nullable" : false, - "type" : /* Type */, - "children" : [ /* Field */ ], -} -``` - -Type: -``` -{ - "name" : "null|struct|list|union|int|floatingpoint|utf8|binary|fixedsizebinary|bool|decimal|date|time|timestamp|interval" - // fields as defined in the Flatbuffer depending on the type name -} -``` - -Union: -``` -{ - "name" : "union", - "mode" : "Sparse|Dense", - "typeIds" : [ /* integer */ ] -} -``` - -The `typeIds` field in the Union are the codes used to denote each type, which -may be different from the index of the child array. This is so that the union -type ids do not have to be enumerated from 0.
- -Int: -``` -{ - "name" : "int", - "bitWidth" : /* integer */, - "isSigned" : /* boolean */ -} -``` -FloatingPoint: -``` -{ - "name" : "floatingpoint", - "precision" : "HALF|SINGLE|DOUBLE" -} -``` -Decimal: -``` -{ - "name" : "decimal", - "precision" : /* integer */, - "scale" : /* integer */ -} -``` - -Timestamp: - -``` -{ - "name" : "timestamp", - "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND" -} -``` - -Date: - -``` -{ - "name" : "date", - "unit" : "DAY|MILLISECOND" -} -``` - -Time: - -``` -{ - "name" : "time", - "unit" : "SECOND|MILLISECOND|MICROSECOND|NANOSECOND", - "bitWidth": /* integer: 32 or 64 */ -} -``` - -Interval: - -``` -{ - "name" : "interval", - "unit" : "YEAR_MONTH|DAY_TIME" -} -``` -Schema: -``` -{ - "fields" : [ - /* Field */ - ] -} -``` - -## Record data headers - -A record batch is a collection of top-level named, equal length Arrow arrays -(or vectors). If one of the arrays contains nested data, its child arrays are -not required to be the same length as the top-level arrays. - -A record batch can be thought of as a realization of a particular schema. The -metadata describing a particular record batch is called a "data header". Here is -the Flatbuffers IDL for a record batch data header: - -``` -table RecordBatch { - length: long; - nodes: [FieldNode]; - buffers: [Buffer]; -} -``` - -The `RecordBatch` metadata provides for record batches with length exceeding -2^31 - 1, but Arrow implementations are not required to implement support -beyond this size. - -The `nodes` and `buffers` fields are produced by a depth-first traversal / -flattening of a schema (possibly containing nested types) for a given in-memory -data set. - -### Buffers - -A buffer is metadata describing a contiguous memory region relative to some -virtual address space. This may include: - -* Shared memory, e.g. a memory-mapped file -* An RPC message received in-memory -* Data in a file - -The key form of the Buffer type is: - -``` -struct Buffer { - offset: long; - length: long; -} -``` - -In the context of a record batch, each field has some number of buffers -associated with it, which are derived from their physical memory layout. - -Each logical type (separate from its children, if it is a nested type) has a -deterministic number of buffers associated with it. These will be specified in -the logical types section. - -### Field metadata - -The `FieldNode` values contain metadata about each level in a nested type -hierarchy. - -``` -struct FieldNode { - /// The number of value slots in the Arrow array at this level of a nested - /// tree - length: long; - - /// The number of observed nulls. - null_count: long; -} -``` - -The `FieldNode` metadata provides for fields with length exceeding 2^31 - 1, -but Arrow implementations are not required to implement support for large -arrays. - -## Flattening of nested data - -Nested types are flattened in the record batch in depth-first order. When -visiting each field in the nested type tree, the metadata is appended to the -top-level `fields` array and the buffers associated with that field (but not -its children) are appended to the `buffers` array.
- -For example, let's consider the schema - -``` -col1: Struct<a: Int32, b: List<Int64>, c: Float64> -col2: Utf8 -``` - -The flattened version of this is: - -``` -FieldNode 0: Struct name='col1' -FieldNode 1: Int32 name='a' -FieldNode 2: List name='b' -FieldNode 3: Int64 name='item' # arbitrary -FieldNode 4: Float64 name='c' -FieldNode 5: Utf8 name='col2' -``` - -For the buffers produced, we would have the following (as described in more -detail for each type below): - -``` -buffer 0: field 0 validity bitmap - -buffer 1: field 1 validity bitmap -buffer 2: field 1 values - -buffer 3: field 2 validity bitmap -buffer 4: field 2 list offsets - -buffer 5: field 3 validity bitmap -buffer 6: field 3 values - -buffer 7: field 4 validity bitmap -buffer 8: field 4 values - -buffer 9: field 5 validity bitmap -buffer 10: field 5 offsets -buffer 11: field 5 data -``` - -## Logical types - -A logical type consists of a type name and metadata along with an explicit -mapping to a physical memory representation. These may fall into some different -categories: - -* Types represented as fixed-width primitive arrays (for example: C-style - integers and floating point numbers) -* Types having equivalent memory layout to a physical nested type (e.g. strings - use the list representation, but logically are not nested types) - -### Integers - -In the first version of Arrow we provide the standard 8-bit through 64-bit -C integer types, both signed and unsigned: - -* Signed types: Int8, Int16, Int32, Int64 -* Unsigned types: UInt8, UInt16, UInt32, UInt64 - -The IDL looks like: - -``` -table Int { - bitWidth: int; - is_signed: bool; -} -``` - -The integer endianness is currently set globally at the schema level. If a -schema is set to be little-endian, then all integer types occurring within must -be little-endian. Integers that are part of other data representations, such as -list offsets and union types, must have the same endianness as the entire -record batch. - -### Floating point numbers - -We provide 3 types of floating point numbers as fixed bit-width primitive -arrays: - -- Half precision, 16-bit width -- Single precision, 32-bit width -- Double precision, 64-bit width - -The IDL looks like: - -``` -enum Precision:int {HALF, SINGLE, DOUBLE} - -table FloatingPoint { - precision: Precision; -} -``` - -### Boolean - -The Boolean logical type is represented as a 1-bit wide primitive physical -type. The bits are numbered using least-significant bit (LSB) ordering. - -Like other fixed bit-width primitive types, boolean data appears as 2 buffers -in the data header (one bitmap for the validity vector and one for the values). - -### List - -The `List` logical type is the logical (and identically-named) counterpart to -the List physical type. - -In data header form, the list field node contains 2 buffers: - -* Validity bitmap -* List offsets - -The buffers associated with a list's child field are handled recursively -according to the child logical type (e.g. `List<Utf8>` vs. `List<Boolean>`). - -### Utf8 and Binary - -We specify two logical types for variable length bytes: - -* `Utf8` data is Unicode values with UTF-8 encoding -* `Binary` is any other variable length bytes - -These types both have the same memory layout as the nested type `List<UInt8>`, -with the constraint that the inner bytes can contain no null values. From a -logical type perspective they are primitive, not nested types.
- -In data header form, while `List<UInt8>` would appear as 2 field nodes (`List` -and `UInt8`) and 4 buffers (2 for each of the nodes, as per above), these types -have a simplified representation: a single field node (of `Utf8` or `Binary` -logical type, which have no children) and 3 buffers: - -* Validity bitmap -* List offsets -* Byte data - -### Decimal - -Decimals are represented as a 2's complement 128-bit (16 byte) signed integer -in little-endian byte order. - -### Timestamp - -All timestamps are stored as a 64-bit integer, with one of four unit -resolutions: second, millisecond, microsecond, and nanosecond. - -### Date - -We support two different date types: - -* Days since the UNIX epoch as a 32-bit integer -* Milliseconds since the UNIX epoch as a 64-bit integer - -### Time - -Time supports the same unit resolutions: second, millisecond, microsecond, and -nanosecond. We represent time as the smallest integer accommodating the -indicated unit. For second and millisecond: 32-bit, for the others 64-bit. - -## Dictionary encoding - -[1]: https://github.com/apache/arrow/blob/master/format/Layout.md -[2]: http://github.com/google/flatbuffers -[3]: https://github.com/apache/arrow/blob/master/format/Message.fbs diff --git a/format/README.md b/format/README.md index c87ac2a00d6ea..6da844549e640 100644 --- a/format/README.md +++ b/format/README.md @@ -1,53 +1,25 @@ - - -## Arrow specification documents - -Currently, the Arrow specification consists of these pieces: - -- Metadata specification (see Metadata.md) -- Physical memory layout specification (see Layout.md) -- Logical Types, Schemas, and Record Batch Metadata (see Schema.fbs) -- Encapsulated Messages (see Message.fbs) -- Mechanics of messaging between Arrow systems (IPC, RPC, etc.) (see IPC.md) -- Tensor (Multi-dimensional array) Metadata (see Tensor.fbs) - -The metadata currently uses Google's [flatbuffers library][1] for serializing a -couple related pieces of information: - -- Schemas for tables or record (row) batches. This contains the logical types, - field names, and other metadata. Schemas do not contain any information about - actual data. -- *Data headers* for record (row) batches. These must correspond to a known - schema, and enable a system to send and receive Arrow row batches in a form - that can be precisely disassembled or reconstructed. - -## Arrow Format Maturity and Stability - -We have made significant progress hardening the Arrow in-memory format and -Flatbuffer metadata since the project started in February 2016. We have -integration tests which verify binary compatibility between the Java and C++ -implementations, for example. - -Major versions may still include breaking changes to the memory format or -metadata, so it is recommended to use the same released version of all -libraries in your applications for maximum compatibility. Data stored in the -Arrow IPC formats should not be used for long term storage. - -[1]: http://github.com/google/flatbuffers +.. Licensed to the Apache Software Foundation (ASF) under one +.. or more contributor license agreements. See the NOTICE file +.. distributed with this work for additional information +.. regarding copyright ownership. The ASF licenses this file +.. to you under the Apache License, Version 2.0 (the +.. "License"); you may not use this file except in compliance +.. with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, +..
software distributed under the License is distributed on an +.. "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +.. KIND, either express or implied. See the License for the +.. specific language governing permissions and limitations +.. under the License. + +Arrow Protocol Files +==================== + +This folder contains binary protocol definitions for the Arrow columnar format +and other parts of the project, like the Flight RPC framework. + +For documentation about the Arrow format, see the `docs/source/format` +directory. \ No newline at end of file diff --git a/go/arrow/csv/csv.go b/go/arrow/csv/csv.go index 36f3abd6230de..022c46d8ece74 100644 --- a/go/arrow/csv/csv.go +++ b/go/arrow/csv/csv.go @@ -17,8 +17,6 @@ // Package csv reads CSV files and presents the extracted data as records. package csv -// TODO: implement a row chunker to accumulate N rows into the current record. - import ( "encoding/csv" "errors" @@ -61,6 +59,19 @@ func WithAllocator(mem memory.Allocator) Option { } } +// WithChunk specifies the chunk size used while parsing CSV files. +// +// If n is zero or 1, no chunking will take place and the reader will create +// one record per row. +// If n is greater than 1, chunks of n rows will be read. +// If n is negative, the reader will load the whole CSV file into memory and +// create one big record with all the rows. +func WithChunk(n int) Option { + return func(r *Reader) { + r.chunk = n + } +} + // Reader wraps encoding/csv.Reader and creates array.Records from a schema. type Reader struct { r *csv.Reader @@ -71,6 +82,10 @@ type Reader struct { cur array.Record err error + chunk int + done bool + next func() bool + mem memory.Allocator } @@ -82,7 +97,8 @@ type Reader struct { func NewReader(r io.Reader, schema *arrow.Schema, opts ...Option) *Reader { validate(schema) - rr := &Reader{r: csv.NewReader(r), schema: schema, refs: 1} + rr := &Reader{r: csv.NewReader(r), schema: schema, refs: 1, chunk: 1} + rr.r.ReuseRecord = true for _, opt := range opts { opt(rr) } @@ -93,6 +109,14 @@ func NewReader(r io.Reader, schema *arrow.Schema, opts ...Option) *Reader { rr.bld = array.NewRecordBuilder(rr.mem, rr.schema) + switch { + case rr.chunk < 0: + rr.next = rr.nextall + case rr.chunk > 1: + rr.next = rr.nextn + default: + rr.next = rr.next1 + } return rr } @@ -117,13 +141,20 @@ func (r *Reader) Next() bool { r.cur = nil } - if r.err != nil { + if r.err != nil || r.done { return false } + return r.next() +} + +// next1 reads one row from the CSV file and creates a single Record +// from that row. +func (r *Reader) next1() bool { var recs []string recs, r.err = r.r.Read() if r.err != nil { + r.done = true if r.err == io.EOF { r.err = nil } @@ -132,8 +163,65 @@ func (r *Reader) Next() bool { r.validate(recs) r.read(recs) + r.cur = r.bld.NewRecord() - return r.err == nil + return true +} + +// nextall reads the whole CSV file into memory and creates one single +// Record from all the CSV rows. +func (r *Reader) nextall() bool { + defer func() { + r.done = true + }() + + var ( + recs [][]string + ) + + recs, r.err = r.r.ReadAll() + if r.err != nil { + return false + } + + for _, rec := range recs { + r.validate(rec) + r.read(rec) + } + r.cur = r.bld.NewRecord() + + return true +} + +// nextn reads n rows from the CSV file, where n is the chunk size, and creates +// a Record from these rows. 
+func (r *Reader) nextn() bool { + var ( + recs []string + n = 0 + ) + + for i := 0; i < r.chunk && !r.done; i++ { + recs, r.err = r.r.Read() + if r.err != nil { + r.done = true + break + } + + r.validate(recs) + r.read(recs) + n++ + } + + if r.err != nil { + r.done = true + if r.err == io.EOF { + r.err = nil + } + } + + r.cur = r.bld.NewRecord() + return n > 0 } func (r *Reader) validate(recs []string) { @@ -193,7 +281,6 @@ func (r *Reader) read(recs []string) { r.bld.Field(i).(*array.StringBuilder).Append(str) } } - r.cur = r.bld.NewRecord() } func (r *Reader) readI8(str string) int8 { diff --git a/go/arrow/csv/csv_test.go b/go/arrow/csv/csv_test.go index 534e8eabd3a97..aaafb37554b87 100644 --- a/go/arrow/csv/csv_test.go +++ b/go/arrow/csv/csv_test.go @@ -89,6 +89,52 @@ func Example() { // rec[2]["str"]: ["str-9"] } +func Example_withChunk() { + f, err := os.Open("testdata/simple.csv") + if err != nil { + log.Fatal(err) + } + defer f.Close() + + schema := arrow.NewSchema( + []arrow.Field{ + arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + }, + nil, + ) + r := csv.NewReader( + f, schema, + csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(3), + ) + defer r.Release() + + n := 0 + for r.Next() { + rec := r.Record() + for i, col := range rec.Columns() { + fmt.Printf("rec[%d][%q]: %v\n", i, rec.ColumnName(i), col) + } + n++ + } + + // Output: + // rec[0]["i64"]: [0 1 2] + // rec[1]["f64"]: [0 1 2] + // rec[2]["str"]: ["str-0" "str-1" "str-2"] + // rec[0]["i64"]: [3 4 5] + // rec[1]["f64"]: [3 4 5] + // rec[2]["str"]: ["str-3" "str-4" "str-5"] + // rec[0]["i64"]: [6 7 8] + // rec[1]["f64"]: [6 7 8] + // rec[2]["str"]: ["str-6" "str-7" "str-8"] + // rec[0]["i64"]: [9] + // rec[1]["f64"]: [9] + // rec[2]["str"]: ["str-9"] +} + func TestCSVReader(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) @@ -190,3 +236,318 @@ rec[11]["str"]: ["str-2"] r.Release() } } + +func TestCSVReaderWithChunk(t *testing.T) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(t, 0) + + raw, err := ioutil.ReadFile("testdata/simple.csv") + if err != nil { + t.Fatal(err) + } + + schema := arrow.NewSchema( + []arrow.Field{ + arrow.Field{Name: "i64", Type: arrow.PrimitiveTypes.Int64}, + arrow.Field{Name: "f64", Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: "str", Type: arrow.BinaryTypes.String}, + }, + nil, + ) + + for _, tc := range []struct { + name string + opts []csv.Option + records int + want string + }{ + { + name: "chunk=default", + opts: []csv.Option{csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';')}, + records: 10, + want: `rec[0]["i64"]: [0] +rec[1]["f64"]: [0] +rec[2]["str"]: ["str-0"] +rec[0]["i64"]: [1] +rec[1]["f64"]: [1] +rec[2]["str"]: ["str-1"] +rec[0]["i64"]: [2] +rec[1]["f64"]: [2] +rec[2]["str"]: ["str-2"] +rec[0]["i64"]: [3] +rec[1]["f64"]: [3] +rec[2]["str"]: ["str-3"] +rec[0]["i64"]: [4] +rec[1]["f64"]: [4] +rec[2]["str"]: ["str-4"] +rec[0]["i64"]: [5] +rec[1]["f64"]: [5] +rec[2]["str"]: ["str-5"] +rec[0]["i64"]: [6] +rec[1]["f64"]: [6] +rec[2]["str"]: ["str-6"] +rec[0]["i64"]: [7] +rec[1]["f64"]: [7] +rec[2]["str"]: ["str-7"] +rec[0]["i64"]: [8] +rec[1]["f64"]: [8] +rec[2]["str"]: ["str-8"] +rec[0]["i64"]: [9] +rec[1]["f64"]: [9] +rec[2]["str"]: ["str-9"] +`, + }, + { + name: "chunk=0", + opts: []csv.Option{ + csv.WithAllocator(mem), 
csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(0), + }, + records: 10, + want: `rec[0]["i64"]: [0] +rec[1]["f64"]: [0] +rec[2]["str"]: ["str-0"] +rec[0]["i64"]: [1] +rec[1]["f64"]: [1] +rec[2]["str"]: ["str-1"] +rec[0]["i64"]: [2] +rec[1]["f64"]: [2] +rec[2]["str"]: ["str-2"] +rec[0]["i64"]: [3] +rec[1]["f64"]: [3] +rec[2]["str"]: ["str-3"] +rec[0]["i64"]: [4] +rec[1]["f64"]: [4] +rec[2]["str"]: ["str-4"] +rec[0]["i64"]: [5] +rec[1]["f64"]: [5] +rec[2]["str"]: ["str-5"] +rec[0]["i64"]: [6] +rec[1]["f64"]: [6] +rec[2]["str"]: ["str-6"] +rec[0]["i64"]: [7] +rec[1]["f64"]: [7] +rec[2]["str"]: ["str-7"] +rec[0]["i64"]: [8] +rec[1]["f64"]: [8] +rec[2]["str"]: ["str-8"] +rec[0]["i64"]: [9] +rec[1]["f64"]: [9] +rec[2]["str"]: ["str-9"] +`, + }, + { + name: "chunk=1", + opts: []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(1), + }, + records: 10, + want: `rec[0]["i64"]: [0] +rec[1]["f64"]: [0] +rec[2]["str"]: ["str-0"] +rec[0]["i64"]: [1] +rec[1]["f64"]: [1] +rec[2]["str"]: ["str-1"] +rec[0]["i64"]: [2] +rec[1]["f64"]: [2] +rec[2]["str"]: ["str-2"] +rec[0]["i64"]: [3] +rec[1]["f64"]: [3] +rec[2]["str"]: ["str-3"] +rec[0]["i64"]: [4] +rec[1]["f64"]: [4] +rec[2]["str"]: ["str-4"] +rec[0]["i64"]: [5] +rec[1]["f64"]: [5] +rec[2]["str"]: ["str-5"] +rec[0]["i64"]: [6] +rec[1]["f64"]: [6] +rec[2]["str"]: ["str-6"] +rec[0]["i64"]: [7] +rec[1]["f64"]: [7] +rec[2]["str"]: ["str-7"] +rec[0]["i64"]: [8] +rec[1]["f64"]: [8] +rec[2]["str"]: ["str-8"] +rec[0]["i64"]: [9] +rec[1]["f64"]: [9] +rec[2]["str"]: ["str-9"] +`, + }, + { + name: "chunk=3", + opts: []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(3), + }, + records: 4, + want: `rec[0]["i64"]: [0 1 2] +rec[1]["f64"]: [0 1 2] +rec[2]["str"]: ["str-0" "str-1" "str-2"] +rec[0]["i64"]: [3 4 5] +rec[1]["f64"]: [3 4 5] +rec[2]["str"]: ["str-3" "str-4" "str-5"] +rec[0]["i64"]: [6 7 8] +rec[1]["f64"]: [6 7 8] +rec[2]["str"]: ["str-6" "str-7" "str-8"] +rec[0]["i64"]: [9] +rec[1]["f64"]: [9] +rec[2]["str"]: ["str-9"] +`, + }, + { + name: "chunk=6", + opts: []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(6), + }, + records: 2, + want: `rec[0]["i64"]: [0 1 2 3 4 5] +rec[1]["f64"]: [0 1 2 3 4 5] +rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5"] +rec[0]["i64"]: [6 7 8 9] +rec[1]["f64"]: [6 7 8 9] +rec[2]["str"]: ["str-6" "str-7" "str-8" "str-9"] +`, + }, + { + name: "chunk=10", + opts: []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(10), + }, + records: 1, + want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9] +rec[1]["f64"]: [0 1 2 3 4 5 6 7 8 9] +rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] +`, + }, + { + name: "chunk=11", + opts: []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(11), + }, + records: 1, + want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9] +rec[1]["f64"]: [0 1 2 3 4 5 6 7 8 9] +rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] +`, + }, + { + name: "chunk=-1", + opts: []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(-1), + }, + records: 1, + want: `rec[0]["i64"]: [0 1 2 3 4 5 6 7 8 9] +rec[1]["f64"]: [0 1 2 3 4 5 6 7 8 9] +rec[2]["str"]: ["str-0" "str-1" "str-2" "str-3" "str-4" "str-5" "str-6" "str-7" "str-8" "str-9"] +`, + }, + } { + t.Run(tc.name, 
func(t *testing.T) { + r := csv.NewReader(bytes.NewReader(raw), schema, tc.opts...) + + defer r.Release() + + r.Retain() + r.Release() + + if got, want := r.Schema(), schema; !got.Equal(want) { + t.Fatalf("invalid schema: got=%v, want=%v", got, want) + } + + out := new(bytes.Buffer) + + n := 0 + for r.Next() { + rec := r.Record() + for i, col := range rec.Columns() { + fmt.Fprintf(out, "rec[%d][%q]: %v\n", i, rec.ColumnName(i), col) + } + n++ + } + + if got, want := n, tc.records; got != want { + t.Fatalf("invalid number of records: got=%d, want=%d", got, want) + } + + if got, want := out.String(), tc.want; got != want { + t.Fatalf("invalid output:\ngot:\n%s\nwant:\n%s\n", got, want) + } + + if r.Err() != nil { + t.Fatalf("unexpected error: %v", r.Err()) + } + }) + } +} + +func BenchmarkRead(b *testing.B) { + gen := func(rows, cols int) []byte { + buf := new(bytes.Buffer) + for i := 0; i < rows; i++ { + for j := 0; j < cols; j++ { + if j > 0 { + fmt.Fprintf(buf, ";") + } + fmt.Fprintf(buf, "%d;%f;str-%d", i, float64(i), i) + } + fmt.Fprintf(buf, "\n") + } + return buf.Bytes() + } + + for _, rows := range []int{10, 1e2, 1e3, 1e4, 1e5} { + for _, cols := range []int{1, 10, 100, 1000} { + raw := gen(rows, cols) + for _, chunks := range []int{-1, 0, 10, 100, 1000} { + b.Run(fmt.Sprintf("rows=%d cols=%d chunks=%d", rows, cols, chunks), func(b *testing.B) { + benchRead(b, raw, rows, cols, chunks) + }) + } + } + } +} + +func benchRead(b *testing.B, raw []byte, rows, cols, chunks int) { + mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) + defer mem.AssertSize(b, 0) + + var fields []arrow.Field + for i := 0; i < cols; i++ { + fields = append(fields, []arrow.Field{ + arrow.Field{Name: fmt.Sprintf("i64-%d", i), Type: arrow.PrimitiveTypes.Int64}, + arrow.Field{Name: fmt.Sprintf("f64-%d", i), Type: arrow.PrimitiveTypes.Float64}, + arrow.Field{Name: fmt.Sprintf("str-%d", i), Type: arrow.BinaryTypes.String}, + }...) + } + + schema := arrow.NewSchema(fields, nil) + chunk := 0 + if chunks != 0 { + chunk = rows / chunks + } + opts := []csv.Option{ + csv.WithAllocator(mem), csv.WithComment('#'), csv.WithComma(';'), + csv.WithChunk(chunk), + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + r := csv.NewReader(bytes.NewReader(raw), schema, opts...) + + n := int64(0) + for r.Next() { + n += r.Record().NumRows() + } + + r.Release() + if n != int64(rows) { + b.Fatalf("invalid number of rows: want=%d, got=%d", rows, n) + } + } +} diff --git a/java/README.md b/java/README.md index d4efe428c4264..5a5f4d2c4211a 100644 --- a/java/README.md +++ b/java/README.md @@ -45,6 +45,22 @@ mvn install -P gandiva -pl gandiva -am -Dgandiva.cpp.build.dir=../../debug This library is still in Alpha stages, and subject to API changes without deprecation warnings. +## Java Code Style Guide + +Arrow Java follows the Google style guide [here][3] with the following +differences: + +* Imports are grouped, from top to bottom, in this order: static imports, +standard Java, org.\*, com.\* +* Line length can be up to 120 characters +* Operators for line wrapping are at end-of-line +* Naming rules for methods, parameters, etc. have been relaxed +* Disabled `NoFinalizer`, `OverloadMethodsDeclarationOrder`, and +`VariableDeclarationUsageDistance` due to the existing code base. These rules +should be followed when possible. + +Refer to `java/dev/checkstyle/checkstyle.xml` for rule specifics. + +## Test Logging Configuration + +When running tests, Arrow Java uses the Logback logger with SLF4J.
By default, @@ -65,3 +81,4 @@ See [Logback Configuration][1] for more details. [1]: https://logback.qos.ch/manual/configuration.html [2]: https://github.com/apache/arrow/blob/master/cpp/README.md +[3]: http://google.github.io/styleguide/javaguide.html diff --git a/js/.npmrc b/js/.npmrc index b6b25d1f1816d..5536efc09ce5c 100644 --- a/js/.npmrc +++ b/js/.npmrc @@ -1,2 +1,2 @@ save-prefix= -package-lock=false \ No newline at end of file +engine-strict=true diff --git a/js/README.md b/js/README.md index e048ba1c1cdf1..15d7ed03f65a4 100644 --- a/js/README.md +++ b/js/README.md @@ -94,26 +94,14 @@ console.log(table.toString()); ### Create a Table from JavaScript arrays ```es6 -const fields = [{ - name: 'precipitation', - type: { name: 'floatingpoint', precision: 'SINGLE'}, - nullable: false, children: [] - }, { - name: 'date', - type: { name: 'date', unit: 'MILLISECOND' }, - nullable: false, children: [] - }]; -const rainAmounts = Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1))); -const rainDates = Array.from({length: LENGTH}, (_, i) => Date.now() - 1000 * 60 * 60 * 24 * i); - const LENGTH = 2000; -const rainfall = arrow.Table.from({ - schema: { fields: fields }, - batches: [{ - count: LENGTH, - columns: [ - {name: "precipitation", count: LENGTH, VALIDITY: [], DATA: rainAmounts }, - {name: "date", count: LENGTH, VALIDITY: [], DATA: rainDates } ] }] }) +const rainAmounts = Float32Array.from({length: LENGTH}, () => Number((Math.random() * 20).toFixed(1))); +const rainDates = Array.from({length: LENGTH}, (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i)); + +const rainfall = arrow.Table.fromVectors( + [FloatVector.from(rainAmounts), DateVector.from(rainDates)], + ['precipitation', 'date'] +); ``` ### Load data with `fetch` diff --git a/js/package-lock.json b/js/package-lock.json index 1ab8bacc2269b..ef38db9a7468d 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -64,14 +64,14 @@ } }, "@lerna/add": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/add/-/add-3.4.1.tgz", - "integrity": "sha512-Vf54B42jlD6G52qnv/cAGH70cVQIa+LX//lfsbkxHvzkhIqBl5J4KsnTOPkA9uq3R+zP58ayicCHB9ReiEWGJg==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/add/-/add-3.5.0.tgz", + "integrity": "sha512-hoOqtal/ChEEtt9rxR/6xmyvTN7581XF4kWHoWPV9NbfZN9e8uTR8z4mCcJq2DiZhRuY7aA5FEROEbl12soowQ==", "dev": true, "requires": { - "@lerna/bootstrap": "^3.4.1", - "@lerna/command": "^3.3.0", - "@lerna/filter-options": "^3.3.2", + "@lerna/bootstrap": "^3.5.0", + "@lerna/command": "^3.5.0", + "@lerna/filter-options": "^3.5.0", "@lerna/npm-conf": "^3.4.1", "@lerna/validation-error": "^3.0.0", "dedent": "^0.7.0", @@ -93,14 +93,14 @@ } }, "@lerna/bootstrap": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/bootstrap/-/bootstrap-3.4.1.tgz", - "integrity": "sha512-yZDJgNm/KDoRH2klzmQGmpWMg/XMzWgeWvauXkrfW/mj1wwmufOuh5pN4fBFxVmUUa/RFZdfMeaaJt3+W3PPBw==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/bootstrap/-/bootstrap-3.5.0.tgz", + "integrity": "sha512-+z4kVVJFO5EGfC2ob/4C9LetqWwDtbhZgTRllr1+zOi/2clbD+WKcVI0ku+/ckzKjz783SOc83swX7RrmiLwMQ==", "dev": true, "requires": { "@lerna/batch-packages": "^3.1.2", - "@lerna/command": "^3.3.0", - "@lerna/filter-options": "^3.3.2", + "@lerna/command": "^3.5.0", + "@lerna/filter-options": "^3.5.0", "@lerna/has-npm-version": "^3.3.0", "@lerna/npm-conf": "^3.4.1", "@lerna/npm-install": "^3.3.0", @@ -124,25 +124,25 @@ } }, "@lerna/changed": { - "version": "3.4.1", - 
"resolved": "https://registry.npmjs.org/@lerna/changed/-/changed-3.4.1.tgz", - "integrity": "sha512-gT7fhl4zQWyGETDO4Yy5wsFnqNlBSsezncS1nkMW1uO6jwnolwYqcr1KbrMR8HdmsZBn/00Y0mRnbtbpPPey8w==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/changed/-/changed-3.5.0.tgz", + "integrity": "sha512-p9o7/hXwFAoet7UPeHIzIPonYxLHZe9bcNcjxKztZYAne5/OgmZiF4X1UPL2S12wtkT77WQy4Oz8NjRTczcapg==", "dev": true, "requires": { - "@lerna/collect-updates": "^3.3.2", - "@lerna/command": "^3.3.0", + "@lerna/collect-updates": "^3.5.0", + "@lerna/command": "^3.5.0", "@lerna/listable": "^3.0.0", "@lerna/output": "^3.0.0", - "@lerna/version": "^3.4.1" + "@lerna/version": "^3.5.0" } }, "@lerna/check-working-tree": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/check-working-tree/-/check-working-tree-3.3.0.tgz", - "integrity": "sha512-oeEP1dNhiiKUaO0pmcIi73YXJpaD0n5JczNctvVNZ8fGZmrALZtEnmC28o6Z7JgQaqq5nd2kO7xbnjoitrC51g==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/check-working-tree/-/check-working-tree-3.5.0.tgz", + "integrity": "sha512-aWeIputHddeZgf7/wA1e5yuv6q9S5si2y7fzO2Ah7m3KyDyl8XHP1M0VSSDzZeiloYCryAYQAoRgcrdH65Vhow==", "dev": true, "requires": { - "@lerna/describe-ref": "^3.3.0", + "@lerna/describe-ref": "^3.5.0", "@lerna/validation-error": "^3.0.0" } }, @@ -197,13 +197,13 @@ } }, "@lerna/clean": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/@lerna/clean/-/clean-3.3.2.tgz", - "integrity": "sha512-mvqusgSp2ou5SGqQgTEoTvGJpGfH4+L6XSeN+Ims+eNFGXuMazmKCf+rz2PZBMFufaHJ/Os+JF0vPCcWI1Fzqg==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/clean/-/clean-3.5.0.tgz", + "integrity": "sha512-bHUFF6Wv7ms81Tmwe56xk296oqU74Sg9NSkUCDG4kZLpYZx347Aw+89ZPTlaSmUwqCgEXKYLr65ZVVvKmflpcA==", "dev": true, "requires": { - "@lerna/command": "^3.3.0", - "@lerna/filter-options": "^3.3.2", + "@lerna/command": "^3.5.0", + "@lerna/filter-options": "^3.5.0", "@lerna/prompt": "^3.3.1", "@lerna/rimraf-dir": "^3.3.0", "p-map": "^1.2.0", @@ -230,9 +230,9 @@ "dev": true }, "camelcase": { - "version": "4.1.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-4.1.0.tgz", - "integrity": "sha1-1UVjW+HjPFQmScaRc+Xeas+uNN0=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-5.0.0.tgz", + "integrity": "sha512-faqwZqnWxbxn+F1d399ygeamQNy3lPp/H9H6rNrqYh4FSVCtcY+3cub1MxA8o9mDd55mM8Aghuu/kuyYA6VTsA==", "dev": true }, "cliui": { @@ -259,15 +259,6 @@ "which": "^1.2.9" } }, - "decamelize": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/decamelize/-/decamelize-2.0.0.tgz", - "integrity": "sha512-Ikpp5scV3MSYxY39ymh45ZLEecsTdv/Xj2CaQfI8RLMuwi7XvjX9H/fhraiSuU+C5w5NTDu4ZU72xNiZnurBPg==", - "dev": true, - "requires": { - "xregexp": "4.0.0" - } - }, "execa": { "version": "0.10.0", "resolved": "https://registry.npmjs.org/execa/-/execa-0.10.0.tgz", @@ -389,13 +380,13 @@ } }, "yargs": { - "version": "12.0.2", - "resolved": "https://registry.npmjs.org/yargs/-/yargs-12.0.2.tgz", - "integrity": "sha512-e7SkEx6N6SIZ5c5H22RTZae61qtn3PYUE8JYbBFlK9sYmh3DMQ6E5ygtaG/2BW0JZi4WGgTR2IV5ChqlqrDGVQ==", + "version": "12.0.5", + "resolved": "https://registry.npmjs.org/yargs/-/yargs-12.0.5.tgz", + "integrity": "sha512-Lhz8TLaYnxq/2ObqHDql8dX8CJi97oHxrjUcYtzKbbykPtVW9WB+poxI+NM2UIzsMgNCZTIf0AQwsjK5yMAqZw==", "dev": true, "requires": { "cliui": "^4.0.0", - "decamelize": "^2.0.0", + "decamelize": "^1.2.0", "find-up": "^3.0.0", "get-caller-file": "^1.0.1", "os-locale": "^3.0.0", @@ -405,42 +396,43 @@ 
"string-width": "^2.0.0", "which-module": "^2.0.0", "y18n": "^3.2.1 || ^4.0.0", - "yargs-parser": "^10.1.0" + "yargs-parser": "^11.1.1" } }, "yargs-parser": { - "version": "10.1.0", - "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-10.1.0.tgz", - "integrity": "sha512-VCIyR1wJoEBZUqk5PA+oOBF6ypbwh5aNB3I50guxAL/quggdfs4TtNHQrSazFA3fYZ+tEqfs0zIGlv0c/rgjbQ==", + "version": "11.1.1", + "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-11.1.1.tgz", + "integrity": "sha512-C6kB/WJDiaxONLJQnF8ccx9SEeoTTLek8RVbaOIsrAUS8VrBEXfmeSnCZxygc+XC2sNMBIwOOnfcxiynjHsVSQ==", "dev": true, "requires": { - "camelcase": "^4.1.0" + "camelcase": "^5.0.0", + "decamelize": "^1.2.0" } } } }, "@lerna/collect-updates": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/@lerna/collect-updates/-/collect-updates-3.3.2.tgz", - "integrity": "sha512-9WyBJI2S5sYgEZEScu525Lbi6nknNrdBKop35sCDIC9y6AIGvH6Dr5tkTd+Kg3n1dE+kHwW/xjERkx3+h7th3w==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/collect-updates/-/collect-updates-3.5.0.tgz", + "integrity": "sha512-rFCng14K8vHyrDJSAacj6ABKKT/TxZdpL9uPEtZN7DsoJKlKPzqFeRvRGA2+ed/I6mEm4ltauEjEpKG5O6xqtw==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/describe-ref": "^3.3.0", + "@lerna/describe-ref": "^3.5.0", "minimatch": "^3.0.4", "npmlog": "^4.1.2", "slash": "^1.0.0" } }, "@lerna/command": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/command/-/command-3.3.0.tgz", - "integrity": "sha512-NTOkLEKlWcBLHSvUr9tzVpV7RJ4GROLeOuZ6RfztGOW/31JPSwVVBD2kPifEXNZunldOx5GVWukR+7+NpAWhsg==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/command/-/command-3.5.0.tgz", + "integrity": "sha512-C/0e7qPbuKZ9vEqzRePksoKDJk4TOWzsU5qaPP/ikqc6vClJbKucsIehk3za6glSjlgLCJpzBTF2lFjHfb+JNw==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", "@lerna/package-graph": "^3.1.2", - "@lerna/project": "^3.0.0", + "@lerna/project": "^3.5.0", "@lerna/validation-error": "^3.0.0", "@lerna/write-log-file": "^3.0.0", "dedent": "^0.7.0", @@ -490,15 +482,15 @@ } }, "@lerna/conventional-commits": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/conventional-commits/-/conventional-commits-3.4.1.tgz", - "integrity": "sha512-3NETrA58aUkaEW3RdwdJ766Bg9NVpLzb26mtdlsJQcvB5sQBWH5dJSHIVQH1QsGloBeH2pE/mDUEVY8ZJXuR4w==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/conventional-commits/-/conventional-commits-3.5.0.tgz", + "integrity": "sha512-roKPILPYnDWiCDxOeBQ0cObJ2FbDgzJSToxr1ZwIqvJU5hGQ4RmooCf8GHcCW9maBJz7ETeestv8M2mBUgBPbg==", "dev": true, "requires": { "@lerna/validation-error": "^3.0.0", - "conventional-changelog-angular": "^5.0.1", - "conventional-changelog-core": "^3.1.0", - "conventional-recommended-bump": "^4.0.1", + "conventional-changelog-angular": "^5.0.2", + "conventional-changelog-core": "^3.1.5", + "conventional-recommended-bump": "^4.0.4", "fs-extra": "^7.0.0", "get-stream": "^4.0.0", "npm-package-arg": "^6.0.0", @@ -518,13 +510,13 @@ } }, "@lerna/create": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/create/-/create-3.4.1.tgz", - "integrity": "sha512-l+4t2SRO5nvW0MNYY+EWxbaMHsAN8bkWH3nyt7EzhBjs4+TlRAJRIEqd8o9NWznheE3pzwczFz1Qfl3BWbyM5A==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/create/-/create-3.5.0.tgz", + "integrity": "sha512-ek4flHRmpMegZp9tP3RmuDhmMb9+/Hhy9B5eaZc5X5KWqDvFKJtn56sw+M9hNjiYehiimCwhaLWgE2WSikPvcQ==", "dev": true, "requires": { 
"@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.3.0", + "@lerna/command": "^3.5.0", "@lerna/npm-conf": "^3.4.1", "@lerna/validation-error": "^3.0.0", "camelcase": "^4.1.0", @@ -587,9 +579,9 @@ } }, "@lerna/describe-ref": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/describe-ref/-/describe-ref-3.3.0.tgz", - "integrity": "sha512-4t7M4OupnYMSPNLrLUau8qkS+dgLEi4w+DkRkV0+A+KNYga1W0jVgNLPIIsxta7OHfodPkCNAqZCzNCw/dmAwA==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/describe-ref/-/describe-ref-3.5.0.tgz", + "integrity": "sha512-XvecK2PSwUv4z+otib5moWJMI+h3mtAg8nFlfo4KbivVtD/sI11jfKsr3S75HuAwhVAa8tAijoAxmuBJSsTE1g==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", @@ -597,38 +589,38 @@ } }, "@lerna/diff": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/diff/-/diff-3.3.0.tgz", - "integrity": "sha512-sIoMjsm3NVxvmt6ofx8Uu/2fxgldQqLl0zmC9X1xW00j831o5hBffx1EoKj9CnmaEvoSP6j/KFjxy2RWjebCIg==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/diff/-/diff-3.5.0.tgz", + "integrity": "sha512-iyZ0ZRPqH5Y5XEhOYoKS8H/8UXC/gZ/idlToMFHhUn1oTSd8v9HVU1c2xq1ge0u36ZH/fx/YydUk0A/KSv+p3Q==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.3.0", + "@lerna/command": "^3.5.0", "@lerna/validation-error": "^3.0.0", "npmlog": "^4.1.2" } }, "@lerna/exec": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/@lerna/exec/-/exec-3.3.2.tgz", - "integrity": "sha512-mN6vGxNir7JOGvWLwKr3DW3LNy1ecCo2ziZj5rO9Mw5Rew3carUu1XLmhF/4judtsvXViUY+rvGIcqHe0vvb+w==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/exec/-/exec-3.5.0.tgz", + "integrity": "sha512-H5jeIueDiuNsxeuGKaP7HqTcenvMsFfBFeWr0W6knHv9NrOF8il34dBqYgApZEDSQ7+2fA3ghwWbF+jUGTSh/A==", "dev": true, "requires": { "@lerna/batch-packages": "^3.1.2", "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.3.0", - "@lerna/filter-options": "^3.3.2", + "@lerna/command": "^3.5.0", + "@lerna/filter-options": "^3.5.0", "@lerna/run-parallel-batches": "^3.0.0", "@lerna/validation-error": "^3.0.0" } }, "@lerna/filter-options": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/@lerna/filter-options/-/filter-options-3.3.2.tgz", - "integrity": "sha512-0WHqdDgAnt5WKoByi1q+lFw8HWt5tEKP2DnLlGqWv3YFwVF5DsPRlO7xbzjY9sJgvyJtZcnkMtccdBPFhGGyIQ==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/filter-options/-/filter-options-3.5.0.tgz", + "integrity": "sha512-7pEQy1i5ynYOYjcSeo+Qaps4+Ais55RRdnT6/SLLBgyyHAMziflFLX5TnoyEaaXoU90iKfQ5z/ioEp6dFAXSMg==", "dev": true, "requires": { - "@lerna/collect-updates": "^3.3.2", + "@lerna/collect-updates": "^3.5.0", "@lerna/filter-packages": "^3.0.0", "dedent": "^0.7.0" } @@ -670,13 +662,13 @@ } }, "@lerna/import": { - "version": "3.3.1", - "resolved": "https://registry.npmjs.org/@lerna/import/-/import-3.3.1.tgz", - "integrity": "sha512-2OzTQDkYKbBPpyP2iOI1sWfcvMjNLjjHjmREq/uOWJaSIk5J3Ukt71OPpcOHh4V2CBOlXidCcO+Hyb4FVIy8fw==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/import/-/import-3.5.0.tgz", + "integrity": "sha512-vgI6lMEzd1ODgi75cmAlfPYylaK37WY3E2fwKyO/lj6UKSGj46dVSK0KwTRHx33tu4PLvPzFi5C6nbY57o5ykQ==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.3.0", + "@lerna/command": "^3.5.0", "@lerna/prompt": "^3.3.1", "@lerna/validation-error": "^3.0.0", "dedent": "^0.7.0", @@ -685,25 +677,25 @@ } }, "@lerna/init": { - "version": "3.3.0", - "resolved": 
"https://registry.npmjs.org/@lerna/init/-/init-3.3.0.tgz", - "integrity": "sha512-HvgRLkIG6nDIeAO6ix5sUVIVV+W9UMk2rSSmFT66CDOefRi7S028amiyYnFUK1QkIAaUbVUyOnYaErtbJwICuw==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/init/-/init-3.5.0.tgz", + "integrity": "sha512-V21/UWj34Mph+9NxIGH1kYcuJAp+uFjfG8Ku2nMy62OGL3553+YQ+Izr+R6egY8y/99UMCDpi5gkQni5eGv3MA==", "dev": true, "requires": { "@lerna/child-process": "^3.3.0", - "@lerna/command": "^3.3.0", + "@lerna/command": "^3.5.0", "fs-extra": "^7.0.0", "p-map": "^1.2.0", "write-json-file": "^2.3.0" } }, "@lerna/link": { - "version": "3.3.0", - "resolved": "https://registry.npmjs.org/@lerna/link/-/link-3.3.0.tgz", - "integrity": "sha512-8CeXzGL7okrsVXsy2sHXI2KuBaczw3cblAnA2+FJPUqSKMPNbUTRzeU3bOlCjYtK0LbxC4ngENJTL3jJ8RaYQQ==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/link/-/link-3.5.0.tgz", + "integrity": "sha512-KSu1mhxwNRmguqMqUTJd4c7QIk9/xmxJxbmMkA71OaJd4fwondob6DyI/B17NIWutdLbvSWQ7pRlFOPxjQVoUw==", "dev": true, "requires": { - "@lerna/command": "^3.3.0", + "@lerna/command": "^3.5.0", "@lerna/package-graph": "^3.1.2", "@lerna/symlink-dependencies": "^3.3.0", "p-map": "^1.2.0", @@ -711,13 +703,13 @@ } }, "@lerna/list": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/@lerna/list/-/list-3.3.2.tgz", - "integrity": "sha512-XXEVy7w+i/xx8NeJmGirw4upEoEF9OfD6XPLjISNQc24VgQV+frXdVJ02QcP7Y/PkY1rdIVrOjvo3ipKVLUxaQ==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/list/-/list-3.5.0.tgz", + "integrity": "sha512-T+NZBQ/l6FmZklgrtFuN7luMs3AC/BoS52APOPrM7ZmxW4nenvov0xMwQW1783w/t365YDkDlYd5gM0nX3D1Hg==", "dev": true, "requires": { - "@lerna/command": "^3.3.0", - "@lerna/filter-options": "^3.3.2", + "@lerna/command": "^3.5.0", + "@lerna/filter-options": "^3.5.0", "@lerna/listable": "^3.0.0", "@lerna/output": "^3.0.0" } @@ -837,9 +829,9 @@ } }, "@lerna/project": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/@lerna/project/-/project-3.0.0.tgz", - "integrity": "sha512-XhDFVfqj79jG2Speggd15RpYaE8uiR25UKcQBDmumbmqvTS7xf2cvl2pq2UTvDafaJ0YwFF3xkxQZeZnFMwdkw==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/project/-/project-3.5.0.tgz", + "integrity": "sha512-uFDzqwrD7a/tTohQoo0voTsRy2cgl9D1ZOU2pHZzHzow9S1M8E0x5q3hJI2HlwsZry9IUugmDUGO6UddTjwm3Q==", "dev": true, "requires": { "@lerna/package": "^3.0.0", @@ -943,17 +935,17 @@ } }, "@lerna/publish": { - "version": "3.4.3", - "resolved": "https://registry.npmjs.org/@lerna/publish/-/publish-3.4.3.tgz", - "integrity": "sha512-baeRL8xmOR25p86cAaS9mL0jdRzdv4dUo04PlK2Wes+YlL705F55cSXeC9npNie+9rGwFyLzCTQe18WdbZyLuw==", + "version": "3.5.1", + "resolved": "https://registry.npmjs.org/@lerna/publish/-/publish-3.5.1.tgz", + "integrity": "sha512-ltw2YdWWzev9cZRAzons5ywZh9NJARPX67meeA95oMDVMrhD4Y9VHQNJ3T8ueec/W78/4sKlMSr3ecWyPNp5bg==", "dev": true, "requires": { "@lerna/batch-packages": "^3.1.2", - "@lerna/check-working-tree": "^3.3.0", + "@lerna/check-working-tree": "^3.5.0", "@lerna/child-process": "^3.3.0", - "@lerna/collect-updates": "^3.3.2", - "@lerna/command": "^3.3.0", - "@lerna/describe-ref": "^3.3.0", + "@lerna/collect-updates": "^3.5.0", + "@lerna/command": "^3.5.0", + "@lerna/describe-ref": "^3.5.0", "@lerna/get-npm-exec-opts": "^3.0.0", "@lerna/npm-conf": "^3.4.1", "@lerna/npm-dist-tag": "^3.3.0", @@ -963,7 +955,7 @@ "@lerna/run-lifecycle": "^3.4.1", "@lerna/run-parallel-batches": "^3.0.0", "@lerna/validation-error": "^3.0.0", - "@lerna/version": "^3.4.1", + "@lerna/version": 
"^3.5.0", "fs-extra": "^7.0.0", "libnpmaccess": "^3.0.0", "npm-package-arg": "^6.0.0", @@ -1000,17 +992,18 @@ } }, "@lerna/run": { - "version": "3.3.2", - "resolved": "https://registry.npmjs.org/@lerna/run/-/run-3.3.2.tgz", - "integrity": "sha512-cruwRGZZWnQ5I0M+AqcoT3Xpq2wj3135iVw4n59/Op6dZu50sMFXZNLiTTTZ15k8rTKjydcccJMdPSpTHbH7/A==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/run/-/run-3.5.0.tgz", + "integrity": "sha512-BnPD52tj794xG2Xsc4FvgksyFX2CLmSR28TZw/xASEuy14NuQYMZkvbaj61SEhyOEsq7pLhHE5PpfbIv2AIFJw==", "dev": true, "requires": { "@lerna/batch-packages": "^3.1.2", - "@lerna/command": "^3.3.0", - "@lerna/filter-options": "^3.3.2", + "@lerna/command": "^3.5.0", + "@lerna/filter-options": "^3.5.0", "@lerna/npm-run-script": "^3.3.0", "@lerna/output": "^3.0.0", "@lerna/run-parallel-batches": "^3.0.0", + "@lerna/timer": "^3.5.0", "@lerna/validation-error": "^3.0.0", "p-map": "^1.2.0" } @@ -1114,6 +1107,12 @@ "p-map-series": "^1.0.0" } }, + "@lerna/timer": { + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/timer/-/timer-3.5.0.tgz", + "integrity": "sha512-TAb99hqQN6E3JBGtG9iyZNPq1/DbmqgBOeNrKtdJsGvIeX/NGLgUDWMrj2h04V4O+jpBFmSf6HIld6triKmxCA==", + "dev": true + }, "@lerna/validation-error": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/@lerna/validation-error/-/validation-error-3.0.0.tgz", @@ -1124,17 +1123,17 @@ } }, "@lerna/version": { - "version": "3.4.1", - "resolved": "https://registry.npmjs.org/@lerna/version/-/version-3.4.1.tgz", - "integrity": "sha512-oefNaQLBJSI2WLZXw5XxDXk4NyF5/ct0V9ys/J308NpgZthPgwRPjk9ZR0o1IOxW1ABi6z3E317W/dxHDjvAkg==", + "version": "3.5.0", + "resolved": "https://registry.npmjs.org/@lerna/version/-/version-3.5.0.tgz", + "integrity": "sha512-vxuGkUSfjJuvOIgPG7SDXVmk4GPwJF9F+uhDW9T/wJzTk4UaxL37GpBeJDo43eutQ7mwluP+t88Luwf8S3WXlA==", "dev": true, "requires": { "@lerna/batch-packages": "^3.1.2", - "@lerna/check-working-tree": "^3.3.0", + "@lerna/check-working-tree": "^3.5.0", "@lerna/child-process": "^3.3.0", - "@lerna/collect-updates": "^3.3.2", - "@lerna/command": "^3.3.0", - "@lerna/conventional-commits": "^3.4.1", + "@lerna/collect-updates": "^3.5.0", + "@lerna/command": "^3.5.0", + "@lerna/conventional-commits": "^3.5.0", "@lerna/output": "^3.0.0", "@lerna/prompt": "^3.3.1", "@lerna/run-lifecycle": "^3.4.1", @@ -1173,9 +1172,9 @@ } }, "@nodelib/fs.stat": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-1.1.2.tgz", - "integrity": "sha512-yprFYuno9FtNsSHVlSWd+nRlmGoAbqbeCwOryP6sC/zoCjhpArcRMYp19EvpSUSizJAlsXEwJv+wcWS9XaXdMw==", + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/@nodelib/fs.stat/-/fs.stat-1.1.3.tgz", + "integrity": "sha512-shAmDyaQC4H92APFoIaVDHCx5bStIocgvbwQyxPRrbUY20V1EYTbSDchWbuwlMG3V17cprZhA6+78JfB+3DTPw==", "dev": true }, "@samverschueren/stream-to-observable": { @@ -1282,9 +1281,9 @@ "dev": true }, "@types/lodash": { - "version": "4.14.117", - "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.117.tgz", - "integrity": "sha512-xyf2m6tRbz8qQKcxYZa7PA4SllYcay+eh25DN3jmNYY6gSTL7Htc/bttVdkqj2wfJGbeWlQiX8pIyJpKU+tubw==", + "version": "4.14.118", + "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.14.118.tgz", + "integrity": "sha512-iiJbKLZbhSa6FYRip/9ZDX6HXhayXLDGY2Fqws9cOkEQ6XeKfaxB0sC541mowZJueYyMnVUmmG+al5/4fCDrgw==", "dev": true }, "@types/marked": { @@ -2697,9 +2696,9 @@ "dev": true }, "camelcase": { - "version": "2.1.1", - "resolved": 
"https://registry.npmjs.org/camelcase/-/camelcase-2.1.1.tgz", - "integrity": "sha1-fB0W1nmhu+WcoCys7PsBHiAfWh8=", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-3.0.0.tgz", + "integrity": "sha1-MvxLn82vhF/N9+c7uXysImHwqwo=", "dev": true }, "camelcase-keys": { @@ -3098,9 +3097,9 @@ "dev": true }, "conventional-changelog-angular": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/conventional-changelog-angular/-/conventional-changelog-angular-5.0.1.tgz", - "integrity": "sha512-q4ylJ68fWZDdrFC9z4zKcf97HW6hp7Mo2YlqD4owfXhecFKy/PJCU/1oVFF4TqochchChqmZ0Vb0e0g8/MKNlA==", + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/conventional-changelog-angular/-/conventional-changelog-angular-5.0.2.tgz", + "integrity": "sha512-yx7m7lVrXmt4nKWQgWZqxSALEiAKZhOAcbxdUaU9575mB0CzXVbgrgpfSnSP7OqWDUTYGD0YVJ0MSRdyOPgAwA==", "dev": true, "requires": { "compare-func": "^1.3.1", @@ -3108,23 +3107,23 @@ } }, "conventional-changelog-core": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/conventional-changelog-core/-/conventional-changelog-core-3.1.0.tgz", - "integrity": "sha512-bcZkcFXkqVgG2W8m/1wjlp2wn/BKDcrPgw3/mvSEQtzs8Pax8JbAPFpEQReHY92+EKNNXC67wLA8y2xcNx0rDA==", + "version": "3.1.5", + "resolved": "https://registry.npmjs.org/conventional-changelog-core/-/conventional-changelog-core-3.1.5.tgz", + "integrity": "sha512-iwqAotS4zk0wA4S84YY1JCUG7X3LxaRjJxuUo6GI4dZuIy243j5nOg/Ora35ExT4DOiw5dQbMMQvw2SUjh6moQ==", "dev": true, "requires": { - "conventional-changelog-writer": "^4.0.0", - "conventional-commits-parser": "^3.0.0", + "conventional-changelog-writer": "^4.0.2", + "conventional-commits-parser": "^3.0.1", "dateformat": "^3.0.0", "get-pkg-repo": "^1.0.0", - "git-raw-commits": "^2.0.0", + "git-raw-commits": "2.0.0", "git-remote-origin-url": "^2.0.0", - "git-semver-tags": "^2.0.0", + "git-semver-tags": "^2.0.2", "lodash": "^4.2.1", "normalize-package-data": "^2.3.5", "q": "^1.5.1", - "read-pkg": "^1.1.0", - "read-pkg-up": "^1.0.1", + "read-pkg": "^3.0.0", + "read-pkg-up": "^3.0.0", "through2": "^2.0.0" }, "dependencies": { @@ -3133,23 +3132,81 @@ "resolved": "https://registry.npmjs.org/dateformat/-/dateformat-3.0.3.tgz", "integrity": "sha512-jyCETtSl3VMZMWeRo7iY1FL19ges1t55hMo5yaam4Jrsm5EPL89UQkoQRyiI+Yf4k8r2ZpdngkV8hr1lIdjb3Q==", "dev": true + }, + "load-json-file": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/load-json-file/-/load-json-file-4.0.0.tgz", + "integrity": "sha1-L19Fq5HjMhYjT9U62rZo607AmTs=", + "dev": true, + "requires": { + "graceful-fs": "^4.1.2", + "parse-json": "^4.0.0", + "pify": "^3.0.0", + "strip-bom": "^3.0.0" + } + }, + "parse-json": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-4.0.0.tgz", + "integrity": "sha1-vjX1Qlvh9/bHRxhPmKeIy5lHfuA=", + "dev": true, + "requires": { + "error-ex": "^1.3.1", + "json-parse-better-errors": "^1.0.1" + } + }, + "path-type": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/path-type/-/path-type-3.0.0.tgz", + "integrity": "sha512-T2ZUsdZFHgA3u4e5PfPbjd7HDDpxPnQb5jN0SrDsjNSuVXHJqtwTnWqG0B1jZrgmJ/7lj1EmVIByWt1gxGkWvg==", + "dev": true, + "requires": { + "pify": "^3.0.0" + } + }, + "read-pkg": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/read-pkg/-/read-pkg-3.0.0.tgz", + "integrity": "sha1-nLxoaXj+5l0WwA4rGcI3/Pbjg4k=", + "dev": true, + "requires": { + "load-json-file": "^4.0.0", + "normalize-package-data": "^2.3.2", + "path-type": "^3.0.0" + } + }, + "read-pkg-up": { + "version": 
"3.0.0", + "resolved": "https://registry.npmjs.org/read-pkg-up/-/read-pkg-up-3.0.0.tgz", + "integrity": "sha1-PtSWaF26D4/hGNBpHcUfSh/5bwc=", + "dev": true, + "requires": { + "find-up": "^2.0.0", + "read-pkg": "^3.0.0" + } + }, + "strip-bom": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-3.0.0.tgz", + "integrity": "sha1-IzTBjpx1n3vdVv3vfprj1YjmjtM=", + "dev": true } } }, "conventional-changelog-preset-loader": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/conventional-changelog-preset-loader/-/conventional-changelog-preset-loader-2.0.1.tgz", - "integrity": "sha512-HiSfhXNzAzG9klIqJaA97MMiNBR4js+53g4Px0k7tgKeCNVXmrDrm+CY+nIqcmG5NVngEPf8rAr7iji1TWW7zg==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/conventional-changelog-preset-loader/-/conventional-changelog-preset-loader-2.0.2.tgz", + "integrity": "sha512-pBY+qnUoJPXAXXqVGwQaVmcye05xi6z231QM98wHWamGAmu/ghkBprQAwmF5bdmyobdVxiLhPY3PrCfSeUNzRQ==", "dev": true }, "conventional-changelog-writer": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/conventional-changelog-writer/-/conventional-changelog-writer-4.0.0.tgz", - "integrity": "sha512-hMZPe0AQ6Bi05epeK/7hz80xxk59nPA5z/b63TOHq2wigM0/akreOc8N4Jam5b9nFgKWX1e9PdPv2ewgW6bcfg==", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/conventional-changelog-writer/-/conventional-changelog-writer-4.0.2.tgz", + "integrity": "sha512-d8/FQY/fix2xXEBUhOo8u3DCbyEw3UOQgYHxLsPDw+wHUDma/GQGAGsGtoH876WyNs32fViHmTOUrgRKVLvBug==", "dev": true, "requires": { "compare-func": "^1.3.1", - "conventional-commits-filter": "^2.0.0", + "conventional-commits-filter": "^2.0.1", "dateformat": "^3.0.0", "handlebars": "^4.0.2", "json-stringify-safe": "^5.0.1", @@ -3169,9 +3226,9 @@ } }, "conventional-commits-filter": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/conventional-commits-filter/-/conventional-commits-filter-2.0.0.tgz", - "integrity": "sha512-Cfl0j1/NquB/TMVx7Wrmyq7uRM+/rPQbtVVGwzfkhZ6/yH6fcMmP0Q/9044TBZPTNdGzm46vXFXL14wbET0/Mg==", + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/conventional-commits-filter/-/conventional-commits-filter-2.0.1.tgz", + "integrity": "sha512-92OU8pz/977udhBjgPEbg3sbYzIxMDFTlQT97w7KdhR9igNqdJvy8smmedAAgn4tPiqseFloKkrVfbXCVd+E7A==", "dev": true, "requires": { "is-subset": "^0.1.1", @@ -3179,9 +3236,9 @@ } }, "conventional-commits-parser": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/conventional-commits-parser/-/conventional-commits-parser-3.0.0.tgz", - "integrity": "sha512-GWh71U26BLWgMykCp+VghZ4s64wVbtseECcKQ/PvcPZR2cUnz+FUc2J9KjxNl7/ZbCxST8R03c9fc+Vi0umS9Q==", + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/conventional-commits-parser/-/conventional-commits-parser-3.0.1.tgz", + "integrity": "sha512-P6U5UOvDeidUJ8ebHVDIoXzI7gMlQ1OF/id6oUvp8cnZvOXMt1n8nYl74Ey9YMn0uVQtxmCtjPQawpsssBWtGg==", "dev": true, "requires": { "JSONStream": "^1.0.4", @@ -3194,17 +3251,17 @@ } }, "conventional-recommended-bump": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/conventional-recommended-bump/-/conventional-recommended-bump-4.0.1.tgz", - "integrity": "sha512-9waJvW01TUs4HQJ3khwGSSlTlKsY+5u7OrxHL+oWEoGNvaNO/0qL6qqnhS3J0Fq9fNKA9bmlf5cOXjCQoW+I4Q==", + "version": "4.0.4", + "resolved": "https://registry.npmjs.org/conventional-recommended-bump/-/conventional-recommended-bump-4.0.4.tgz", + "integrity": "sha512-9mY5Yoblq+ZMqJpBzgS+RpSq+SUfP2miOR3H/NR9drGf08WCrY9B6HAGJZEm6+ThsVP917VHAahSOjM6k1vhPg==", "dev": 
true, "requires": { "concat-stream": "^1.6.0", - "conventional-changelog-preset-loader": "^2.0.1", - "conventional-commits-filter": "^2.0.0", - "conventional-commits-parser": "^3.0.0", - "git-raw-commits": "^2.0.0", - "git-semver-tags": "^2.0.0", + "conventional-changelog-preset-loader": "^2.0.2", + "conventional-commits-filter": "^2.0.1", + "conventional-commits-parser": "^3.0.1", + "git-raw-commits": "2.0.0", + "git-semver-tags": "^2.0.2", "meow": "^4.0.0", "q": "^1.5.1" } @@ -3366,7 +3423,7 @@ }, "cross-spawn-async": { "version": "2.2.5", - "resolved": "https://registry.npmjs.org/cross-spawn-async/-/cross-spawn-async-2.2.5.tgz", + "resolved": "http://registry.npmjs.org/cross-spawn-async/-/cross-spawn-async-2.2.5.tgz", "integrity": "sha1-hF/wwINKPe2dFg2sptOQkGuyiMw=", "dev": true, "requires": { @@ -3445,7 +3502,7 @@ }, "d": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/d/-/d-1.0.0.tgz", + "resolved": "http://registry.npmjs.org/d/-/d-1.0.0.tgz", "integrity": "sha1-dUu1v+VUUdpppYuU1F9MWwRi1Y8=", "dev": true, "requires": { @@ -3471,13 +3528,13 @@ } }, "data-urls": { - "version": "1.0.1", - "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-1.0.1.tgz", - "integrity": "sha512-0HdcMZzK6ubMUnsMmQmG0AcLQPvbvb47R0+7CCZQCYgcd8OUWG91CG7sM6GoXgjz+WLl4ArFzHtBMy/QqSF4eg==", + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-1.1.0.tgz", + "integrity": "sha512-YTWYI9se1P55u58gL5GkQHW4P6VJBJ5iBT+B5a7i2Tjadhv52paJG0qHX4A0OR6/t52odI64KP2YvFpkDOi3eQ==", "dev": true, "requires": { "abab": "^2.0.0", - "whatwg-mimetype": "^2.1.0", + "whatwg-mimetype": "^2.2.0", "whatwg-url": "^7.0.0" }, "dependencies": { @@ -4055,7 +4112,7 @@ }, "es6-promisify": { "version": "5.0.0", - "resolved": "https://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", + "resolved": "http://registry.npmjs.org/es6-promisify/-/es6-promisify-5.0.0.tgz", "integrity": "sha1-UQnWLz5W6pZ8S2NQWu8IKRyKUgM=", "dev": true, "requires": { @@ -4170,22 +4227,6 @@ "es5-ext": "~0.10.14" } }, - "event-stream": { - "version": "3.3.6", - "resolved": "https://registry.npmjs.org/event-stream/-/event-stream-3.3.6.tgz", - "integrity": "sha512-dGXNg4F/FgVzlApjzItL+7naHutA3fDqbV/zAZqDDlXTjiMnQmZKu+prImWKszeBM5UQeGvAl3u1wBiKeDh61g==", - "dev": true, - "requires": { - "duplexer": "^0.1.1", - "flatmap-stream": "^0.1.0", - "from": "^0.1.7", - "map-stream": "0.0.7", - "pause-stream": "^0.0.11", - "split": "^1.0.1", - "stream-combiner": "^0.2.2", - "through": "^2.3.8" - } - }, "events": { "version": "1.1.1", "resolved": "http://registry.npmjs.org/events/-/events-1.1.1.tgz", @@ -4345,21 +4386,21 @@ }, "fast-deep-equal": { "version": "1.1.0", - "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", + "resolved": "http://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-1.1.0.tgz", "integrity": "sha1-wFNHeBfIa1HaqFPIHgWbcz0CNhQ=", "dev": true }, "fast-glob": { - "version": "2.2.3", - "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-2.2.3.tgz", - "integrity": "sha512-NiX+JXjnx43RzvVFwRWfPKo4U+1BrK5pJPsHQdKMlLoFHrrGktXglQhHliSihWAq+m1z6fHk3uwGHrtRbS9vLA==", + "version": "2.2.4", + "resolved": "https://registry.npmjs.org/fast-glob/-/fast-glob-2.2.4.tgz", + "integrity": "sha512-FjK2nCGI/McyzgNtTESqaWP3trPvHyRyoyY70hxjc3oKPNmDe8taohLZpoVKoUjW85tbU5txaYUZCNtVzygl1g==", "dev": true, "requires": { "@mrmlnc/readdir-enhanced": "^2.2.1", - "@nodelib/fs.stat": "^1.0.1", + "@nodelib/fs.stat": "^1.1.2", "glob-parent": "^3.1.0", "is-glob": "^4.0.0", - "merge2": 
"^1.2.1", + "merge2": "^1.2.3", "micromatch": "^3.1.10" }, "dependencies": { @@ -5162,12 +5203,6 @@ "resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.10.2.tgz", "integrity": "sha512-VK7lHZF/corkykjXZ0+dqViI8Wk1YpwPCFN2wrnTs+PMCMG5+uHRvkRW14fuA7Smkhkgx+Dj5UdS3YXktJL+qw==" }, - "flatmap-stream": { - "version": "0.1.1", - "resolved": "https://registry.npmjs.org/flatmap-stream/-/flatmap-stream-0.1.1.tgz", - "integrity": "sha512-lAq4tLbm3sidmdCN8G3ExaxH7cUCtP5mgDvrYowsx84dcYkJJ4I28N7gkxA6+YlSXzaGLJYIDEi9WGfXzMiXdw==", - "dev": true - }, "flush-write-stream": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/flush-write-stream/-/flush-write-stream-1.0.3.tgz", @@ -5219,12 +5254,6 @@ "map-cache": "^0.2.2" } }, - "from": { - "version": "0.1.7", - "resolved": "https://registry.npmjs.org/from/-/from-0.1.7.tgz", - "integrity": "sha1-g8YK/Fi5xWmXAH7Rp2izqzA6RP4=", - "dev": true - }, "from2": { "version": "2.3.0", "resolved": "https://registry.npmjs.org/from2/-/from2-2.3.0.tgz", @@ -5236,9 +5265,9 @@ } }, "fs-extra": { - "version": "7.0.0", - "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-7.0.0.tgz", - "integrity": "sha512-EglNDLRpmaTWiD/qraZn6HREAEAHJcJOmxNEYwq6xeMKnVMAy3GUcFB+wXt2C6k4CNvB/mP1y/U3dzvKKj5OtQ==", + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-7.0.1.tgz", + "integrity": "sha512-YJDaCJZEnBmcbw13fvdAM9AwNOJwOzrE4pqMqBq5nFiEqXUqHwlK4B+3pUw6JNvfSPtX05xFHtYy/1ni01eGCw==", "dev": true, "requires": { "graceful-fs": "^4.1.2", @@ -5296,28 +5325,24 @@ "dependencies": { "abbrev": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/abbrev/-/abbrev-1.1.1.tgz", - "integrity": "sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==", + "bundled": true, "dev": true, "optional": true }, "ansi-regex": { "version": "2.1.1", - "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-2.1.1.tgz", - "integrity": "sha1-w7M6te42DYbg5ijwRorn7yfWVN8=", + "bundled": true, "dev": true }, "aproba": { "version": "1.2.0", - "resolved": "https://registry.npmjs.org/aproba/-/aproba-1.2.0.tgz", - "integrity": "sha512-Y9J6ZjXtoYh8RnXVCMOU/ttDmk1aBjunq9vO0ta5x85WDQiQfUF9sIPBITdbiiIVcBo03Hi3jMxigBtsddlXRw==", + "bundled": true, "dev": true, "optional": true }, "are-we-there-yet": { "version": "1.1.4", - "resolved": "https://registry.npmjs.org/are-we-there-yet/-/are-we-there-yet-1.1.4.tgz", - "integrity": "sha1-u13KOCu5TwXhUZQ3PRb9O6HKEQ0=", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5327,14 +5352,12 @@ }, "balanced-match": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.0.tgz", - "integrity": "sha1-ibTRmasr7kneFk6gK4nORi1xt2c=", + "bundled": true, "dev": true }, "brace-expansion": { "version": "1.1.11", - "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz", - "integrity": "sha512-iCuPHDFgrHX7H2vEI/5xpz07zSHB00TpugqhmYtVmMO6518mCuRMoOYFldEBl0g187ufozdaHgWKcYFb61qGiA==", + "bundled": true, "dev": true, "requires": { "balanced-match": "^1.0.0", @@ -5343,40 +5366,34 @@ }, "chownr": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/chownr/-/chownr-1.0.1.tgz", - "integrity": "sha1-4qdQQqlVGQi+vSW4Uj1fl2nXkYE=", + "bundled": true, "dev": true, "optional": true }, "code-point-at": { "version": "1.1.0", - "resolved": "https://registry.npmjs.org/code-point-at/-/code-point-at-1.1.0.tgz", - "integrity": "sha1-DQcLTQQ6W+ozovGkDi7bPZpMz3c=", + "bundled": true, "dev": true }, 
"concat-map": { "version": "0.0.1", - "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", - "integrity": "sha1-2Klr13/Wjfd5OnMDajug1UBdR3s=", + "bundled": true, "dev": true }, "console-control-strings": { "version": "1.1.0", - "resolved": "https://registry.npmjs.org/console-control-strings/-/console-control-strings-1.1.0.tgz", - "integrity": "sha1-PXz0Rk22RG6mRL9LOVB/mFEAjo4=", + "bundled": true, "dev": true }, "core-util-is": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.2.tgz", - "integrity": "sha1-tf1UIgqivFq1eqtxQMlAdUUDwac=", + "bundled": true, "dev": true, "optional": true }, "debug": { "version": "2.6.9", - "resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz", - "integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5385,29 +5402,25 @@ }, "deep-extend": { "version": "0.5.1", - "resolved": "https://registry.npmjs.org/deep-extend/-/deep-extend-0.5.1.tgz", - "integrity": "sha512-N8vBdOa+DF7zkRrDCsaOXoCs/E2fJfx9B9MrKnnSiHNh4ws7eSys6YQE4KvT1cecKmOASYQBhbKjeuDD9lT81w==", + "bundled": true, "dev": true, "optional": true }, "delegates": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/delegates/-/delegates-1.0.0.tgz", - "integrity": "sha1-hMbhWbgZBP3KWaDvRM2HDTElD5o=", + "bundled": true, "dev": true, "optional": true }, "detect-libc": { "version": "1.0.3", - "resolved": "https://registry.npmjs.org/detect-libc/-/detect-libc-1.0.3.tgz", - "integrity": "sha1-+hN8S9aY7fVc1c0CrFWfkaTEups=", + "bundled": true, "dev": true, "optional": true }, "fs-minipass": { "version": "1.2.5", - "resolved": "https://registry.npmjs.org/fs-minipass/-/fs-minipass-1.2.5.tgz", - "integrity": "sha512-JhBl0skXjUPCFH7x6x61gQxrKyXsxB5gcgePLZCwfyCGGsTISMoIeObbrvVeP6Xmyaudw4TT43qV2Gz+iyd2oQ==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5416,15 +5429,13 @@ }, "fs.realpath": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", - "integrity": "sha1-FQStJSMVjKpA20onh8sBQRmU6k8=", + "bundled": true, "dev": true, "optional": true }, "gauge": { "version": "2.7.4", - "resolved": "https://registry.npmjs.org/gauge/-/gauge-2.7.4.tgz", - "integrity": "sha1-LANAXHU4w51+s3sxcCLjJfsBi/c=", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5440,8 +5451,7 @@ }, "glob": { "version": "7.1.2", - "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.2.tgz", - "integrity": "sha512-MJTUg1kjuLeQCJ+ccE4Vpa6kKVXkPYJ2mOCQyUuKLcLQsdrMCpBPUi8qVE6+YuaJkozeA9NusTAw3hLr8Xe5EQ==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5455,15 +5465,13 @@ }, "has-unicode": { "version": "2.0.1", - "resolved": "https://registry.npmjs.org/has-unicode/-/has-unicode-2.0.1.tgz", - "integrity": "sha1-4Ob+aijPUROIVeCG0Wkedx3iqLk=", + "bundled": true, "dev": true, "optional": true }, "iconv-lite": { "version": "0.4.21", - "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.21.tgz", - "integrity": "sha512-En5V9za5mBt2oUA03WGD3TwDv0MKAruqsuxstbMUZaj9W9k/m1CV/9py3l0L5kw9Bln8fdHQmzHSYtvpvTLpKw==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5472,8 +5480,7 @@ }, "ignore-walk": { "version": "3.0.1", - "resolved": "https://registry.npmjs.org/ignore-walk/-/ignore-walk-3.0.1.tgz", - "integrity": "sha512-DTVlMx3IYPe0/JJcYP7Gxg7ttZZu3IInhuEhbchuqneY9wWe5Ojy2mXLBaQFUQmo0AW2r3qG7m1mg86js+gnlQ==", + "bundled": 
true, "dev": true, "optional": true, "requires": { @@ -5482,8 +5489,7 @@ }, "inflight": { "version": "1.0.6", - "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", - "integrity": "sha1-Sb1jMdfQLQwJvJEKEHW6gWW1bfk=", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5493,21 +5499,18 @@ }, "inherits": { "version": "2.0.3", - "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.3.tgz", - "integrity": "sha1-Yzwsg+PaQqUC9SRmAiSA9CCCYd4=", + "bundled": true, "dev": true }, "ini": { "version": "1.3.5", - "resolved": "https://registry.npmjs.org/ini/-/ini-1.3.5.tgz", - "integrity": "sha512-RZY5huIKCMRWDUqZlEi72f/lmXKMvuszcMBduliQ3nnWbx9X/ZBQO7DijMEYS9EhHBb2qacRUMtC7svLwe0lcw==", + "bundled": true, "dev": true, "optional": true }, "is-fullwidth-code-point": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-1.0.0.tgz", - "integrity": "sha1-754xOG8DGn8NZDr4L95QxFfvAMs=", + "bundled": true, "dev": true, "requires": { "number-is-nan": "^1.0.0" @@ -5515,15 +5518,13 @@ }, "isarray": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", - "integrity": "sha1-u5NdSFgsuhaMBoNJV6VKPgcSTxE=", + "bundled": true, "dev": true, "optional": true }, "minimatch": { "version": "3.0.4", - "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.0.4.tgz", - "integrity": "sha512-yJHVQEhyqPLUTgt9B83PXu6W3rx4MvvHvSUvToogpwoGDOUQ+yDrR0HRot+yOCdCO7u4hX3pWft6kWBBcqh0UA==", + "bundled": true, "dev": true, "requires": { "brace-expansion": "^1.1.7" @@ -5531,14 +5532,12 @@ }, "minimist": { "version": "0.0.8", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-0.0.8.tgz", - "integrity": "sha1-hX/Kv8M5fSYluCKCYuhqp6ARsF0=", + "bundled": true, "dev": true }, "minipass": { "version": "2.2.4", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-2.2.4.tgz", - "integrity": "sha512-hzXIWWet/BzWhYs2b+u7dRHlruXhwdgvlTMDKC6Cb1U7ps6Ac6yQlR39xsbjWJE377YTCtKwIXIpJ5oP+j5y8g==", + "bundled": true, "dev": true, "requires": { "safe-buffer": "^5.1.1", @@ -5547,8 +5546,7 @@ }, "minizlib": { "version": "1.1.0", - "resolved": "https://registry.npmjs.org/minizlib/-/minizlib-1.1.0.tgz", - "integrity": "sha512-4T6Ur/GctZ27nHfpt9THOdRZNgyJ9FZchYO1ceg5S8Q3DNLCKYy44nCZzgCJgcvx2UM8czmqak5BCxJMrq37lA==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5557,8 +5555,7 @@ }, "mkdirp": { "version": "0.5.1", - "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.1.tgz", - "integrity": "sha1-MAV0OOrGz3+MR2fzhkjWaX11yQM=", + "bundled": true, "dev": true, "requires": { "minimist": "0.0.8" @@ -5566,15 +5563,13 @@ }, "ms": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz", - "integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=", + "bundled": true, "dev": true, "optional": true }, "needle": { "version": "2.2.0", - "resolved": "https://registry.npmjs.org/needle/-/needle-2.2.0.tgz", - "integrity": "sha512-eFagy6c+TYayorXw/qtAdSvaUpEbBsDwDyxYFgLZ0lTojfH7K+OdBqAF7TAFwDokJaGpubpSGG0wO3iC0XPi8w==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5585,8 +5580,7 @@ }, "node-pre-gyp": { "version": "0.10.0", - "resolved": "https://registry.npmjs.org/node-pre-gyp/-/node-pre-gyp-0.10.0.tgz", - "integrity": "sha512-G7kEonQLRbcA/mOoFoxvlMrw6Q6dPf92+t/l0DFSMuSlDoWaI9JWIyPwK0jyE1bph//CUEL65/Fz1m2vJbmjQQ==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5604,8 +5598,7 @@ }, "nopt": { "version": "4.0.1", - 
"resolved": "https://registry.npmjs.org/nopt/-/nopt-4.0.1.tgz", - "integrity": "sha1-0NRoWv1UFRk8jHUFYC0NF81kR00=", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5615,15 +5608,13 @@ }, "npm-bundled": { "version": "1.0.3", - "resolved": "https://registry.npmjs.org/npm-bundled/-/npm-bundled-1.0.3.tgz", - "integrity": "sha512-ByQ3oJ/5ETLyglU2+8dBObvhfWXX8dtPZDMePCahptliFX2iIuhyEszyFk401PZUNQH20vvdW5MLjJxkwU80Ow==", + "bundled": true, "dev": true, "optional": true }, "npm-packlist": { "version": "1.1.10", - "resolved": "https://registry.npmjs.org/npm-packlist/-/npm-packlist-1.1.10.tgz", - "integrity": "sha512-AQC0Dyhzn4EiYEfIUjCdMl0JJ61I2ER9ukf/sLxJUcZHfo+VyEfz2rMJgLZSS1v30OxPQe1cN0LZA1xbcaVfWA==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5633,8 +5624,7 @@ }, "npmlog": { "version": "4.1.2", - "resolved": "https://registry.npmjs.org/npmlog/-/npmlog-4.1.2.tgz", - "integrity": "sha512-2uUqazuKlTaSI/dC8AzicUck7+IrEaOnN/e0jd3Xtt1KcGpwx30v50mL7oPyr/h9bL3E4aZccVwpwP+5W9Vjkg==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5646,21 +5636,18 @@ }, "number-is-nan": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/number-is-nan/-/number-is-nan-1.0.1.tgz", - "integrity": "sha1-CXtgK1NCKlIsGvuHkDGDNpQaAR0=", + "bundled": true, "dev": true }, "object-assign": { "version": "4.1.1", - "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz", - "integrity": "sha1-IQmtx5ZYh8/AXLvUQsrIv7s2CGM=", + "bundled": true, "dev": true, "optional": true }, "once": { "version": "1.4.0", - "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz", - "integrity": "sha1-WDsap3WWHUsROsF9nFC6753Xa9E=", + "bundled": true, "dev": true, "requires": { "wrappy": "1" @@ -5668,22 +5655,19 @@ }, "os-homedir": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-homedir/-/os-homedir-1.0.2.tgz", - "integrity": "sha1-/7xJiDNuDoM94MFox+8VISGqf7M=", + "bundled": true, "dev": true, "optional": true }, "os-tmpdir": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/os-tmpdir/-/os-tmpdir-1.0.2.tgz", - "integrity": "sha1-u+Z0BseaqFxc/sdm/lc0VV36EnQ=", + "bundled": true, "dev": true, "optional": true }, "osenv": { "version": "0.1.5", - "resolved": "https://registry.npmjs.org/osenv/-/osenv-0.1.5.tgz", - "integrity": "sha512-0CWcCECdMVc2Rw3U5w9ZjqX6ga6ubk1xDVKxtBQPK7wis/0F2r9T6k4ydGYhecl7YUBxBVxhL5oisPsNxAPe2g==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5693,22 +5677,19 @@ }, "path-is-absolute": { "version": "1.0.1", - "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", - "integrity": "sha1-F0uSaHNVNP+8es5r9TpanhtcX18=", + "bundled": true, "dev": true, "optional": true }, "process-nextick-args": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.0.tgz", - "integrity": "sha512-MtEC1TqN0EU5nephaJ4rAtThHtC86dNN9qCuEhtshvpVBkAW5ZO7BASN9REnF9eoXGcRub+pFuKEpOHE+HbEMw==", + "bundled": true, "dev": true, "optional": true }, "rc": { "version": "1.2.7", - "resolved": "https://registry.npmjs.org/rc/-/rc-1.2.7.tgz", - "integrity": "sha512-LdLD8xD4zzLsAT5xyushXDNscEjB7+2ulnl8+r1pnESlYtlJtVSoCMBGr30eDRJ3+2Gq89jK9P9e4tCEH1+ywA==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5720,8 +5701,7 @@ "dependencies": { "minimist": { "version": "1.2.0", - "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.0.tgz", - "integrity": "sha1-o1AIsg9BOD7sH7kU9M1d95omQoQ=", + 
"bundled": true, "dev": true, "optional": true } @@ -5729,8 +5709,7 @@ }, "readable-stream": { "version": "2.3.6", - "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.6.tgz", - "integrity": "sha512-tQtKA9WIAhBF3+VLAseyMqZeBjW0AHJoxOtYqSUZNJxauErmLbVm2FW1y+J/YA9dUrAC39ITejlZWhVIwawkKw==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5745,8 +5724,7 @@ }, "rimraf": { "version": "2.6.2", - "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-2.6.2.tgz", - "integrity": "sha512-lreewLK/BlghmxtfH36YYVg1i8IAce4TI7oao75I1g245+6BctqTVQiBP3YUJ9C6DQOXJmkYR9X9fCLtCOJc5w==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5755,49 +5733,42 @@ }, "safe-buffer": { "version": "5.1.1", - "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.1.tgz", - "integrity": "sha512-kKvNJn6Mm93gAczWVJg7wH+wGYWNrDHdWvpUmHyEsgCtIwwo3bqPtV4tR5tuPaUhTOo/kvhVwd8XwwOllGYkbg==", + "bundled": true, "dev": true }, "safer-buffer": { "version": "2.1.2", - "resolved": "https://registry.npmjs.org/safer-buffer/-/safer-buffer-2.1.2.tgz", - "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", + "bundled": true, "dev": true, "optional": true }, "sax": { "version": "1.2.4", - "resolved": "https://registry.npmjs.org/sax/-/sax-1.2.4.tgz", - "integrity": "sha512-NqVDv9TpANUjFm0N8uM5GxL36UgKi9/atZw+x7YFnQ8ckwFGKrl4xX4yWtrey3UJm5nP1kUbnYgLopqWNSRhWw==", + "bundled": true, "dev": true, "optional": true }, "semver": { "version": "5.5.0", - "resolved": "https://registry.npmjs.org/semver/-/semver-5.5.0.tgz", - "integrity": "sha512-4SJ3dm0WAwWy/NVeioZh5AntkdJoWKxHxcmyP622fOkgHa4z3R0TdBJICINyaSDE6uNwVc8gZr+ZinwZAH4xIA==", + "bundled": true, "dev": true, "optional": true }, "set-blocking": { "version": "2.0.0", - "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", - "integrity": "sha1-BF+XgtARrppoA93TgrJDkrPYkPc=", + "bundled": true, "dev": true, "optional": true }, "signal-exit": { "version": "3.0.2", - "resolved": "https://registry.npmjs.org/signal-exit/-/signal-exit-3.0.2.tgz", - "integrity": "sha1-tf3AjxKH6hF4Yo5BXiUTK3NkbG0=", + "bundled": true, "dev": true, "optional": true }, "string-width": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/string-width/-/string-width-1.0.2.tgz", - "integrity": "sha1-EYvfW4zcUaKn5w0hHgfisLmxB9M=", + "bundled": true, "dev": true, "requires": { "code-point-at": "^1.0.0", @@ -5807,8 +5778,7 @@ }, "string_decoder": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.1.1.tgz", - "integrity": "sha512-n/ShnvDi6FHbbVfviro+WojiFzv+s8MPMHBczVePfUpDJLwoLT0ht1l4YwBCbi8pJAveEEdnkHyPyTP/mzRfwg==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5817,8 +5787,7 @@ }, "strip-ansi": { "version": "3.0.1", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-3.0.1.tgz", - "integrity": "sha1-ajhfuIU9lS1f8F0Oiq+UJ43GPc8=", + "bundled": true, "dev": true, "requires": { "ansi-regex": "^2.0.0" @@ -5826,15 +5795,13 @@ }, "strip-json-comments": { "version": "2.0.1", - "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz", - "integrity": "sha1-PFMZQukIwml8DsNEhYwobHygpgo=", + "bundled": true, "dev": true, "optional": true }, "tar": { "version": "4.4.1", - "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.1.tgz", - "integrity": "sha512-O+v1r9yN4tOsvl90p5HAP4AEqbYhx4036AGMm075fH9F8Qwi3oJ+v4u50FkT/KkvywNGtwkk0zRI+8eYm1X/xg==", + 
"bundled": true, "dev": true, "optional": true, "requires": { @@ -5849,15 +5816,13 @@ }, "util-deprecate": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz", - "integrity": "sha1-RQ1Nyfpw3nMnYvvS1KKJgUGaDM8=", + "bundled": true, "dev": true, "optional": true }, "wide-align": { "version": "1.1.2", - "resolved": "https://registry.npmjs.org/wide-align/-/wide-align-1.1.2.tgz", - "integrity": "sha512-ijDLlyQ7s6x1JgCLur53osjm/UXUYD9+0PbYKrBsYisYXzCxN+HC3mYDNy/dWdmf3AwqwU3CXwDCvsNgGK1S0w==", + "bundled": true, "dev": true, "optional": true, "requires": { @@ -5866,14 +5831,12 @@ }, "wrappy": { "version": "1.0.2", - "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz", - "integrity": "sha1-tSQ9jz7BqjXxNkYFvA0QNuMKtp8=", + "bundled": true, "dev": true }, "yallist": { "version": "3.0.2", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.2.tgz", - "integrity": "sha1-hFK0u36Dx8GI2AQcGoN8dz1ti7k=", + "bundled": true, "dev": true } } @@ -5913,9 +5876,9 @@ } }, "genfun": { - "version": "4.0.1", - "resolved": "https://registry.npmjs.org/genfun/-/genfun-4.0.1.tgz", - "integrity": "sha1-7RAEHy5KfxsKOEZtF6XD4n3x38E=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/genfun/-/genfun-5.0.0.tgz", + "integrity": "sha512-KGDOARWVga7+rnB3z9Sd2Letx515owfk0hSxHGuqjANb1M+x2bGZGqHLiozPsYMdM2OubeMni/Hpwmjq6qIUhA==", "dev": true }, "get-caller-file": { @@ -5943,6 +5906,12 @@ "through2": "^2.0.0" }, "dependencies": { + "camelcase": { + "version": "2.1.1", + "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-2.1.1.tgz", + "integrity": "sha1-fB0W1nmhu+WcoCys7PsBHiAfWh8=", + "dev": true + }, "camelcase-keys": { "version": "2.1.0", "resolved": "http://registry.npmjs.org/camelcase-keys/-/camelcase-keys-2.1.0.tgz", @@ -6092,16 +6061,16 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } } }, "git-semver-tags": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/git-semver-tags/-/git-semver-tags-2.0.0.tgz", - "integrity": "sha512-lSgFc3zQTul31nFje2Q8XdNcTOI6B4I3mJRPCgFzHQQLfxfqdWTYzdtCaynkK5Xmb2wQlSJoKolhXJ1VhKROnQ==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/git-semver-tags/-/git-semver-tags-2.0.2.tgz", + "integrity": "sha512-34lMF7Yo1xEmsK2EkbArdoU79umpvm0MfzaDkSNYSJqtM5QLAVTPWgpiXSVI5o/O9EvZPSrP4Zvnec/CqhSd5w==", "dev": true, "requires": { "meow": "^4.0.0", @@ -6202,13 +6171,15 @@ "dev": true }, "glob-watcher": { - "version": "5.0.1", - "resolved": "https://registry.npmjs.org/glob-watcher/-/glob-watcher-5.0.1.tgz", - "integrity": "sha512-fK92r2COMC199WCyGUblrZKhjra3cyVMDiypDdqg1vsSDmexnbYivK1kNR4QItiNXLKmGlqan469ks67RtNa2g==", + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/glob-watcher/-/glob-watcher-5.0.3.tgz", + "integrity": "sha512-8tWsULNEPHKQ2MR4zXuzSmqbdyV5PtwwCaWSGQ1WwHsJ07ilNeN1JB8ntxhckbnpSHaf9dXFUHzIWvm1I13dsg==", "dev": true, "requires": { + "anymatch": "^2.0.0", "async-done": "^1.2.0", "chokidar": "^2.0.0", + "is-negated-glob": "^1.0.0", "just-debounce": "^1.0.0", "object.defaults": "^1.1.0" }, @@ -6610,7 +6581,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } @@ 
-7160,9 +7131,9 @@ } }, "hash.js": { - "version": "1.1.5", - "resolved": "https://registry.npmjs.org/hash.js/-/hash.js-1.1.5.tgz", - "integrity": "sha512-eWI5HG9Np+eHV1KQhisXWwM+4EPPYe5dFX1UZZH7k/E3JzDEazVH+VGlZi6R94ZqImq+A3D1mCEtrFIfg/E7sA==", + "version": "1.1.7", + "resolved": "https://registry.npmjs.org/hash.js/-/hash.js-1.1.7.tgz", + "integrity": "sha512-taOaskGt4z4SOANNseOviYDvjEJinIkRgmp7LbKP2YTTmVxWBl87s/uzK9r+44BclBSp2X7K1hqeNfz9JbBeXA==", "dev": true, "requires": { "inherits": "^2.0.3", @@ -7403,9 +7374,9 @@ } }, "inquirer": { - "version": "6.2.0", - "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-6.2.0.tgz", - "integrity": "sha512-QIEQG4YyQ2UYZGDC4srMZ7BjHOmNk1lR2JQj5UknBapklm6WHA+VVH7N+sUdX3A7NeCfGF8o4X1S3Ao7nAcIeg==", + "version": "6.2.1", + "resolved": "https://registry.npmjs.org/inquirer/-/inquirer-6.2.1.tgz", + "integrity": "sha512-088kl3DRT2dLU5riVMKKr1DlImd6X7smDhpXUCkJDCKvTEJeRiXh0G132HG9u5a+6Ylw9plFRY7RuTnwohYSpg==", "dev": true, "requires": { "ansi-escapes": "^3.0.0", @@ -7419,7 +7390,7 @@ "run-async": "^2.2.0", "rxjs": "^6.1.0", "string-width": "^2.1.0", - "strip-ansi": "^4.0.0", + "strip-ansi": "^5.0.0", "through": "^2.3.6" }, "dependencies": { @@ -7452,15 +7423,34 @@ "requires": { "is-fullwidth-code-point": "^2.0.0", "strip-ansi": "^4.0.0" + }, + "dependencies": { + "strip-ansi": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", + "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "dev": true, + "requires": { + "ansi-regex": "^3.0.0" + } + } } }, "strip-ansi": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-4.0.0.tgz", - "integrity": "sha1-qEeQIusaw2iocTibY1JixQXuNo8=", + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-5.0.0.tgz", + "integrity": "sha512-Uu7gQyZI7J7gn5qLn1Np3G9vcYGTVqB+lFTytnDJv83dd8T22aGH451P3jueT2/QemInJDfxHB5Tde5OzgG1Ow==", "dev": true, "requires": { - "ansi-regex": "^3.0.0" + "ansi-regex": "^4.0.0" + }, + "dependencies": { + "ansi-regex": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-4.0.0.tgz", + "integrity": "sha512-iB5Dda8t/UqpPI/IjsejXu5jOGDrzn41wJyljwPH65VCIbk6+1BzFIMJGFwTNrYXT1CrD+B4l19U7awiQ8rk7w==", + "dev": true + } } } } @@ -8211,7 +8201,7 @@ }, "jest-get-type": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-get-type/-/jest-get-type-22.4.3.tgz", "integrity": "sha512-/jsz0Y+V29w1chdXVygEKSz2nBoHoYqNShPe+QgxSNjAuP1i8+k4LbQNrfoliKej0P45sivkSCh7yiD6ubHS3w==", "dev": true }, @@ -8812,9 +8802,9 @@ } }, "libnpmaccess": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/libnpmaccess/-/libnpmaccess-3.0.0.tgz", - "integrity": "sha512-SiE4AZAzMpD7pmmXHfgD7rof8QIQGoKaeyAS8exgx2CKA6tzRTbRljq1xM4Tgj8/tIg+KBJPJWkR0ifqKT3irQ==", + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/libnpmaccess/-/libnpmaccess-3.0.1.tgz", + "integrity": "sha512-RlZ7PNarCBt+XbnP7R6PoVgOq9t+kou5rvhaInoNibhPO7eMlRfS0B8yjatgn2yaHIwWNyoJDolC/6Lc5L/IQA==", "dev": true, "requires": { "aproba": "^2.0.0", @@ -9407,7 +9397,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } @@ -9744,9 +9734,9 @@ } }, "map-age-cleaner": { - "version": "0.1.2", - "resolved": 
"https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.2.tgz", - "integrity": "sha512-UN1dNocxQq44IhJyMI4TU8phc2m9BddacHRPRjKGLYaF0jqd3xLz0jS0skpAU9WgYyoR4gHtUpzytNBS385FWQ==", + "version": "0.1.3", + "resolved": "https://registry.npmjs.org/map-age-cleaner/-/map-age-cleaner-0.1.3.tgz", + "integrity": "sha512-bJzx6nMoP6PDLPBFmg7+xRKeFZvFboMrGlxmNj9ClvX53KrmvM5bXFXEWjbz4cz1AFn+jWJ9z/DJSz7hrs0w3w==", "dev": true, "requires": { "p-defer": "^1.0.0" @@ -9764,12 +9754,6 @@ "integrity": "sha1-plzSkIepJZi4eRJXpSPgISIqwfk=", "dev": true }, - "map-stream": { - "version": "0.0.7", - "resolved": "https://registry.npmjs.org/map-stream/-/map-stream-0.0.7.tgz", - "integrity": "sha1-ih8HiW2CsQkmvTdEokIACfiJdKg=", - "dev": true - }, "map-visit": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/map-visit/-/map-visit-1.0.0.tgz", @@ -10209,9 +10193,9 @@ } }, "merge": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/merge/-/merge-1.2.0.tgz", - "integrity": "sha1-dTHjnUlJwoGma4xabgJl6LBYlNo=", + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/merge/-/merge-1.2.1.tgz", + "integrity": "sha512-VjFo4P5Whtj4vsLzsYBu5ayHhoHJ0UqNm7ibvShmbmoz7tGi0vXaoJbGdB+GmDMLUdg8DpQXEIeVDAe8MaABvQ==", "dev": true }, "merge-stream": { @@ -10261,18 +10245,18 @@ } }, "mime-db": { - "version": "1.36.0", - "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.36.0.tgz", - "integrity": "sha512-L+xvyD9MkoYMXb1jAmzI/lWYAxAMCPvIBSWur0PZ5nOf5euahRLVqH//FKW9mWp2lkqUgYiXPgkzfMUFi4zVDw==", + "version": "1.37.0", + "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.37.0.tgz", + "integrity": "sha512-R3C4db6bgQhlIhPU48fUtdVmKnflq+hRdad7IyKhtFj06VPNVdk2RhiYL3UjQIlso8L+YxAtFkobT0VK+S/ybg==", "dev": true }, "mime-types": { - "version": "2.1.20", - "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.20.tgz", - "integrity": "sha512-HrkrPaP9vGuWbLK1B1FfgAkbqNjIuy4eHlIYnFi7kamZyLLrGlo2mpcx0bBmNpKqBtYtAfGbodDddIgddSJC2A==", + "version": "2.1.21", + "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.21.tgz", + "integrity": "sha512-3iL6DbwpyLzjR3xHSFNFeb9Nz/M8WDkX33t1GFQnFOllWk8pOrh/LSrB5OXlnlW5P9LH73X6loW/eogc+F5lJg==", "dev": true, "requires": { - "mime-db": "~1.36.0" + "mime-db": "~1.37.0" } }, "mimic-fn": { @@ -10319,9 +10303,9 @@ } }, "minipass": { - "version": "2.3.4", - "resolved": "https://registry.npmjs.org/minipass/-/minipass-2.3.4.tgz", - "integrity": "sha512-mlouk1OHlaUE8Odt1drMtG1bAJA4ZA6B/ehysgV0LUIrDHdKgo1KorZq3pK0b/7Z7LJIQ12MNM6aC+Tn6lUZ5w==", + "version": "2.3.5", + "resolved": "https://registry.npmjs.org/minipass/-/minipass-2.3.5.tgz", + "integrity": "sha512-Gi1W4k059gyRbyVUZQ4mEqLm0YIUiGYfvxhF6SIlk3ui1WVxMTGfGdQ2SInh3PDrRTVvPKgULkpJtT4RH10+VA==", "dev": true, "requires": { "safe-buffer": "^5.1.2", @@ -10329,9 +10313,9 @@ }, "dependencies": { "yallist": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.2.tgz", - "integrity": "sha1-hFK0u36Dx8GI2AQcGoN8dz1ti7k=", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", "dev": true } } @@ -10426,7 +10410,7 @@ }, "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } @@ -10632,13 +10616,13 @@ } }, "node-notifier": { - "version": "5.2.1", - 
"resolved": "https://registry.npmjs.org/node-notifier/-/node-notifier-5.2.1.tgz", - "integrity": "sha512-MIBs+AAd6dJ2SklbbE8RUDRlIVhU8MaNLh1A9SUZDUHPiZkWLFde6UNwG41yQHZEToHgJMXqyVZ9UcS/ReOVTg==", + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/node-notifier/-/node-notifier-5.3.0.tgz", + "integrity": "sha512-AhENzCSGZnZJgBARsUjnQ7DnZbzyP+HxlVXuD0xqAnvL8q+OqtSX7lGg9e8nHzwXkMMXNdVeqq4E2M3EUAqX6Q==", "dev": true, "requires": { "growly": "^1.3.0", - "semver": "^5.4.1", + "semver": "^5.5.0", "shellwords": "^0.1.1", "which": "^1.3.0" } @@ -10744,11 +10728,12 @@ } }, "npm-pick-manifest": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/npm-pick-manifest/-/npm-pick-manifest-2.1.0.tgz", - "integrity": "sha512-q9zLP8cTr8xKPmMZN3naxp1k/NxVFsjxN6uWuO1tiw9gxg7wZWQ/b5UTfzD0ANw2q1lQxdLKTeCCksq+bPSgbQ==", + "version": "2.2.3", + "resolved": "https://registry.npmjs.org/npm-pick-manifest/-/npm-pick-manifest-2.2.3.tgz", + "integrity": "sha512-+IluBC5K201+gRU85vFlUwX3PFShZAbAgDNp2ewJdWMVSppdo/Zih0ul2Ecky/X7b51J7LrrUAP+XOmOCvYZqA==", "dev": true, "requires": { + "figgy-pudding": "^3.5.1", "npm-package-arg": "^6.0.0", "semver": "^5.4.1" } @@ -10768,17 +10753,17 @@ } }, "npm-run-all": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/npm-run-all/-/npm-run-all-4.1.3.tgz", - "integrity": "sha512-aOG0N3Eo/WW+q6sUIdzcV2COS8VnTZCmdji0VQIAZF3b+a3YWb0AD0vFIyjKec18A7beLGbaQ5jFTNI2bPt9Cg==", + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/npm-run-all/-/npm-run-all-4.1.5.tgz", + "integrity": "sha512-Oo82gJDAVcaMdi3nuoKFavkIHBRVqQ1qvMb+9LHk/cF4P6B2m8aP04hGf7oL6wZ9BuGwX1onlLhpuoofSyoQDQ==", "dev": true, "requires": { - "ansi-styles": "^3.2.0", - "chalk": "^2.1.0", - "cross-spawn": "^6.0.4", + "ansi-styles": "^3.2.1", + "chalk": "^2.4.1", + "cross-spawn": "^6.0.5", "memorystream": "^0.3.1", "minimatch": "^3.0.4", - "ps-tree": "^1.1.0", + "pidtree": "^0.3.0", "read-pkg": "^3.0.0", "shell-quote": "^1.6.1", "string.prototype.padend": "^3.0.0" @@ -11251,67 +11236,76 @@ } }, "pacote": { - "version": "9.1.0", - "resolved": "https://registry.npmjs.org/pacote/-/pacote-9.1.0.tgz", - "integrity": "sha512-AFXaSWhOtQf3jHqEvg+ZYH/dfT8TKq6TKspJ4qEFwVVuh5aGvMIk6SNF8vqfzz+cBceDIs9drOcpBbrPai7i+g==", + "version": "9.2.3", + "resolved": "https://registry.npmjs.org/pacote/-/pacote-9.2.3.tgz", + "integrity": "sha512-Y3+yY3nBRAxMlZWvr62XLJxOwCmG9UmkGZkFurWHoCjqF0cZL72cTOCRJTvWw8T4OhJS2RTg13x4oYYriauvEw==", "dev": true, "requires": { - "bluebird": "^3.5.1", - "cacache": "^11.0.2", - "figgy-pudding": "^3.2.1", - "get-stream": "^3.0.0", - "glob": "^7.1.2", + "bluebird": "^3.5.2", + "cacache": "^11.2.0", + "figgy-pudding": "^3.5.1", + "get-stream": "^4.1.0", + "glob": "^7.1.3", "lru-cache": "^4.1.3", "make-fetch-happen": "^4.0.1", "minimatch": "^3.0.4", - "minipass": "^2.3.3", + "minipass": "^2.3.5", "mississippi": "^3.0.0", "mkdirp": "^0.5.1", "normalize-package-data": "^2.4.0", "npm-package-arg": "^6.1.0", - "npm-packlist": "^1.1.10", - "npm-pick-manifest": "^2.1.0", - "npm-registry-fetch": "^3.0.0", + "npm-packlist": "^1.1.12", + "npm-pick-manifest": "^2.2.3", + "npm-registry-fetch": "^3.8.0", "osenv": "^0.1.5", "promise-inflight": "^1.0.1", "promise-retry": "^1.1.1", - "protoduck": "^5.0.0", + "protoduck": "^5.0.1", "rimraf": "^2.6.2", "safe-buffer": "^5.1.2", - "semver": "^5.5.0", - "ssri": "^6.0.0", - "tar": "^4.4.3", - "unique-filename": "^1.1.0", - "which": "^1.3.0" + "semver": "^5.6.0", + "ssri": "^6.0.1", + "tar": "^4.4.6", + "unique-filename": "^1.1.1", + "which": "^1.3.1" }, 
"dependencies": { + "get-stream": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-4.1.0.tgz", + "integrity": "sha512-GMat4EJ5161kIy2HevLlr4luNjBgvmj413KaQA7jt4V8B4RDsfpHk7WQ9GVqfYyyx8OS/L66Kox+rJRNklLK7w==", + "dev": true, + "requires": { + "pump": "^3.0.0" + } + }, "tar": { - "version": "4.4.6", - "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.6.tgz", - "integrity": "sha512-tMkTnh9EdzxyfW+6GK6fCahagXsnYk6kE6S9Gr9pjVdys769+laCTbodXDhPAjzVtEBazRgP0gYqOjnk9dQzLg==", + "version": "4.4.8", + "resolved": "https://registry.npmjs.org/tar/-/tar-4.4.8.tgz", + "integrity": "sha512-LzHF64s5chPQQS0IYBn9IN5h3i98c12bo4NCO7e0sGM2llXQ3p2FGC5sdENN4cTW48O915Sh+x+EXx7XW96xYQ==", "dev": true, "requires": { - "chownr": "^1.0.1", + "chownr": "^1.1.1", "fs-minipass": "^1.2.5", - "minipass": "^2.3.3", - "minizlib": "^1.1.0", + "minipass": "^2.3.4", + "minizlib": "^1.1.1", "mkdirp": "^0.5.0", "safe-buffer": "^5.1.2", "yallist": "^3.0.2" } }, "yallist": { - "version": "3.0.2", - "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.2.tgz", - "integrity": "sha1-hFK0u36Dx8GI2AQcGoN8dz1ti7k=", + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/yallist/-/yallist-3.0.3.tgz", + "integrity": "sha512-S+Zk8DEWE6oKpV+vI3qWkaK+jSbIK86pCwe2IF/xwIpQ8jEuxpw9NyaGjmp9+BoJv5FV2piqCDcoCtStppiq2A==", "dev": true } } }, "pako": { - "version": "1.0.6", - "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.6.tgz", - "integrity": "sha512-lQe48YPsMJAig+yngZ87Lus+NF+3mtu7DVOBu6b/gHO1YpKwIj5AWjZ/TOS7i46HD/UixzWb1zeWDZfGZ3iYcg==", + "version": "1.0.7", + "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.7.tgz", + "integrity": "sha512-3HNK5tW4x8o5mO8RuHZp3Ydw9icZXx0RANAOMzlMzx7LVXhMJ4mo3MOBpzyd7r/+RUu8BmndP47LXT+vzjtWcQ==", "dev": true }, "parallel-transform": { @@ -11464,21 +11458,12 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true } } }, - "pause-stream": { - "version": "0.0.11", - "resolved": "http://registry.npmjs.org/pause-stream/-/pause-stream-0.0.11.tgz", - "integrity": "sha1-/lo0sMvOErWqaitAPuLnO2AvFEU=", - "dev": true, - "requires": { - "through": "~2.3" - } - }, "pbkdf2": { "version": "3.0.17", "resolved": "https://registry.npmjs.org/pbkdf2/-/pbkdf2-3.0.17.tgz", @@ -11498,6 +11483,12 @@ "integrity": "sha1-Ywn04OX6kT7BxpMHrjZLSzd8nns=", "dev": true }, + "pidtree": { + "version": "0.3.0", + "resolved": "https://registry.npmjs.org/pidtree/-/pidtree-0.3.0.tgz", + "integrity": "sha512-9CT4NFlDcosssyg8KVFltgokyKZIFjoBxw8CTGy+5F38Y1eQWrt8tRayiUOXE+zVKQnYu5BR8JjCtvK3BcnBhg==", + "dev": true + }, "pify": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/pify/-/pify-3.0.0.tgz", @@ -11639,9 +11630,9 @@ "dev": true }, "progress": { - "version": "2.0.1", - "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.1.tgz", - "integrity": "sha512-OE+a6vzqazc+K6LxJrX5UPyKFvGnL5CYmq2jFGNIBWHpc4QyE49/YOumcrpQFJpfejmvRtbJzgO1zPmMCqlbBg==", + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.2.tgz", + "integrity": "sha512-/OLz5F9beZUWwSHZDreXgap1XShX6W+DCHQCqwCF7uZ88s6uTlD2cR3JBE77SegCmNtb1Idst+NfmwcdU6KVhw==", "dev": true }, "promise": { @@ -11695,12 +11686,12 @@ "dev": true }, "protoduck": { - "version": "5.0.0", - "resolved": "https://registry.npmjs.org/protoduck/-/protoduck-5.0.0.tgz", - "integrity": 
"sha512-agsGWD8/RZrS4ga6v82Fxb0RHIS2RZnbsSue6A9/MBRhB/jcqOANAMNrqM9900b8duj+Gx+T/JMy5IowDoO/hQ==", + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/protoduck/-/protoduck-5.0.1.tgz", + "integrity": "sha512-WxoCeDCoCBY55BMvj4cAEjdVUFGRWed9ZxPlqTKYyw1nDDTQ4pqmnIMAGfJlg7Dx35uB/M+PHJPTmGOvaCaPTg==", "dev": true, "requires": { - "genfun": "^4.0.1" + "genfun": "^5.0.0" } }, "prr": { @@ -11709,15 +11700,6 @@ "integrity": "sha1-0/wRS6BplaRexok/SEzrHXj19HY=", "dev": true }, - "ps-tree": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/ps-tree/-/ps-tree-1.1.0.tgz", - "integrity": "sha1-tCGyQUDWID8e08dplrRCewjowBQ=", - "dev": true, - "requires": { - "event-stream": "~3.3.0" - } - }, "pseudomap": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", @@ -11814,9 +11796,9 @@ "dev": true }, "randomatic": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/randomatic/-/randomatic-3.1.0.tgz", - "integrity": "sha512-KnGPVE0lo2WoXxIZ7cPR8YBpiol4gsSuOwDSg410oHh80ZMp5EiypNqL2K4Z77vJn6lB5rap7IkAmcUlalcnBQ==", + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/randomatic/-/randomatic-3.1.1.tgz", + "integrity": "sha512-TuDE5KxZ0J461RVjrJZCJc+J+zCkTb1MbH9AQUq68sMhOMcy9jLcb3BrZKgp9q9Ncltdg4QVqWrH02W2EFFVYw==", "dev": true, "requires": { "is-number": "^4.0.0", @@ -12618,7 +12600,7 @@ }, "safe-regex": { "version": "1.1.0", - "resolved": "https://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", + "resolved": "http://registry.npmjs.org/safe-regex/-/safe-regex-1.1.0.tgz", "integrity": "sha1-QKNmnzsHfR6UPURinhV91IAjvy4=", "dev": true, "requires": { @@ -13249,9 +13231,9 @@ } }, "socks": { - "version": "2.2.1", - "resolved": "https://registry.npmjs.org/socks/-/socks-2.2.1.tgz", - "integrity": "sha512-0GabKw7n9mI46vcNrVfs0o6XzWzjVa3h6GaSo2UPxtWAROXUWavfJWh1M4PR5tnE0dcnQXZIDFP4yrAysLze/w==", + "version": "2.2.2", + "resolved": "https://registry.npmjs.org/socks/-/socks-2.2.2.tgz", + "integrity": "sha512-g6wjBnnMOZpE0ym6e0uHSddz9p3a+WsBaaYQaBaSCJYvrC4IXykQR9MNGjLQf38e9iIIhp3b1/Zk8YZI3KGJ0Q==", "dev": true, "requires": { "ip": "^1.1.5", @@ -13360,9 +13342,9 @@ } }, "spdx-license-ids": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.1.tgz", - "integrity": "sha512-TfOfPcYGBB5sDuPn3deByxPhmfegAhpDYKSOXZQN81Oyrrif8ZCodOLzK3AesELnCx03kikhyDwh0pfvvQvF8w==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/spdx-license-ids/-/spdx-license-ids-3.0.2.tgz", + "integrity": "sha512-qky9CVt0lVIECkEsYbNILVnPvycuEBkXoMFLRWsREkomQLevYhtRKC+R91a5TOAQ3bCMjikRwhyaRqj1VYatYg==", "dev": true }, "split": { @@ -13399,9 +13381,9 @@ "dev": true }, "sshpk": { - "version": "1.15.1", - "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.1.tgz", - "integrity": "sha512-mSdgNUaidk+dRU5MhYtN9zebdzF2iG0cNPWy8HG+W8y+fT1JnSkh0fzzpjOa0L7P8i1Rscz38t0h4gPcKz43xA==", + "version": "1.15.2", + "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.15.2.tgz", + "integrity": "sha512-Ra/OXQtuh0/enyl4ETZAfTaeksa6BXks5ZcjpSUNrjBr0DvrJKX+1fsKDPpT9TBXgHAFsa4510aNVgI8g/+SzA==", "dev": true, "requires": { "asn1": "~0.2.3", @@ -13438,7 +13420,7 @@ }, "staged-git-files": { "version": "1.1.1", - "resolved": "https://registry.npmjs.org/staged-git-files/-/staged-git-files-1.1.1.tgz", + "resolved": "http://registry.npmjs.org/staged-git-files/-/staged-git-files-1.1.1.tgz", "integrity": "sha512-H89UNKr1rQJvI1c/PIR3kiAMBV23yvR7LItZiV74HWZwzt7f3YHuujJ9nJZlt58WlFox7XQsOahexwk7nTe69A==", "dev": true }, @@ 
-13479,16 +13461,6 @@ "readable-stream": "^2.0.2" } }, - "stream-combiner": { - "version": "0.2.2", - "resolved": "http://registry.npmjs.org/stream-combiner/-/stream-combiner-0.2.2.tgz", - "integrity": "sha1-rsjLrBd7Vrb0+kec7YwZEs7lKFg=", - "dev": true, - "requires": { - "duplexer": "~0.1.1", - "through": "~2.3.4" - } - }, "stream-each": { "version": "1.2.3", "resolved": "https://registry.npmjs.org/stream-each/-/stream-each-1.2.3.tgz", @@ -13625,7 +13597,7 @@ }, "strip-eof": { "version": "1.0.0", - "resolved": "https://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", + "resolved": "http://registry.npmjs.org/strip-eof/-/strip-eof-1.0.0.tgz", "integrity": "sha1-u0P/VZim6wXYm1n80SnJgzE2Br8=", "dev": true }, @@ -13699,9 +13671,9 @@ } }, "tapable": { - "version": "1.1.0", - "resolved": "https://registry.npmjs.org/tapable/-/tapable-1.1.0.tgz", - "integrity": "sha512-IlqtmLVaZA2qab8epUXbVWRn3aB1imbDMJtjB3nu4X0NqPkcY/JH9ZtCBWKHWPxs8Svi9tyo8w2dBoi07qZbBA==", + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/tapable/-/tapable-1.1.1.tgz", + "integrity": "sha512-9I2ydhj8Z9veORCw5PRm4u9uebCn0mcCa6scWoNcbZ6dAtoo2618u9UUzxgmsCOreJpqDDuv61LvwofW7hLcBA==", "dev": true }, "tar": { @@ -13736,9 +13708,9 @@ } }, "terser": { - "version": "3.10.1", - "resolved": "https://registry.npmjs.org/terser/-/terser-3.10.1.tgz", - "integrity": "sha512-GE0ShECt1/dZUZt9Kyr/IC6xXG46pTbm1C1WfzQbbnRB5LhdJlF8p5NBZ38RjspD7hEM9O5ud8aIcOFY6evl4A==", + "version": "3.10.8", + "resolved": "https://registry.npmjs.org/terser/-/terser-3.10.8.tgz", + "integrity": "sha512-GQJHWJ/vbx0EgRk+lBMONMmKaT+ifeo/XgT/hi3KpzEEFOERVyFuJSVXH8grcmJjiqKY35ds8rBCxvABUeyyuQ==", "dev": true, "requires": { "commander": "~2.17.1", @@ -14182,7 +14154,7 @@ }, "jest-environment-jsdom": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-environment-jsdom/-/jest-environment-jsdom-22.4.3.tgz", "integrity": "sha512-FviwfR+VyT3Datf13+ULjIMO5CSeajlayhhYQwpzgunswoaLIPutdbrnfUHEMyJCwvqQFaVtTmn9+Y8WCt6n1w==", "dev": true, "requires": { @@ -14193,7 +14165,7 @@ }, "jest-environment-node": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-environment-node/-/jest-environment-node-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-environment-node/-/jest-environment-node-22.4.3.tgz", "integrity": "sha512-reZl8XF6t/lMEuPWwo9OLfttyC26A5AMgDyEQ6DBgZuyfyeNUzYT8BFo6uxCCP/Av/b7eb9fTi3sIHFPBzmlRA==", "dev": true, "requires": { @@ -14233,7 +14205,7 @@ }, "jest-message-util": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-message-util/-/jest-message-util-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-message-util/-/jest-message-util-22.4.3.tgz", "integrity": "sha512-iAMeKxhB3Se5xkSjU0NndLLCHtP4n+GtCqV0bISKA5dmOXQfEbdEmYiu2qpnWBDCQdEafNDDU6Q+l6oBMd/+BA==", "dev": true, "requires": { @@ -14246,13 +14218,13 @@ }, "jest-mock": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-mock/-/jest-mock-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-mock/-/jest-mock-22.4.3.tgz", "integrity": "sha512-+4R6mH5M1G4NK16CKg9N1DtCaFmuxhcIqF4lQK/Q1CIotqMs/XBemfpDPeVZBFow6iyUNu6EBT9ugdNOTT5o5Q==", "dev": true }, "jest-regex-util": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-regex-util/-/jest-regex-util-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-regex-util/-/jest-regex-util-22.4.3.tgz", "integrity": 
"sha512-LFg1gWr3QinIjb8j833bq7jtQopiwdAs67OGfkPrvy7uNUbVMfTXXcOKXJaeY5GgjobELkKvKENqq1xrUectWg==", "dev": true }, @@ -14282,7 +14254,7 @@ }, "jest-util": { "version": "22.4.3", - "resolved": "https://registry.npmjs.org/jest-util/-/jest-util-22.4.3.tgz", + "resolved": "http://registry.npmjs.org/jest-util/-/jest-util-22.4.3.tgz", "integrity": "sha512-rfDfG8wyC5pDPNdcnAlZgwKnzHvZDu8Td2NJI/jAGKEGxJPYiE4F0ss/gSAkG4778Y23Hvbz+0GMrDJTeo7RjQ==", "dev": true, "requires": { @@ -14569,9 +14541,9 @@ }, "dependencies": { "ajv": { - "version": "6.5.4", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.5.4.tgz", - "integrity": "sha512-4Wyjt8+t6YszqaXnLDfMmG/8AlO5Zbcsy3ATHncCzjW/NoPzAId8AK6749Ybjmdt+kUY1gP60fCu46oDxPv/mg==", + "version": "6.6.1", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.1.tgz", + "integrity": "sha512-ZoJjft5B+EJBjUyu9C9Hc0OZyPZSSlOF+plzouTrg6UlA8f+e/n8NIgBFG/9tppJtpPWfthHakK7juJdNDODww==", "dev": true, "requires": { "fast-deep-equal": "^2.0.1", @@ -15488,9 +15460,9 @@ }, "dependencies": { "ajv": { - "version": "6.5.4", - "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.5.4.tgz", - "integrity": "sha512-4Wyjt8+t6YszqaXnLDfMmG/8AlO5Zbcsy3ATHncCzjW/NoPzAId8AK6749Ybjmdt+kUY1gP60fCu46oDxPv/mg==", + "version": "6.6.1", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.6.1.tgz", + "integrity": "sha512-ZoJjft5B+EJBjUyu9C9Hc0OZyPZSSlOF+plzouTrg6UlA8f+e/n8NIgBFG/9tppJtpPWfthHakK7juJdNDODww==", "dev": true, "requires": { "fast-deep-equal": "^2.0.1", @@ -15981,7 +15953,7 @@ "dependencies": { "pify": { "version": "2.3.0", - "resolved": "https://registry.npmjs.org/pify/-/pify-2.3.0.tgz", + "resolved": "http://registry.npmjs.org/pify/-/pify-2.3.0.tgz", "integrity": "sha1-7RQaasBDqEnqWISY59yosVMw6Qw=", "dev": true }, @@ -16018,12 +15990,6 @@ "integrity": "sha1-Ey7mPS7FVlxVfiD0wi35rKaGsQ0=", "dev": true }, - "xregexp": { - "version": "4.0.0", - "resolved": "https://registry.npmjs.org/xregexp/-/xregexp-4.0.0.tgz", - "integrity": "sha512-PHyM+sQouu7xspQQwELlGwwd05mXUFqwFYfqPO0cC7x4fxyHnnuetmQr6CjJiafIDoH4MogHb9dOoJzR/Y4rFg==", - "dev": true - }, "xtend": { "version": "4.0.1", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.1.tgz", @@ -16063,12 +16029,6 @@ "yargs-parser": "^5.0.0" }, "dependencies": { - "camelcase": { - "version": "3.0.0", - "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-3.0.0.tgz", - "integrity": "sha1-MvxLn82vhF/N9+c7uXysImHwqwo=", - "dev": true - }, "which-module": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/which-module/-/which-module-1.0.0.tgz", diff --git a/js/package.json b/js/package.json index b3f3f162ad0f1..9f76819c2e1fd 100644 --- a/js/package.json +++ b/js/package.json @@ -85,7 +85,7 @@ "lint-staged": "7.3.0", "merge2": "1.2.3", "mkdirp": "0.5.1", - "npm-run-all": "4.1.3", + "npm-run-all": "4.1.5", "pump": "3.0.0", "rimraf": "2.6.2", "rxjs": "5.5.11", @@ -101,6 +101,9 @@ "webpack": "4.23.1", "xml2js": "0.4.19" }, + "engines": { + "node": ">=10.0" + }, "@std/esm": { "warnings": false }, diff --git a/js/src/Arrow.externs.js b/js/src/Arrow.externs.js index f01ea5cedc406..7ad066585712e 100644 --- a/js/src/Arrow.externs.js +++ b/js/src/Arrow.externs.js @@ -32,6 +32,8 @@ var Table = function() {}; /** @type {?} */ Table.from = function() {}; /** @type {?} */ +Table.fromVectors = function() {}; +/** @type {?} */ Table.fromAsync = function() {}; /** @type {?} */ Table.fromStruct = function() {}; diff --git a/js/src/Arrow.ts b/js/src/Arrow.ts index b1f4a3a4fa9bd..c76578b62996d 100644 --- a/js/src/Arrow.ts +++ 
b/js/src/Arrow.ts
@@ -238,6 +238,7 @@ try {
 // set them via string indexers to save them from the mangler
 Schema['from'] = Schema.from;
 Table['from'] = Table.from;
+Table['fromVectors'] = Table.fromVectors;
 Table['fromAsync'] = Table.fromAsync;
 Table['fromStruct'] = Table.fromStruct;
 Table['empty'] = Table.empty;
diff --git a/js/src/recordbatch.ts b/js/src/recordbatch.ts
index 91ea5cfbffd95..cfc236dffce36 100644
--- a/js/src/recordbatch.ts
+++ b/js/src/recordbatch.ts
@@ -25,8 +25,8 @@ import { valueToString, leftPad } from './util/pretty';
 import Long = flatbuffers.Long;
 export class RecordBatch extends StructVector {
-    public static from(vectors: Vector[]) {
-        return new RecordBatch(Schema.from(vectors),
+    public static from(vectors: Vector[], names?: string[]) {
+        return new RecordBatch(Schema.from(vectors, names),
             Math.max(...vectors.map((v) => v.length)),
             vectors
         );
diff --git a/js/src/table.ts b/js/src/table.ts
index 634092f0e4bc9..3559cd84ff372 100644
--- a/js/src/table.ts
+++ b/js/src/table.ts
@@ -38,6 +38,9 @@ export interface DataFrame {
 export class Table implements DataFrame {
     static empty() { return new Table(new Schema([]), []); }
+    static fromVectors(vectors: Vector[], names?: string[]) {
+        return new Table([RecordBatch.from(vectors, names)])
+    }
     static from(sources?: Iterable | object | string) {
         if (sources) {
             let schema: Schema | undefined;
@@ -199,6 +202,10 @@ export class Table implements DataFrame {
     }
 }
+// protect batches, batchesUnion from es2015/umd mangler
+(<any> Table.prototype).batches = Object.freeze([]);
+(<any> Table.prototype).batchesUnion = Object.freeze([]);
+
 class FilteredDataFrame implements DataFrame {
     private predicate: Predicate;
     private batches: RecordBatch[];
diff --git a/js/src/type.ts b/js/src/type.ts
index 811086c9382b9..3f75903593134 100644
--- a/js/src/type.ts
+++ b/js/src/type.ts
@@ -47,8 +47,8 @@ function generateDictionaryMap(fields: Field[]) {
 }
 export class Schema {
-    public static from(vectors: Vector[]) {
-        return new Schema(vectors.map((v, i) => new Field('' + i, v.type)));
+    public static from(vectors: Vector[], names?: string[]) {
+        return new Schema(vectors.map((v, i) => new Field(names ? names[i] : ('' + i), v.type)));
     }
     // @ts-ignore
     protected _bodyLength: number;
diff --git a/js/test/unit/table-tests.ts b/js/test/unit/table-tests.ts
index 4ee1411d0f493..3a90a0d05e7e4 100644
--- a/js/test/unit/table-tests.ts
+++ b/js/test/unit/table-tests.ts
@@ -16,13 +16,20 @@
 // under the License.
import '../jest-extensions'; +import { TextEncoder } from 'text-encoding-utf-8'; import Arrow, { vector, RecordBatch } from '../Arrow'; const { predicate, Table } = Arrow; +const { DictionaryVector, IntVector, FloatVector, Utf8Vector } = Arrow.vector; +const { Dictionary, Utf8, Int } = Arrow.type; + const { col, lit, custom, and, or, And, Or } = predicate; +const utf8Encoder = new TextEncoder('utf-8'); + +const NAMES = ['f32', 'i32', 'dictionary']; const F32 = 0, I32 = 1, DICT = 2; const test_data = [ { @@ -336,338 +343,69 @@ function leftPad(str: string, fill: string, n: number) { return (new Array(n + 1).join(fill) + str).slice(-1 * n); } +function makeUtf8Vector(values) { + const n = values.length; + let offset = 0; + const offsets = Uint32Array.of(0, ...values.map((d) => { offset += d.length; return offset; })); + return new Utf8Vector(new Arrow.data.FlatListData(new Utf8(), n, null, offsets, utf8Encoder.encode(values.join('')))); +} + +function getTestVectors(f32Values, i32Values, dictionaryValues) { + const f32Vec = FloatVector.from( + Float32Array.from(f32Values) + ); + + const i32Vec = IntVector.from( + Int32Array.from(i32Values) + ); + + const dictionaryVec = new DictionaryVector( + new Arrow.data.DictionaryData( + new Dictionary(new Utf8(), new Int(true, 8)), + makeUtf8Vector(['a', 'b', 'c']), + IntVector.from(Int8Array.from(dictionaryValues)).data + ) + ); + + return [f32Vec, i32Vec, dictionaryVec]; +} + export function getSingleRecordBatchTable() { - return Table.from({ - 'schema': { - 'fields': [ - { - 'name': 'f32', - 'type': { - 'name': 'floatingpoint', - 'precision': 'SINGLE' - }, - 'nullable': false, - 'children': [], - }, - { - 'name': 'i32', - 'type': { - 'name': 'int', - 'isSigned': true, - 'bitWidth': 32 - }, - 'nullable': false, - 'children': [], - }, - { - 'name': 'dictionary', - 'type': { - 'name': 'utf8' - }, - 'nullable': false, - 'children': [], - 'dictionary': { - 'id': 0, - 'indexType': { - 'name': 'int', - 'isSigned': true, - 'bitWidth': 8 - }, - 'isOrdered': false - } - } - ] - }, - 'dictionaries': [{ - 'id': 0, - 'data': { - 'count': 3, - 'columns': [ - { - 'name': 'DICT0', - 'count': 3, - 'VALIDITY': [], - 'OFFSET': [ - 0, - 1, - 2, - 3 - ], - 'DATA': [ - 'a', - 'b', - 'c', - ] - } - ] - } - }], - 'batches': [{ - 'count': 7, - 'columns': [ - { - 'name': 'f32', - 'count': 7, - 'VALIDITY': [], - 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] - }, - { - 'name': 'i32', - 'count': 7, - 'VALIDITY': [], - 'DATA': [-1, 1, -1, 1, -1, 1, -1] - }, - { - 'name': 'dictionary', - 'count': 7, - 'VALIDITY': [], - 'DATA': [0, 1, 2, 0, 1, 2, 0] - } - ] - }] - }); + const vectors = getTestVectors( + [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3], + [-1, 1, -1, 1, -1, 1, -1], + [0, 1, 2, 0, 1, 2, 0] + ); + + return Table.fromVectors( + vectors, + NAMES + ); } function getMultipleRecordBatchesTable() { - return Table.from({ - 'schema': { - 'fields': [ - { - 'name': 'f32', - 'type': { - 'name': 'floatingpoint', - 'precision': 'SINGLE' - }, - 'nullable': false, - 'children': [], - }, - { - 'name': 'i32', - 'type': { - 'name': 'int', - 'isSigned': true, - 'bitWidth': 32 - }, - 'nullable': false, - 'children': [], - }, - { - 'name': 'dictionary', - 'type': { - 'name': 'utf8' - }, - 'nullable': false, - 'children': [], - 'dictionary': { - 'id': 0, - 'indexType': { - 'name': 'int', - 'isSigned': true, - 'bitWidth': 8 - }, - 'isOrdered': false - } - } - ] - }, - 'dictionaries': [{ - 'id': 0, - 'data': { - 'count': 3, - 'columns': [ - { - 'name': 'DICT0', - 'count': 3, - 'VALIDITY': [], - 
'OFFSET': [ - 0, - 1, - 2, - 3 - ], - 'DATA': [ - 'a', - 'b', - 'c', - ] - } - ] - } - }], - 'batches': [{ - 'count': 3, - 'columns': [ - { - 'name': 'f32', - 'count': 3, - 'VALIDITY': [], - 'DATA': [-0.3, -0.2, -0.1] - }, - { - 'name': 'i32', - 'count': 3, - 'VALIDITY': [], - 'DATA': [-1, 1, -1] - }, - { - 'name': 'dictionary', - 'count': 3, - 'VALIDITY': [], - 'DATA': [0, 1, 2] - } - ] - }, { - 'count': 3, - 'columns': [ - { - 'name': 'f32', - 'count': 3, - 'VALIDITY': [], - 'DATA': [0, 0.1, 0.2] - }, - { - 'name': 'i32', - 'count': 3, - 'VALIDITY': [], - 'DATA': [1, -1, 1] - }, - { - 'name': 'dictionary', - 'count': 3, - 'VALIDITY': [], - 'DATA': [0, 1, 2] - } - ] - }, { - 'count': 3, - 'columns': [ - { - 'name': 'f32', - 'count': 3, - 'VALIDITY': [], - 'DATA': [0.3, 0.2, 0.1] - }, - { - 'name': 'i32', - 'count': 3, - 'VALIDITY': [], - 'DATA': [-1, 1, -1] - }, - { - 'name': 'dictionary', - 'count': 3, - 'VALIDITY': [], - 'DATA': [0, 1, 2] - } - ] - }] - }); + const b1 = Arrow.RecordBatch.from(getTestVectors( + [-0.3, -0.2, -0.1], + [-1, 1, -1], + [0, 1, 2] + ), NAMES); + + const b2 = Arrow.RecordBatch.from(getTestVectors( + [0, 0.1, 0.2], + [1, -1, 1], + [0, 1, 2] + ), NAMES); + + const b3 = Arrow.RecordBatch.from(getTestVectors( + [0.3, 0.2, 0.1], + [-1, 1, -1], + [0, 1, 2] + ), NAMES); + + return new Table([b1, b2, b3]) } function getStructTable() { - return Table.from({ - 'schema': { - 'fields': [ - { - 'name': 'struct', - 'type': { - 'name': 'struct' - }, - 'nullable': false, - 'children': [ - { - 'name': 'f32', - 'type': { - 'name': 'floatingpoint', - 'precision': 'SINGLE' - }, - 'nullable': false, - 'children': [], - }, - { - 'name': 'i32', - 'type': { - 'name': 'int', - 'isSigned': true, - 'bitWidth': 32 - }, - 'nullable': false, - 'children': [], - }, - { - 'name': 'dictionary', - 'type': { - 'name': 'utf8' - }, - 'nullable': false, - 'children': [], - 'dictionary': { - 'id': 0, - 'indexType': { - 'name': 'int', - 'isSigned': true, - 'bitWidth': 8 - }, - 'isOrdered': false - } - } - ] - } - ] - }, - 'dictionaries': [{ - 'id': 0, - 'data': { - 'count': 3, - 'columns': [ - { - 'name': 'DICT0', - 'count': 3, - 'VALIDITY': [], - 'OFFSET': [ - 0, - 1, - 2, - 3 - ], - 'DATA': [ - 'a', - 'b', - 'c', - ] - } - ] - } - }], - 'batches': [{ - 'count': 7, - 'columns': [ - { - 'name': 'struct', - 'count': 7, - 'VALIDITY': [], - 'children': [ - { - 'name': 'f32', - 'count': 7, - 'VALIDITY': [], - 'DATA': [-0.3, -0.2, -0.1, 0, 0.1, 0.2, 0.3] - }, - { - 'name': 'i32', - 'count': 7, - 'VALIDITY': [], - 'DATA': [-1, 1, -1, 1, -1, 1, -1] - }, - { - 'name': 'dictionary', - 'count': 7, - 'VALIDITY': [], - 'DATA': [0, 1, 2, 0, 1, 2, 0] - } - ] - } - ] - }] - }); + const structVec = getSingleRecordBatchTable().batchesUnion + return Table.fromVectors([structVec], ['struct']) } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 15a3479f63ad0..1a874542c8f9d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -17,9 +17,6 @@ # # Includes code assembled from BSD/MIT/Apache-licensed code from some 3rd-party # projects, including Kudu, Impala, and libdynd. 
See python/LICENSE.txt -# -# TODO(ARROW-3209): rename arrow_gpu to arrow_cuda -# cmake_minimum_required(VERSION 2.7) project(pyarrow) @@ -393,13 +390,13 @@ if (PYARROW_BUILD_CUDA) endif() endif() if (MSVC) - ADD_THIRDPARTY_LIB(arrow_gpu + ADD_THIRDPARTY_LIB(arrow_cuda SHARED_LIB ${ARROW_CUDA_SHARED_IMP_LIB}) else() - ADD_THIRDPARTY_LIB(arrow_gpu + ADD_THIRDPARTY_LIB(arrow_cuda SHARED_LIB ${ARROW_CUDA_SHARED_LIB}) endif() - set(LINK_LIBS ${LINK_LIBS} arrow_gpu_shared) + set(LINK_LIBS ${LINK_LIBS} arrow_cuda_shared) set(CYTHON_EXTENSIONS ${CYTHON_EXTENSIONS} _cuda) endif() endif() diff --git a/python/Dockerfile b/python/Dockerfile index e97f82ec5300b..5c2ef1e30d142 100644 --- a/python/Dockerfile +++ b/python/Dockerfile @@ -25,9 +25,7 @@ RUN conda install -c conda-forge \ python=$PYTHON_VERSION && \ conda clean --all -ENV CC=gcc \ - CXX=g++ \ - ARROW_PYTHON=ON +ENV ARROW_PYTHON=ON # build and test CMD arrow/ci/docker_build_cpp.sh && \ diff --git a/python/Dockerfile.alpine b/python/Dockerfile.alpine new file mode 100644 index 0000000000000..7eedeac2860b0 --- /dev/null +++ b/python/Dockerfile.alpine @@ -0,0 +1,39 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM arrow:cpp-alpine + +# install python, either python3(3.6) or python2(2.7) +ARG PYTHON_VERSION=3.6 +RUN export PYTHON_MAJOR=${PYTHON_VERSION:0:1} && \ + apk add --no-cache python${PYTHON_MAJOR}-dev && \ + python${PYTHON_MAJOR} -m ensurepip && \ + ln -sf /usr/bin/pip${PYTHON_MAJOR} /usr/bin/pip && \ + ln -sf /usr/bin/python${PYTHON_MAJOR} /usr/bin/python && \ + pip install --upgrade pip setuptools + +# install python requirements +ADD python/requirements.txt /arrow/python/ +RUN pip install -r /arrow/python/requirements.txt cython pandas + +ENV ARROW_PYTHON=ON \ + PYARROW_WITH_PARQUET=0 + +# build and test +CMD arrow/ci/docker_build_cpp.sh && \ + arrow/ci/docker_build_python.sh && \ + pytest -v --pyargs pyarrow diff --git a/python/README.md b/python/README.md index a0d727e80a73a..7d66dddd87c77 100644 --- a/python/README.md +++ b/python/README.md @@ -79,10 +79,10 @@ and look for the "custom options" section. 
### Building the documentation ```bash -pip install -r doc/requirements.txt -python setup.py build_sphinx -s doc/source +pip install -r ../docs/requirements.txt +python setup.py build_sphinx -s ../docs/source ``` [2]: https://github.com/apache/arrow/blob/master/python/doc/source/development.rst [3]: https://github.com/pandas-dev/pandas -[4]: https://docs.pytest.org/en/latest/ \ No newline at end of file +[4]: https://docs.pytest.org/en/latest/ diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base index cc139f71ced57..d4b84629c1735 100644 --- a/python/manylinux1/Dockerfile-x86_64_base +++ b/python/manylinux1/Dockerfile-x86_64_base @@ -17,7 +17,7 @@ FROM quay.io/pypa/manylinux1_x86_64:latest # Install dependencies -RUN yum install -y ccache flex wget && yum clean all +RUN yum install -y xz ccache flex wget && yum clean all ADD scripts/build_zlib.sh / RUN /build_zlib.sh @@ -79,3 +79,9 @@ RUN git clone https://github.com/matthew-brett/multibuild.git && cd multibuild & ADD scripts/build_virtualenvs.sh / RUN /build_virtualenvs.sh + +ADD scripts/build_llvm.sh / +RUN /build_llvm.sh + +ADD scripts/build_clang.sh / +RUN /build_clang.sh diff --git a/python/manylinux1/scripts/build_boost.sh b/python/manylinux1/scripts/build_boost.sh index 4bdd7fa842877..3fb394d5ab7cc 100755 --- a/python/manylinux1/scripts/build_boost.sh +++ b/python/manylinux1/scripts/build_boost.sh @@ -25,7 +25,7 @@ mkdir /arrow_boost pushd /boost_${BOOST_VERSION_UNDERSCORE} ./bootstrap.sh ./b2 tools/bcp -./dist/bin/bcp --namespace=arrow_boost --namespace-alias filesystem date_time system regex build algorithm locale format /arrow_boost +./dist/bin/bcp --namespace=arrow_boost --namespace-alias filesystem date_time system regex build algorithm locale format variant /arrow_boost popd pushd /arrow_boost diff --git a/cpp/examples/parquet/low-level-api/CMakeLists.txt b/python/manylinux1/scripts/build_clang.sh old mode 100644 new mode 100755 similarity index 55% rename from cpp/examples/parquet/low-level-api/CMakeLists.txt rename to python/manylinux1/scripts/build_clang.sh index 26e8220c0d057..0bf4979e836e2 --- a/cpp/examples/parquet/low-level-api/CMakeLists.txt +++ b/python/manylinux1/scripts/build_clang.sh @@ -1,3 +1,4 @@ +#!/bin/bash -ex # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,15 +16,24 @@ # specific language governing permissions and limitations # under the License. -if (PARQUET_BUILD_EXAMPLES) - add_executable(parquet-reader-writer reader-writer.cc) - add_executable(parquet-reader-writer2 reader-writer2.cc) - target_include_directories(parquet-reader-writer PRIVATE .) - target_include_directories(parquet-reader-writer2 PRIVATE .) - target_link_libraries(parquet-reader-writer parquet_static) - target_link_libraries(parquet-reader-writer2 parquet_static) +source /multibuild/manylinux_utils.sh - add_dependencies(parquet - parquet-reader-writer - parquet-reader-writer2) -endif() +export LLVM_VERSION="6.0.0" +curl -sL http://releases.llvm.org/${LLVM_VERSION}/cfe-${LLVM_VERSION}.src.tar.xz -o cfe-${LLVM_VERSION}.src.tar.xz +unxz cfe-${LLVM_VERSION}.src.tar.xz +tar xf cfe-${LLVM_VERSION}.src.tar +pushd cfe-${LLVM_VERSION}.src +mkdir build +pushd build +cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCLANG_INCLUDE_TESTS=OFF \ + -DCLANG_INCLUDE_DOCS=OFF \ + -DLLVM_INCLUDE_TESTS=OFF \ + -DLLVM_INCLUDE_DOCS=OFF \ + -GNinja \ + .. 
+ninja install +popd +popd +rm -rf cfe-${LLVM_VERSION}.src.tar.xz cfe-${LLVM_VERSION}.src.tar cfe-${LLVM_VERSION}.src diff --git a/python/manylinux1/scripts/build_llvm.sh b/python/manylinux1/scripts/build_llvm.sh new file mode 100755 index 0000000000000..8298a869877b6 --- /dev/null +++ b/python/manylinux1/scripts/build_llvm.sh @@ -0,0 +1,40 @@ +#!/bin/bash -ex +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +source /multibuild/manylinux_utils.sh + +export LLVM_VERSION="6.0.0" +curl -sL http://releases.llvm.org/${LLVM_VERSION}/llvm-${LLVM_VERSION}.src.tar.xz -o llvm-${LLVM_VERSION}.src.tar.xz +unxz llvm-${LLVM_VERSION}.src.tar.xz +tar xf llvm-${LLVM_VERSION}.src.tar +pushd llvm-${LLVM_VERSION}.src +mkdir build +pushd build +cmake -DCMAKE_INSTALL_PREFIX=$PREFIX \ + -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=host \ + -DLLVM_INCLUDE_TESTS=OFF -DLLVM_INCLUDE_UTILS=OFF \ + -DLLVM_INCLUDE_DOCS=OFF -DLLVM_INCLUDE_EXAMPLES=OFF \ + -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \ + -DLLVM_USE_INTEL_JITEVENTS=ON \ + -DPYTHON_EXECUTABLE="$(cpython_path 2.7 32)/bin/python" \ + -GNinja \ + .. +ninja install +popd +popd +rm -rf llvm-${LLVM_VERSION}.src.tar.xz llvm-${LLVM_VERSION}.src.tar llvm-${LLVM_VERSION} diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 12c2285f2d24b..63ed53e0ebab5 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -192,7 +192,7 @@ def get_libraries(): Return list of library names to include in the `libraries` argument for C or Cython extensions using pyarrow """ - return ['arrow_python'] + return ['arrow', 'arrow_python'] def get_library_dirs(): diff --git a/python/pyarrow/_plasma.pyx b/python/pyarrow/_plasma.pyx index 677e768035e12..2fad09c0549c2 100644 --- a/python/pyarrow/_plasma.pyx +++ b/python/pyarrow/_plasma.pyx @@ -32,11 +32,13 @@ from cpython.pycapsule cimport * import collections import pyarrow import random +import socket from pyarrow.lib cimport Buffer, NativeFile, check_status, pyarrow_wrap_buffer from pyarrow.includes.libarrow cimport (CBuffer, CMutableBuffer, CFixedSizeBufferWriter, CStatus) +from pyarrow import compat PLASMA_WAIT_TIMEOUT = 2 ** 30 @@ -131,6 +133,10 @@ cdef extern from "plasma/client.h" nogil: CStatus Subscribe(int* fd) + CStatus DecodeNotification(const uint8_t* buffer, + CUniqueID* object_id, int64_t* data_size, + int64_t* metadata_size) + CStatus GetNotification(int fd, CUniqueID* object_id, int64_t* data_size, int64_t* metadata_size) @@ -729,6 +735,38 @@ cdef class PlasmaClient: with nogil: check_status(self.client.get().Subscribe(&self.notification_fd)) + def get_notification_socket(self): + """ + Get the notification socket. 
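+
+        The returned socket object wraps the file descriptor registered
+        by subscribe(); callers can read raw notification messages from
+        it and parse them with decode_notification().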
+ """ + return compat.get_socket_from_fd(self.notification_fd, + family=socket.AF_UNIX, + type=socket.SOCK_STREAM) + + def decode_notification(self, const uint8_t* buf): + """ + Get the notification from the buffer. + + Returns + ------- + ObjectID + The object ID of the object that was stored. + int + The data size of the object that was stored. + int + The metadata size of the object that was stored. + """ + cdef CUniqueID object_id + cdef int64_t data_size + cdef int64_t metadata_size + with nogil: + check_status(self.client.get() + .DecodeNotification(buf, + &object_id, + &data_size, + &metadata_size)) + return ObjectID(object_id.binary()), data_size, metadata_size + def get_next_notification(self): """ Get the next notification from the notification socket. diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py index a481db0d53c5d..068d5607de813 100644 --- a/python/pyarrow/compat.py +++ b/python/pyarrow/compat.py @@ -25,6 +25,7 @@ import sys import six from six import BytesIO, StringIO, string_types as py_string +import socket PY26 = sys.version_info[:2] == (2, 6) @@ -267,4 +268,13 @@ def import_pytorch_extension(): integer_types = six.integer_types + (np.integer,) + +def get_socket_from_fd(fileno, family, type): + if PY2: + socket_obj = socket.fromfd(fileno, family, type) + return socket.socket(family, type, _sock=socket_obj) + else: + return socket.socket(fileno=fileno, family=family, type=type) + + __all__ = [] diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 3f533d93145da..c5e745708308f 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -533,10 +533,12 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CTable] ReplaceSchemaMetadata( const shared_ptr[CKeyValueMetadata]& metadata) - cdef cppclass RecordBatchReader: - CStatus ReadNext(shared_ptr[CRecordBatch]* out) + cdef cppclass CRecordBatchReader" arrow::RecordBatchReader": + shared_ptr[CSchema] schema() + CStatus ReadNext(shared_ptr[CRecordBatch]* batch) + CStatus ReadAll(shared_ptr[CTable]* out) - cdef cppclass TableBatchReader(RecordBatchReader): + cdef cppclass TableBatchReader(CRecordBatchReader): TableBatchReader(const CTable& table) void set_chunksize(int64_t chunksize) @@ -825,10 +827,6 @@ cdef extern from "arrow/ipc/api.h" namespace "arrow::ipc" nogil: c_bool allow_64bit) CStatus WriteTable(const CTable& table, int64_t max_chunksize) - cdef cppclass CRecordBatchReader" arrow::ipc::RecordBatchReader": - shared_ptr[CSchema] schema() - CStatus ReadNext(shared_ptr[CRecordBatch]* batch) - cdef cppclass CRecordBatchStreamReader \ " arrow::ipc::RecordBatchStreamReader"(CRecordBatchReader): @staticmethod diff --git a/python/pyarrow/includes/libarrow_cuda.pxd b/python/pyarrow/includes/libarrow_cuda.pxd index 0e0d5e1ce0987..cedc43263e744 100644 --- a/python/pyarrow/includes/libarrow_cuda.pxd +++ b/python/pyarrow/includes/libarrow_cuda.pxd @@ -19,9 +19,9 @@ from pyarrow.includes.libarrow cimport * -cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil: +cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::cuda" nogil: - cdef cppclass CCudaDeviceManager" arrow::gpu::CudaDeviceManager": + cdef cppclass CCudaDeviceManager" arrow::cuda::CudaDeviceManager": @staticmethod CStatus GetInstance(CCudaDeviceManager** manager) CStatus GetContext(int gpu_number, shared_ptr[CCudaContext]* ctx) @@ -33,7 +33,7 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil: # CStatus FreeHost(void* 
data, int64_t nbytes) int num_devices() const - cdef cppclass CCudaContext" arrow::gpu::CudaContext": + cdef cppclass CCudaContext" arrow::cuda::CudaContext": shared_ptr[CCudaContext] shared_from_this() # CStatus Close() CStatus Allocate(int64_t nbytes, shared_ptr[CCudaBuffer]* out) @@ -47,13 +47,13 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil: const void* handle() const int device_number() const - cdef cppclass CCudaIpcMemHandle" arrow::gpu::CudaIpcMemHandle": + cdef cppclass CCudaIpcMemHandle" arrow::cuda::CudaIpcMemHandle": @staticmethod CStatus FromBuffer(const void* opaque_handle, shared_ptr[CCudaIpcMemHandle]* handle) CStatus Serialize(CMemoryPool* pool, shared_ptr[CBuffer]* out) const - cdef cppclass CCudaBuffer" arrow::gpu::CudaBuffer"(CBuffer): + cdef cppclass CCudaBuffer" arrow::cuda::CudaBuffer"(CBuffer): CCudaBuffer(uint8_t* data, int64_t size, const shared_ptr[CCudaContext]& context, c_bool own_data=false, c_bool is_ipc=false) @@ -73,17 +73,18 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil: CStatus ExportForIpc(shared_ptr[CCudaIpcMemHandle]* handle) shared_ptr[CCudaContext] context() const - cdef cppclass CCudaHostBuffer" arrow::gpu::CudaHostBuffer"(CMutableBuffer): + cdef cppclass \ + CCudaHostBuffer" arrow::cuda::CudaHostBuffer"(CMutableBuffer): pass cdef cppclass \ - CCudaBufferReader" arrow::gpu::CudaBufferReader"(CBufferReader): + CCudaBufferReader" arrow::cuda::CudaBufferReader"(CBufferReader): CCudaBufferReader(const shared_ptr[CBuffer]& buffer) CStatus Read(int64_t nbytes, int64_t* bytes_read, void* buffer) CStatus Read(int64_t nbytes, shared_ptr[CBuffer]* out) cdef cppclass \ - CCudaBufferWriter" arrow::gpu::CudaBufferWriter"(WritableFile): + CCudaBufferWriter" arrow::cuda::CudaBufferWriter"(WritableFile): CCudaBufferWriter(const shared_ptr[CCudaBuffer]& buffer) CStatus Close() CStatus Flush() @@ -98,17 +99,17 @@ cdef extern from "arrow/gpu/cuda_api.h" namespace "arrow::gpu" nogil: CStatus AllocateCudaHostBuffer(int device_number, const int64_t size, shared_ptr[CCudaHostBuffer]* out) - # Cuda prefix is added to avoid picking up arrow::gpu functions + # Cuda prefix is added to avoid picking up arrow::cuda functions # from arrow namespace. 
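+    # Note: only the quoted C++ names change in the gpu -> cuda rename;
+    # the Cython-level aliases (CudaSerializeRecordBatch, CudaReadMessage,
+    # CudaReadRecordBatch) keep their names, so callers of this .pxd are
+    # unaffected.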
- CStatus CudaSerializeRecordBatch" arrow::gpu::SerializeRecordBatch"\ + CStatus CudaSerializeRecordBatch" arrow::cuda::SerializeRecordBatch"\ (const CRecordBatch& batch, CCudaContext* ctx, shared_ptr[CCudaBuffer]* out) - CStatus CudaReadMessage" arrow::gpu::ReadMessage"\ + CStatus CudaReadMessage" arrow::cuda::ReadMessage"\ (CCudaBufferReader* reader, CMemoryPool* pool, unique_ptr[CMessage]* message) - CStatus CudaReadRecordBatch" arrow::gpu::ReadRecordBatch"\ + CStatus CudaReadRecordBatch" arrow::cuda::ReadRecordBatch"\ (const shared_ptr[CSchema]& schema, const shared_ptr[CCudaBuffer]& buffer, CMemoryPool* pool, shared_ptr[CRecordBatch]* out) diff --git a/python/pyarrow/ipc.pxi b/python/pyarrow/ipc.pxi index c9898f020cf52..137d5261d2474 100644 --- a/python/pyarrow/ipc.pxi +++ b/python/pyarrow/ipc.pxi @@ -285,21 +285,9 @@ cdef class _RecordBatchReader: """ Read all record batches as a pyarrow.Table """ - cdef: - vector[shared_ptr[CRecordBatch]] batches - shared_ptr[CRecordBatch] batch - shared_ptr[CTable] table - + cdef shared_ptr[CTable] table with nogil: - while True: - check_status(self.reader.get().ReadNext(&batch)) - if batch.get() == NULL: - break - batches.push_back(batch) - - check_status(CTable.FromRecordBatches(self.schema.sp_schema, - batches, &table)) - + check_status(self.reader.get().ReadAll(&table)) return pyarrow_wrap_table(table) diff --git a/python/pyarrow/plasma.py b/python/pyarrow/plasma.py index fbca9d55f1a0a..056172c9800de 100644 --- a/python/pyarrow/plasma.py +++ b/python/pyarrow/plasma.py @@ -39,7 +39,9 @@ tf_plasma_op = None -if os.path.exists(TF_PLASMA_OP_PATH): + +def load_plasma_tensorflow_op(): + global tf_plasma_op import tensorflow as tf tf_plasma_op = tf.load_op_library(TF_PLASMA_OP_PATH) diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 1350ad636ab2d..f9bd06ee04ef7 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. 
See the NOTICE file # distributed with this work for additional information @@ -1221,3 +1222,29 @@ def test_nested_dictionary_array(): dict_arr = pa.DictionaryArray.from_arrays([0, 1, 0], ['a', 'b']) dict_arr2 = pa.DictionaryArray.from_arrays([0, 1, 2, 1, 0], dict_arr) assert dict_arr2.to_pylist() == ['a', 'b', 'a', 'b', 'a'] + + +def test_array_from_numpy_str_utf8(): + # ARROW-3890 -- in Python 3, NPY_UNICODE arrays are produced, but in Python + # 2 they are NPY_STRING (binary), so we must do UTF-8 validation + vec = np.array(["toto", "tata"]) + vec2 = np.array(["toto", "tata"], dtype=object) + + arr = pa.array(vec, pa.string()) + arr2 = pa.array(vec2, pa.string()) + expected = pa.array([u"toto", u"tata"]) + assert arr.equals(expected) + assert arr2.equals(expected) + + # with mask, separate code path + mask = np.array([False, False], dtype=bool) + arr = pa.array(vec, pa.string(), mask=mask) + assert arr.equals(expected) + + # UTF8 validation failures + vec = np.array([(u'mañana').encode('utf-16-le')]) + with pytest.raises(ValueError): + pa.array(vec, pa.string()) + + with pytest.raises(ValueError): + pa.array(vec, pa.string(), mask=np.array([False])) diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index e4f38ffc91b18..ce9d6d117acb2 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -1362,6 +1362,13 @@ def test_selective_categoricals(self): result4 = table.to_pandas(categories=tuple()) tm.assert_frame_equal(result4, expected_str, check_dtype=True) + def test_to_pandas_categorical_zero_length(self): + # ARROW-3586 + array = pa.array([], type=pa.int32()) + table = pa.Table.from_arrays(arrays=[array], names=['col']) + # This would segfault under 0.11.0 + table.to_pandas(categories=['col']) + def test_table_str_to_categorical_without_na(self): values = ['a', 'a', 'b', 'b', 'c'] df = pd.DataFrame({'strings': values}) diff --git a/python/pyarrow/tests/test_ipc.py b/python/pyarrow/tests/test_ipc.py index b1fa06fd0778f..0fb66f8fa4d43 100644 --- a/python/pyarrow/tests/test_ipc.py +++ b/python/pyarrow/tests/test_ipc.py @@ -639,3 +639,19 @@ def read_file(source): reader = pa.open_file(source) return [reader.get_batch(i) for i in range(reader.num_record_batches)] + + +def test_write_empty_ipc_file(): + # ARROW-3894: IPC file was not being properly initialized when no record + # batches are being written + schema = pa.schema([('field', pa.int64())]) + + sink = pa.BufferOutputStream() + writer = pa.RecordBatchFileWriter(sink, schema) + writer.close() + + buf = sink.getvalue() + reader = pa.RecordBatchFileReader(pa.BufferReader(buf)) + table = reader.read_all() + assert len(table) == 0 + assert table.schema.equals(schema) diff --git a/python/pyarrow/tests/test_plasma.py b/python/pyarrow/tests/test_plasma.py index 69b3d9c0166fc..e3d31b7de1990 100644 --- a/python/pyarrow/tests/test_plasma.py +++ b/python/pyarrow/tests/test_plasma.py @@ -25,6 +25,7 @@ import pytest import random import signal +import struct import subprocess import sys import time @@ -742,6 +743,34 @@ def test_subscribe(self): assert data_sizes[j] == recv_dsize assert metadata_sizes[j] == recv_msize + def test_subscribe_socket(self): + # Subscribe to notifications from the Plasma Store. + self.plasma_client.subscribe() + rsock = self.plasma_client.get_notification_socket() + for i in self.SUBSCRIBE_TEST_SIZES: + # Get notification from socket. 
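+            # Each message on the socket is length-prefixed: an 8-byte
+            # native size header (struct format 'L') followed by a
+            # payload that decode_notification() turns into
+            # (ObjectID, data_size, metadata_size); see the recv calls
+            # below.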
+ object_ids = [random_object_id() for _ in range(i)] + metadata_sizes = [np.random.randint(1000) for _ in range(i)] + data_sizes = [np.random.randint(1000) for _ in range(i)] + + for j in range(i): + self.plasma_client.create( + object_ids[j], data_sizes[j], + metadata=bytearray(np.random.bytes(metadata_sizes[j]))) + self.plasma_client.seal(object_ids[j]) + + # Check that we received notifications for all of the objects. + for j in range(i): + # Assume the plasma store will not be full, + # so we always get the data size instead of -1. + msg_len, = struct.unpack('L', rsock.recv(8)) + content = rsock.recv(msg_len) + recv_objid, recv_dsize, recv_msize = ( + self.plasma_client.decode_notification(content)) + assert object_ids[j] == recv_objid + assert data_sizes[j] == recv_dsize + assert metadata_sizes[j] == recv_msize + def test_subscribe_deletions(self): # Subscribe to notifications from the Plasma Store. We use # plasma_client2 to make sure that all used objects will get evicted diff --git a/r/.gitignore b/r/.gitignore index 85c986810bdf0..0f405f5713608 100644 --- a/r/.gitignore +++ b/r/.gitignore @@ -1,3 +1,6 @@ +Meta +doc +inst/doc *.o *.o-* *.d diff --git a/r/DESCRIPTION b/r/DESCRIPTION index 0250023e8fbc1..5f93c83f236eb 100644 --- a/r/DESCRIPTION +++ b/r/DESCRIPTION @@ -61,6 +61,9 @@ Collate: 'memory_pool.R' 'message.R' 'on_exit.R' + 'read_record_batch.R' + 'read_table.R' 'reexports-bit64.R' 'reexports-tibble.R' + 'write_arrow.R' 'zzz.R' diff --git a/r/NAMESPACE b/r/NAMESPACE index 490d2118c5805..cc5961e5ba148 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -6,57 +6,60 @@ S3method("==","arrow::DataType") S3method("==","arrow::Field") S3method("==","arrow::RecordBatch") S3method("==","arrow::ipc::Message") +S3method(BufferReader,"arrow::Buffer") +S3method(BufferReader,default) +S3method(FixedSizeBufferWriter,"arrow::Buffer") +S3method(FixedSizeBufferWriter,default) +S3method(MessageReader,"arrow::io::InputStream") +S3method(MessageReader,default) +S3method(RecordBatchFileReader,"arrow::Buffer") +S3method(RecordBatchFileReader,"arrow::io::RandomAccessFile") +S3method(RecordBatchFileReader,character) +S3method(RecordBatchFileReader,fs_path) +S3method(RecordBatchFileReader,raw) +S3method(RecordBatchFileWriter,"arrow::io::OutputStream") +S3method(RecordBatchFileWriter,character) +S3method(RecordBatchFileWriter,fs_path) +S3method(RecordBatchStreamReader,"arrow::Buffer") +S3method(RecordBatchStreamReader,"arrow::io::InputStream") +S3method(RecordBatchStreamReader,raw) +S3method(RecordBatchStreamWriter,"arrow::io::OutputStream") +S3method(RecordBatchStreamWriter,character) +S3method(RecordBatchStreamWriter,fs_path) S3method(as_tibble,"arrow::RecordBatch") S3method(as_tibble,"arrow::Table") +S3method(buffer,complex) S3method(buffer,default) S3method(buffer,integer) S3method(buffer,numeric) S3method(buffer,raw) -S3method(buffer_reader,"arrow::Buffer") -S3method(buffer_reader,default) S3method(feather_table_reader,"arrow::io::RandomAccessFile") S3method(feather_table_reader,"arrow::ipc::feather::TableReader") S3method(feather_table_reader,character) S3method(feather_table_reader,default) S3method(feather_table_reader,fs_path) S3method(feather_table_writer,"arrow::io::OutputStream") -S3method(fixed_size_buffer_writer,"arrow::Buffer") -S3method(fixed_size_buffer_writer,default) S3method(length,"arrow::Array") -S3method(message_reader,"arrow::io::InputStream") -S3method(message_reader,default) -S3method(message_reader,raw) S3method(names,"arrow::RecordBatch") S3method(print,"arrow-enum") 
S3method(read_message,"arrow::io::InputStream") -S3method(read_message,default) -S3method(read_record_batch,"arrow::io::BufferReader") -S3method(read_record_batch,"arrow::io::RandomAccessFile") +S3method(read_message,"arrow::ipc::MessageReader") +S3method(read_record_batch,"arrow::Buffer") +S3method(read_record_batch,"arrow::io::InputStream") S3method(read_record_batch,"arrow::ipc::Message") -S3method(read_record_batch,"arrow::ipc::RecordBatchFileReader") -S3method(read_record_batch,"arrow::ipc::RecordBatchStreamReader") -S3method(read_record_batch,character) -S3method(read_record_batch,fs_path) S3method(read_record_batch,raw) S3method(read_schema,"arrow::Buffer") S3method(read_schema,"arrow::io::InputStream") -S3method(read_schema,default) S3method(read_schema,raw) -S3method(read_table,"arrow::io::BufferReader") -S3method(read_table,"arrow::io::RandomAccessFile") S3method(read_table,"arrow::ipc::RecordBatchFileReader") S3method(read_table,"arrow::ipc::RecordBatchStreamReader") S3method(read_table,character) S3method(read_table,fs_path) S3method(read_table,raw) -S3method(record_batch_file_reader,"arrow::io::RandomAccessFile") -S3method(record_batch_file_reader,character) -S3method(record_batch_file_reader,fs_path) -S3method(record_batch_stream_reader,"arrow::io::InputStream") -S3method(record_batch_stream_reader,raw) -S3method(write_arrow,"arrow::RecordBatch") -S3method(write_arrow,"arrow::Table") -S3method(write_arrow,data.frame) +S3method(write_arrow,"arrow::ipc::RecordBatchWriter") +S3method(write_arrow,character) +S3method(write_arrow,fs_path) +S3method(write_arrow,raw) S3method(write_feather,"arrow::RecordBatch") S3method(write_feather,data.frame) S3method(write_feather,default) @@ -64,19 +67,20 @@ S3method(write_feather_RecordBatch,"arrow::io::OutputStream") S3method(write_feather_RecordBatch,character) S3method(write_feather_RecordBatch,default) S3method(write_feather_RecordBatch,fs_path) -S3method(write_record_batch,"arrow::io::OutputStream") -S3method(write_record_batch,"arrow::ipc::RecordBatchWriter") -S3method(write_record_batch,character) -S3method(write_record_batch,fs_path) -S3method(write_record_batch,raw) -S3method(write_table,"arrow::io::OutputStream") -S3method(write_table,"arrow::ipc::RecordBatchWriter") -S3method(write_table,character) -S3method(write_table,fs_path) -S3method(write_table,raw) +export(BufferOutputStream) +export(BufferReader) export(DateUnit) export(FileMode) +export(FileOutputStream) +export(FixedSizeBufferWriter) +export(MessageReader) export(MessageType) +export(MockOutputStream) +export(ReadableFile) +export(RecordBatchFileReader) +export(RecordBatchFileWriter) +export(RecordBatchStreamReader) +export(RecordBatchStreamWriter) export(StatusCode) export(TimeUnit) export(Type) @@ -84,20 +88,16 @@ export(array) export(as_tibble) export(boolean) export(buffer) -export(buffer_output_stream) -export(buffer_reader) export(cast_options) export(chunked_array) export(date32) export(date64) export(decimal) +export(default_memory_pool) export(dictionary) export(feather_table_reader) export(feather_table_writer) export(field) -export(file_open) -export(file_output_stream) -export(fixed_size_buffer_writer) export(float16) export(float32) export(float64) @@ -106,10 +106,8 @@ export(int32) export(int64) export(int8) export(list_of) -export(message_reader) export(mmap_create) export(mmap_open) -export(mock_output_stream) export(null) export(print.integer64) export(read_arrow) @@ -119,10 +117,6 @@ export(read_record_batch) export(read_schema) export(read_table) 
export(record_batch) -export(record_batch_file_reader) -export(record_batch_file_writer) -export(record_batch_stream_reader) -export(record_batch_stream_writer) export(schema) export(str.integer64) export(struct) @@ -138,8 +132,6 @@ export(utf8) export(write_arrow) export(write_feather) export(write_feather_RecordBatch) -export(write_record_batch) -export(write_table) importFrom(R6,R6Class) importFrom(Rcpp,sourceCpp) importFrom(assertthat,assert_that) diff --git a/r/R/ArrayData.R b/r/R/ArrayData.R index 47b858d589f3f..765971b405b00 100644 --- a/r/R/ArrayData.R +++ b/r/R/ArrayData.R @@ -17,6 +17,30 @@ #' @include R6.R +#' @title class arrow::ArrayData +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Usage: +#' +#' ``` +#' data <- array(...)$data() +#' +#' data$type() +#' data$length() +#' data$null_count() +#' data$offset() +#' data$buffers() +#' ``` +#' +#' @section Methods: +#' +#' ... +#' +#' @rdname arrow__ArrayData +#' @name arrow__ArrayData `arrow::ArrayData` <- R6Class("arrow::ArrayData", inherit = `arrow::Object`, active = list( diff --git a/r/R/ChunkedArray.R b/r/R/ChunkedArray.R index 338438f578d7f..46e4076629099 100644 --- a/r/R/ChunkedArray.R +++ b/r/R/ChunkedArray.R @@ -17,14 +17,22 @@ #' @include R6.R +#' @title class arrow::ChunkedArray +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__ChunkedArray +#' @name arrow__ChunkedArray `arrow::ChunkedArray` <- R6Class("arrow::ChunkedArray", inherit = `arrow::Object`, public = list( length = function() ChunkedArray__length(self), - null_count = function() ChunkedArray__null_count(self), - num_chunks = function() ChunkedArray__num_chunks(self), chunk = function(i) shared_ptr(`arrow::Array`, ChunkedArray__chunk(self, i)), - chunks = function() purrr::map(ChunkedArray__chunks(self), shared_ptr, class = `arrow::Array`), - type = function() `arrow::DataType`$dispatch(ChunkedArray__type(self)), as_vector = function() ChunkedArray__as_vector(self), Slice = function(offset, length = NULL){ if (is.null(length)) { @@ -38,10 +46,16 @@ assert_that(inherits(options, "arrow::compute::CastOptions")) shared_ptr(`arrow::ChunkedArray`, ChunkedArray__cast(self, target_type, options)) } + ), + active = list( + null_count = function() ChunkedArray__null_count(self), + num_chunks = function() ChunkedArray__num_chunks(self), + chunks = function() map(ChunkedArray__chunks(self), shared_ptr, class = `arrow::Array`), + type = function() `arrow::DataType`$dispatch(ChunkedArray__type(self)) ) ) -#' create an arrow::Array from an R vector +#' create an [arrow::ChunkedArray][arrow__ChunkedArray] from various R vectors #' #' @param \dots Vectors to coerce #' @param type currently ignored diff --git a/r/R/Column.R b/r/R/Column.R index bf3fe0a0e10df..fb8af1ea31543 100644 --- a/r/R/Column.R +++ b/r/R/Column.R @@ -17,11 +17,26 @@ #' @include R6.R +#' @title class arrow::Column +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__Column +#' @name arrow__Column `arrow::Column` <- R6Class("arrow::Column", inherit = `arrow::Object`, public = list( length = function() Column__length(self), - null_count = function() Column__null_count(self), - type = function() `arrow::DataType`$dispatch(Column__type(self)), data = function() shared_ptr(`arrow::ChunkedArray`, Column__data(self)) + ), + + active = list( + null_count = function() Column__null_count(self), + type = function() `arrow::DataType`$dispatch(Column__type(self)) ) 
) diff --git a/r/R/Field.R b/r/R/Field.R index 79c0f33be6846..4f5636fbfffe2 100644 --- a/r/R/Field.R +++ b/r/R/Field.R @@ -17,20 +17,35 @@ #' @include R6.R +#' @title class arrow::Field +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__Field +#' @name arrow__Field `arrow::Field` <- R6Class("arrow::Field", inherit = `arrow::Object`, public = list( ToString = function() { Field__ToString(self) }, + Equals = function(other) { + inherits(other, "arrow::Field") && Field__Equals(self, other) + } + ), + + active = list( name = function() { Field__name(self) }, nullable = function() { Field__nullable(self) }, - Equals = function(other) { - inherits(other, "arrow::Field") && Field__Equals(self, other) - }, type = function() { `arrow::DataType`$dispatch(Field__type(self)) } diff --git a/r/R/R6.R b/r/R/R6.R index 1caa885d90cab..69d58e0c13663 100644 --- a/r/R/R6.R +++ b/r/R/R6.R @@ -54,15 +54,24 @@ unique_ptr <- function(class, xp) { !(lhs == rhs) } +#' @title class arrow::DataType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__DataType +#' @name arrow__DataType `arrow::DataType` <- R6Class("arrow::DataType", inherit = `arrow::Object`, public = list( ToString = function() { DataType__ToString(self) }, - name = function() { - DataType__name(self) - }, Equals = function(other) { assert_that(inherits(other, "arrow::DataType")) DataType__Equals(self, other) @@ -73,11 +82,9 @@ unique_ptr <- function(class, xp) { children = function() { map(DataType__children_pointer(self), shared_ptr, class= `arrow::Field`) }, - id = function(){ - DataType__id(self) - }, + ..dispatch = function(){ - switch(names(Type)[self$id()+1], + switch(names(Type)[self$id + 1], "NA" = null(), BOOL = boolean(), UINT8 = uint8(), @@ -107,6 +114,15 @@ unique_ptr <- function(class, xp) { MAP = stop("Type MAP not implemented yet") ) } + ), + + active = list( + id = function(){ + DataType__id(self) + }, + name = function() { + DataType__name(self) + } ) ) @@ -116,9 +132,21 @@ unique_ptr <- function(class, xp) { #----- metadata +#' @title class arrow::FixedWidthType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__FixedWidthType +#' @name arrow__FixedWidthType `arrow::FixedWidthType` <- R6Class("arrow::FixedWidthType", inherit = `arrow::DataType`, - public = list( + active = list( bit_width = function() FixedWidthType__bit_width(self) ) ) diff --git a/r/R/RcppExports.R b/r/R/RcppExports.R index 324510cf1b680..ccf854927b76e 100644 --- a/r/R/RcppExports.R +++ b/r/R/RcppExports.R @@ -629,6 +629,10 @@ RecordBatch__schema <- function(x) { .Call(`_arrow_RecordBatch__schema`, x) } +RecordBatch__columns <- function(batch) { + .Call(`_arrow_RecordBatch__columns`, batch) +} + RecordBatch__column <- function(batch, i) { .Call(`_arrow_RecordBatch__column`, batch, i) } @@ -665,6 +669,14 @@ RecordBatch__Slice2 <- function(self, offset, length) { .Call(`_arrow_RecordBatch__Slice2`, self, offset, length) } +ipc___SerializeRecordBatch__Raw <- function(batch) { + .Call(`_arrow_ipc___SerializeRecordBatch__Raw`, batch) +} + +ipc___ReadRecordBatch__InputStream__Schema <- function(stream, schema) { + .Call(`_arrow_ipc___ReadRecordBatch__InputStream__Schema`, stream, schema) +} + RecordBatchReader__schema <- function(reader) { .Call(`_arrow_RecordBatchReader__schema`, reader) } @@ -677,6 +689,10 @@ ipc___RecordBatchStreamReader__Open <- function(stream) { 
.Call(`_arrow_ipc___RecordBatchStreamReader__Open`, stream) } +ipc___RecordBatchStreamReader__batches <- function(reader) { + .Call(`_arrow_ipc___RecordBatchStreamReader__batches`, reader) +} + ipc___RecordBatchFileReader__schema <- function(reader) { .Call(`_arrow_ipc___RecordBatchFileReader__schema`, reader) } @@ -701,16 +717,12 @@ Table__from_RecordBatchStreamReader <- function(reader) { .Call(`_arrow_Table__from_RecordBatchStreamReader`, reader) } -ipc___RecordBatchFileWriter__Open <- function(stream, schema) { - .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema) -} - -ipc___RecordBatchStreamWriter__Open <- function(stream, schema) { - .Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema) +ipc___RecordBatchFileReader__batches <- function(reader) { + .Call(`_arrow_ipc___RecordBatchFileReader__batches`, reader) } -ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch, allow_64bit) { - invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch, allow_64bit)) +ipc___RecordBatchWriter__WriteRecordBatch <- function(batch_writer, batch) { + invisible(.Call(`_arrow_ipc___RecordBatchWriter__WriteRecordBatch`, batch_writer, batch)) } ipc___RecordBatchWriter__WriteTable <- function(batch_writer, table) { @@ -721,6 +733,14 @@ ipc___RecordBatchWriter__Close <- function(batch_writer) { invisible(.Call(`_arrow_ipc___RecordBatchWriter__Close`, batch_writer)) } +ipc___RecordBatchFileWriter__Open <- function(stream, schema) { + .Call(`_arrow_ipc___RecordBatchFileWriter__Open`, stream, schema) +} + +ipc___RecordBatchStreamWriter__Open <- function(stream, schema) { + .Call(`_arrow_ipc___RecordBatchStreamWriter__Open`, stream, schema) +} + Table__from_dataframe <- function(tbl) { .Call(`_arrow_Table__from_dataframe`, tbl) } @@ -745,3 +765,7 @@ Table__column <- function(table, i) { .Call(`_arrow_Table__column`, table, i) } +Table__columns <- function(table) { + .Call(`_arrow_Table__columns`, table) +} + diff --git a/r/R/RecordBatch.R b/r/R/RecordBatch.R index c606d12143bcd..fed10abee769c 100644 --- a/r/R/RecordBatch.R +++ b/r/R/RecordBatch.R @@ -17,11 +17,20 @@ #' @include R6.R +#' @title class arrow::RecordBatch +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__RecordBatch +#' @name arrow__RecordBatch `arrow::RecordBatch` <- R6Class("arrow::RecordBatch", inherit = `arrow::Object`, public = list( - num_columns = function() RecordBatch__num_columns(self), - num_rows = function() RecordBatch__num_rows(self), - schema = function() shared_ptr(`arrow::Schema`, RecordBatch__schema(self)), column = function(i) shared_ptr(`arrow::Array`, RecordBatch__column(self, i)), column_name = function(i) RecordBatch__column_name(self, i), names = function() RecordBatch__names(self), @@ -29,9 +38,11 @@ assert_that(inherits(other, "arrow::RecordBatch")) RecordBatch__Equals(self, other) }, + RemoveColumn = function(i){ shared_ptr(`arrow::RecordBatch`, RecordBatch__RemoveColumn(self, i)) }, + Slice = function(offset, length = NULL) { if (is.null(length)) { shared_ptr(`arrow::RecordBatch`, RecordBatch__Slice1(self, offset)) @@ -40,14 +51,21 @@ } }, - serialize = function(output_stream, ...) 
write_record_batch(self, output_stream, ...), + serialize = function() ipc___SerializeRecordBatch__Raw(self), cast = function(target_schema, safe = TRUE, options = cast_options(safe)) { assert_that(inherits(target_schema, "arrow::Schema")) assert_that(inherits(options, "arrow::compute::CastOptions")) - assert_that(identical(self$schema()$names, target_schema$names), msg = "incompatible schemas") + assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") shared_ptr(`arrow::RecordBatch`, RecordBatch__cast(self, target_schema, options)) } + ), + + active = list( + num_columns = function() RecordBatch__num_columns(self), + num_rows = function() RecordBatch__num_rows(self), + schema = function() shared_ptr(`arrow::Schema`, RecordBatch__schema(self)), + columns = function() map(RecordBatch__columns(self), shared_ptr, class = `arrow::Array`) ) ) @@ -66,10 +84,11 @@ RecordBatch__to_dataframe(x) } -#' Create an arrow::RecordBatch from a data frame +#' Create an [arrow::RecordBatch][arrow__RecordBatch] from a data frame #' #' @param .data a data frame #' +#' @return a [arrow::RecordBatch][arrow__RecordBatch] #' @export record_batch <- function(.data){ shared_ptr(`arrow::RecordBatch`, RecordBatch__from_dataframe(.data)) diff --git a/r/R/RecordBatchReader.R b/r/R/RecordBatchReader.R index 350375384266f..6dab2d1ff7676 100644 --- a/r/R/RecordBatchReader.R +++ b/r/R/RecordBatchReader.R @@ -17,179 +17,127 @@ #' @include R6.R +#' @title class arrow::RecordBatchReader +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__RecordBatchReader +#' @name arrow__RecordBatchReader `arrow::RecordBatchReader` <- R6Class("arrow::RecordBatchReader", inherit = `arrow::Object`, public = list( - schema = function() shared_ptr(`arrow::Schema`, RecordBatchReader__schema(self)), - ReadNext = function() { + read_next_batch = function() { shared_ptr(`arrow::RecordBatch`, RecordBatchReader__ReadNext(self)) } + ), + active = list( + schema = function() shared_ptr(`arrow::Schema`, RecordBatchReader__schema(self)) ) ) -`arrow::ipc::RecordBatchStreamReader` <- R6Class("arrow::ipc::RecordBatchStreamReader", inherit = `arrow::RecordBatchReader`) - -`arrow::ipc::RecordBatchFileReader` <- R6Class("arrow::ipc::RecordBatchFileReader", inherit = `arrow::Object`, +#' @title class arrow::ipc::RecordBatchStreamReader +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__ipc__RecordBatchStreamReader +#' @name arrow__ipc__RecordBatchStreamReader `arrow::ipc::RecordBatchStreamReader` <- R6Class("arrow::ipc::RecordBatchStreamReader", inherit = `arrow::RecordBatchReader`, public = list( - schema = function() shared_ptr(`arrow::Schema`, ipc___RecordBatchFileReader__schema(self)), - num_record_batches = function() ipc___RecordBatchFileReader__num_record_batches(self), - ReadRecordBatch = function(i) shared_ptr(`arrow::RecordBatch`, ipc___RecordBatchFileReader__ReadRecordBatch(self, i)) + batches = function() map(ipc___RecordBatchStreamReader__batches(self), shared_ptr, class = `arrow::RecordBatch`) ) ) - -#' Create a `arrow::ipc::RecordBatchStreamReader` from an input stream +#' @title class arrow::ipc::RecordBatchFileReader #' -#' @param stream input stream -#' @export -record_batch_stream_reader <- function(stream){ - UseMethod("record_batch_stream_reader") -} - -#' @export -`record_batch_stream_reader.arrow::io::InputStream` <- function(stream) { -
shared_ptr(`arrow::ipc::RecordBatchStreamReader`, ipc___RecordBatchStreamReader__Open(stream)) -} - -#' @export -`record_batch_stream_reader.raw` <- function(stream) { - record_batch_stream_reader(buffer_reader(stream)) -} - - -#' Create an `arrow::ipc::RecordBatchFileReader` from a file +#' @usage NULL +#' @format NULL +#' @docType class #' -#' @param file The file to read from +#' @section Methods: #' -#' @export -record_batch_file_reader <- function(file) { - UseMethod("record_batch_file_reader") -} - -#' @export -`record_batch_file_reader.arrow::io::RandomAccessFile` <- function(file) { - shared_ptr(`arrow::ipc::RecordBatchFileReader`, ipc___RecordBatchFileReader__Open(file)) -} - -#' @export -`record_batch_file_reader.character` <- function(file) { - assert_that(length(file) == 1L) - record_batch_file_reader(fs::path_abs(file)) -} - -#' @export -`record_batch_file_reader.fs_path` <- function(file) { - record_batch_file_reader(file_open(file)) -} +#' TODO +#' +#' @rdname arrow__ipc__RecordBatchFileReader +#' @name arrow__ipc__RecordBatchFileReader +`arrow::ipc::RecordBatchFileReader` <- R6Class("arrow::ipc::RecordBatchFileReader", inherit = `arrow::Object`, + public = list( + get_batch = function(i) shared_ptr(`arrow::RecordBatch`, ipc___RecordBatchFileReader__ReadRecordBatch(self, i)), -#-------- read_record_batch + batches = function() map(ipc___RecordBatchFileReader__batches(self), shared_ptr, class = `arrow::RecordBatch`) + ), + active = list( + num_record_batches = function() ipc___RecordBatchFileReader__num_record_batches(self), + schema = function() shared_ptr(`arrow::Schema`, ipc___RecordBatchFileReader__schema(self)) + ) +) -#' Read a single record batch from a stream +#' Create a [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader] from an input stream #' -#' @param stream input stream -#' @param ... additional parameters +#' @param stream input stream, an [arrow::io::InputStream][arrow__io__InputStream] or a raw vector #' -#' @details `stream` can be a `arrow::io::RandomAccessFile` stream as created by [file_open()] or [mmap_open()] or a path. -#' -#' @export -read_record_batch <- function(stream, ...){ - UseMethod("read_record_batch") -} - -#' @export -read_record_batch.character <- function(stream, ...){ - assert_that(length(stream) == 1L) - read_record_batch(fs::path_abs(stream)) -} - -#' @export -read_record_batch.fs_path <- function(stream, ...){ - stream <- close_on_exit(file_open(stream)) - read_record_batch(stream) -} - #' @export -`read_record_batch.arrow::io::RandomAccessFile` <- function(stream, ...){ - reader <- record_batch_file_reader(stream) - reader$ReadRecordBatch(0) +RecordBatchStreamReader <- function(stream){ + UseMethod("RecordBatchStreamReader") } #' @export -`read_record_batch.arrow::io::BufferReader` <- function(stream, ...){ - reader <- record_batch_stream_reader(stream) - reader$ReadNext() -} - -#' @export -read_record_batch.raw <- function(stream, ...){ - stream <- close_on_exit(buffer_reader(stream)) - read_record_batch(stream) -} - -#' @export -`read_record_batch.arrow::ipc::RecordBatchStreamReader` <- function(stream, ...) { - stream$ReadNext() +`RecordBatchStreamReader.arrow::io::InputStream` <- function(stream) { + shared_ptr(`arrow::ipc::RecordBatchStreamReader`, ipc___RecordBatchStreamReader__Open(stream)) } #' @export -`read_record_batch.arrow::ipc::RecordBatchFileReader` <- function(stream, i = 0, ...) 
{ - stream$ReadRecordBatch(i) +`RecordBatchStreamReader.raw` <- function(stream) { + RecordBatchStreamReader(BufferReader(stream)) } #' @export -`read_record_batch.arrow::ipc::Message` <- function(stream, schema, ...) { - assert_that(inherits(schema, "arrow::Schema")) - shared_ptr(`arrow::RecordBatch`, ipc___ReadRecordBatch__Message__Schema(stream, schema)) +`RecordBatchStreamReader.arrow::Buffer` <- function(stream) { + RecordBatchStreamReader(BufferReader(stream)) } -#--------- read_table - -#' Read an arrow::Table from a stream +#' Create an [arrow::ipc::RecordBatchFileReader][arrow__ipc__RecordBatchFileReader] from a file #' -#' @param stream stream. Either a stream created by [file_open()] or [mmap_open()] or a file path. +#' @param file The file to read from. A file path, or an [arrow::io::RandomAccessFile][arrow__ipc__RecordBatchFileReader] #' #' @export -read_table <- function(stream){ - UseMethod("read_table") +RecordBatchFileReader <- function(file) { + UseMethod("RecordBatchFileReader") } #' @export -read_table.character <- function(stream){ - assert_that(length(stream) == 1L) - read_table(fs::path_abs(stream)) -} - -#' @export -read_table.fs_path <- function(stream) { - stream <- close_on_exit(file_open(stream)) - read_table(stream) -} - -#' @export -`read_table.arrow::io::RandomAccessFile` <- function(stream) { - reader <- record_batch_file_reader(stream) - read_table(reader) +`RecordBatchFileReader.arrow::io::RandomAccessFile` <- function(file) { + shared_ptr(`arrow::ipc::RecordBatchFileReader`, ipc___RecordBatchFileReader__Open(file)) } #' @export -`read_table.arrow::ipc::RecordBatchFileReader` <- function(stream) { - shared_ptr(`arrow::Table`, Table__from_RecordBatchFileReader(stream)) +`RecordBatchFileReader.character` <- function(file) { + assert_that(length(file) == 1L) + RecordBatchFileReader(fs::path_abs(file)) } #' @export -`read_table.arrow::ipc::RecordBatchStreamReader` <- function(stream) { - shared_ptr(`arrow::Table`, Table__from_RecordBatchStreamReader(stream)) +`RecordBatchFileReader.fs_path` <- function(file) { + RecordBatchFileReader(ReadableFile(file)) } #' @export -`read_table.arrow::io::BufferReader` <- function(stream) { - reader <- record_batch_stream_reader(stream) - read_table(reader) +`RecordBatchFileReader.arrow::Buffer` <- function(file) { + RecordBatchFileReader(BufferReader(file)) } #' @export -`read_table.raw` <- function(stream) { - stream <- close_on_exit(buffer_reader(stream)) - read_table(stream) +`RecordBatchFileReader.raw` <- function(file) { + RecordBatchFileReader(BufferReader(file)) } - diff --git a/r/R/RecordBatchWriter.R b/r/R/RecordBatchWriter.R index 515b6986b9445..77305114d3344 100644 --- a/r/R/RecordBatchWriter.R +++ b/r/R/RecordBatchWriter.R @@ -17,175 +17,174 @@ #' @include R6.R +#' @title class arrow::ipc::RecordBatchWriter +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' - `$write_batch(batch)`: Write record batch to stream +#' - `$write_table(table)`: write Table to stream +#' - `$close()`: close stream +#' +#' @section Derived classes: +#' +#' - [arrow::ipc::RecordBatchStreamWriter][arrow__ipc__RecordBatchStreamWriter] implements the streaming binary format +#' - [arrow::ipc::RecordBatchFileWriter][arrow__ipc__RecordBatchFileWriter] implements the binary file format +#' +#' @rdname arrow__ipc__RecordBatchWriter +#' @name arrow__ipc__RecordBatchWriter `arrow::ipc::RecordBatchWriter` <- R6Class("arrow::ipc::RecordBatchWriter", inherit = `arrow::Object`, public = list( - WriteRecordBatch = 
function(batch, allow_64bit) ipc___RecordBatchWriter__WriteRecordBatch(self, batch, allow_64bit), - WriteTable = function(table) ipc___RecordBatchWriter__WriteTable(self, table), - Close = function() ipc___RecordBatchWriter__Close(self) + write_batch = function(batch) ipc___RecordBatchWriter__WriteRecordBatch(self, batch), + write_table = function(table) ipc___RecordBatchWriter__WriteTable(self, table), + + write = function(x) { + if (inherits(x, "arrow::RecordBatch")) { + self$write_batch(x) + } else if(inherits(x, "arrow::Table")) { + self$write_table(x) + } else if (inherits(x, "data.frame")) { + self$write_table(table(x)) + } else { + abort("unexpected type for RecordBatchWriter$write(), must be an arrow::RecordBatch or an arrow::Table") + } + }, + + close = function() ipc___RecordBatchWriter__Close(self) ) ) -`arrow::ipc::RecordBatchStreamWriter` <- R6Class("arrow::ipc::RecordBatchStreamWriter", inherit = `arrow::ipc::RecordBatchWriter`) -`arrow::ipc::RecordBatchFileWriter` <- R6Class("arrow::ipc::RecordBatchFileWriter", inherit = `arrow::ipc::RecordBatchStreamWriter`) - -#' Create a record batch file writer from a stream +#' @title class arrow::ipc::RecordBatchStreamWriter #' -#' @param stream a stream -#' @param schema the schema of the batches +#' Writer for the Arrow streaming binary format #' -#' @return an `arrow::ipc::RecordBatchWriter` object +#' @usage NULL +#' @format NULL +#' @docType class #' -#' @export -record_batch_file_writer <- function(stream, schema) { - assert_that( - inherits(stream, "arrow::io::OutputStream"), - inherits(schema, "arrow::Schema") - ) - shared_ptr(`arrow::ipc::RecordBatchFileWriter`, ipc___RecordBatchFileWriter__Open(stream, schema)) -} - -#' Create a record batch stream writer +#' @section usage: #' -#' @param stream a stream -#' @param schema a schema +#' ``` +#' writer <- RecordBatchStreamWriter(sink, schema) #' -#' @export -record_batch_stream_writer <- function(stream, schema) { - assert_that( - inherits(stream, "arrow::io::OutputStream"), - inherits(schema, "arrow::Schema") - ) - shared_ptr(`arrow::ipc::RecordBatchStreamWriter`, ipc___RecordBatchStreamWriter__Open(stream, schema)) -} - -#-------- write_record_batch - -#' write a record batch +#' writer$write_batch(batch) +#' writer$write_table(table) +#' writer$close() +#' ``` #' -#' @param x a `arrow::RecordBatch` -#' @param stream where to stream the record batch -#' @param ... extra parameters +#' @section Factory: #' -#' @export -write_record_batch <- function(x, stream, ...){ - UseMethod("write_record_batch", stream) -} - -#' @export -`write_record_batch.arrow::io::OutputStream` <- function(x, stream, ...) { - stream_writer <- close_on_exit(record_batch_stream_writer(stream, x$schema())) - write_record_batch(x, stream_writer) -} - -#' @export -`write_record_batch.arrow::ipc::RecordBatchWriter` <- function(x, stream, allow_64bit = TRUE, ...){ - stream$WriteRecordBatch(x, allow_64bit) -} - -#' @export -`write_record_batch.character` <- function(x, stream, ...) { - assert_that(length(stream) == 1L) - write_record_batch(x, fs::path_abs(stream), ...) -} - -#' @export -`write_record_batch.fs_path` <- function(x, stream, ...) { - assert_that(length(stream) == 1L) - file_stream <- close_on_exit(file_output_stream(stream)) - file_writer <- close_on_exit(record_batch_file_writer(file_stream, x$schema())) - write_record_batch(x, file_writer, ...) -} - -#' @export -`write_record_batch.raw` <- function(x, stream, ...) 
-  # how many bytes do we need
-  mock <- mock_output_stream()
-  write_record_batch(x, mock)
-  n <- mock$GetExtentBytesWritten()
-
-  bytes <- raw(n)
-  buffer <- buffer(bytes)
-  buffer_writer <- fixed_size_buffer_writer(buffer)
-  write_record_batch(x, buffer_writer)
-
-  bytes
-}
-
-#-------- stream Table
-
-#' write an arrow::Table
+#' The [RecordBatchStreamWriter()] function creates a record batch stream writer.
 #'
-#' @param x an `arrow::Table`
-#' @param stream where to stream the record batch
-#' @param ... extra parameters
+#' @section Methods:
+#' inherited from [arrow::ipc::RecordBatchWriter][arrow__ipc__RecordBatchWriter]
 #'
-#' @export
-write_table <- function(x, stream, ...) {
-  UseMethod("write_table", stream)
-}
+#' - `$write_batch(batch)`: write a record batch to the stream
+#' - `$write_table(table)`: write a Table to the stream
+#' - `$close()`: close the stream
+#'
+#' @rdname arrow__ipc__RecordBatchStreamWriter
+#' @name arrow__ipc__RecordBatchStreamWriter
+`arrow::ipc::RecordBatchStreamWriter` <- R6Class("arrow::ipc::RecordBatchStreamWriter", inherit = `arrow::ipc::RecordBatchWriter`)

+#' Writer for the Arrow streaming binary format
+#'
+#' @param sink Where to write. Can either be:
+#'
+#' - a string (used as a file path, passed to [fs::path_abs()])
+#' - a [file path][fs::path_abs()] (`fs_path` object)
+#' - an [arrow::io::OutputStream][arrow__io__OutputStream]
+#'
+#' @param schema The [arrow::Schema][arrow__Schema] for data to be written.
+#'
+#' @return a [arrow::ipc::RecordBatchStreamWriter][arrow__ipc__RecordBatchStreamWriter]
+#'
 #' @export
-`write_table.arrow::io::OutputStream` <- function(x, stream, ...) {
-  stream_writer <- close_on_exit(record_batch_stream_writer(stream, x$schema()))
-  write_table(x, stream_writer)
+RecordBatchStreamWriter <- function(sink, schema) {
+  UseMethod("RecordBatchStreamWriter")
 }

 #' @export
-`write_table.arrow::ipc::RecordBatchWriter` <- function(x, stream, ...){
-  stream$WriteTable(x)
+RecordBatchStreamWriter.character <- function(sink, schema){
+  RecordBatchStreamWriter(fs::path_abs(sink), schema)
 }

 #' @export
-`write_table.character` <- function(x, stream, ...) {
-  assert_that(length(stream) == 1L)
-  write_table(x, fs::path_abs(stream), ...)
+RecordBatchStreamWriter.fs_path <- function(sink, schema){
+  RecordBatchStreamWriter(FileOutputStream(sink), schema)
 }

 #' @export
-`write_table.fs_path` <- function(x, stream, ...) {
-  assert_that(length(stream) == 1L)
-  file_stream <- close_on_exit(file_output_stream(stream))
-  file_writer <- close_on_exit(record_batch_file_writer(file_stream, x$schema()))
-  write_table(x, file_writer, ...)
+`RecordBatchStreamWriter.arrow::io::OutputStream` <- function(sink, schema){
+  assert_that(inherits(schema, "arrow::Schema"))
+  shared_ptr(`arrow::ipc::RecordBatchStreamWriter`, ipc___RecordBatchStreamWriter__Open(sink, schema))
 }

 #' @export
-`write_table.raw` <- function(x, stream, ...) {
-  # how many bytes do we need
-  mock <- mock_output_stream()
-  write_table(x, mock)
-  n <- mock$GetExtentBytesWritten()
-
-  bytes <- raw(n)
-  buffer <- buffer(bytes)
-  buffer_writer <- fixed_size_buffer_writer(buffer)
-  write_table(x, buffer_writer)
-
-  bytes
-}
+#' @title class arrow::ipc::RecordBatchFileWriter
+#'
+#' Writer for the Arrow binary file format
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Usage:
+#'
+#' ```
+#' writer <- RecordBatchFileWriter(sink, schema)
+#'
+#' writer$write_batch(batch)
+#' writer$write_table(table)
+#' writer$close()
+#' ```
+#'
+#' @section Factory:
+#'
+#' The [RecordBatchFileWriter()] function creates a record batch file writer.
+#'
+#' @section Methods:
+#' inherited from [arrow::ipc::RecordBatchWriter][arrow__ipc__RecordBatchWriter]
+#'
+#' - `$write_batch(batch)`: write a record batch to the stream
+#' - `$write_table(table)`: write a Table to the stream
+#' - `$close()`: close the stream
+#'
+#' @rdname arrow__ipc__RecordBatchFileWriter
+#' @name arrow__ipc__RecordBatchFileWriter
+`arrow::ipc::RecordBatchFileWriter` <- R6Class("arrow::ipc::RecordBatchFileWriter", inherit = `arrow::ipc::RecordBatchStreamWriter`)

-#' Write an object to a stream
+#' Create a record batch file writer
 #'
-#' @param x An object to stream
-#' @param stream A stream
-#' @param ... additional parameters
+#' @param sink Where to write. Can either be:
+#'
+#' - character vector of length one
+#' - a [file path][fs::path_abs()]
+#' - [arrow::io::OutputStream][arrow__io__OutputStream]
+#'
+#' @param schema The [arrow::Schema][arrow__Schema] for data to be written.
+#'
+#' @return an [arrow::ipc::RecordBatchFileWriter][arrow__ipc__RecordBatchFileWriter] object
 #'
 #' @export
-write_arrow <- function(x, stream, ...){
-  UseMethod("write_arrow")
+RecordBatchFileWriter <- function(sink, schema) {
+  UseMethod("RecordBatchFileWriter")
 }

 #' @export
-`write_arrow.arrow::RecordBatch` <- function(x, stream, ...) {
-  write_record_batch(x, stream, ...)
+RecordBatchFileWriter.character <- function(sink, schema){
+  RecordBatchFileWriter(fs::path_abs(sink), schema)
 }

 #' @export
-`write_arrow.arrow::Table` <- function(x, stream, ...) {
-  write_table(x, stream, ...)
+RecordBatchFileWriter.fs_path <- function(sink, schema){
+  RecordBatchFileWriter(FileOutputStream(sink), schema)
 }

 #' @export
-`write_arrow.data.frame` <- function(x, stream, ...) {
-  write_record_batch(record_batch(x), stream, ...)
+`RecordBatchFileWriter.arrow::io::OutputStream` <- function(sink, schema){
+  assert_that(inherits(schema, "arrow::Schema"))
+  shared_ptr(`arrow::ipc::RecordBatchFileWriter`, ipc___RecordBatchFileWriter__Open(sink, schema))
 }
diff --git a/r/R/Schema.R b/r/R/Schema.R
index b158fee169d34..08047a3b11f46 100644
--- a/r/R/Schema.R
+++ b/r/R/Schema.R
@@ -17,6 +17,30 @@
 #' @include R6.R

+#' @title class arrow::Schema
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Usage:
+#'
+#' ```
+#' s <- schema(...)
+#'
+#' s$ToString()
+#' s$num_fields()
+#' s$field(i)
+#' ```
+#'
+#' @section Methods:
+#'
+#' - `$ToString()`: convert to a string
+#' - `$num_fields()`: returns the number of fields
+#' - `$field(i)`: returns the field at index `i` (0-based)
+#'
+#' @rdname arrow__Schema
+#' @name arrow__Schema
 `arrow::Schema` <- R6Class("arrow::Schema",
   inherit = `arrow::Object`,
   public = list(
@@ -29,11 +53,11 @@
   )
 )

-#' Schema functions
+#' Schema factory
 #'
 #' @param ...
named list of data types #' -#' @return a Schema +#' @return a [schema][arrow__Schema] #' #' @export schema <- function(...){ @@ -50,11 +74,6 @@ read_schema <- function(stream, ...) { UseMethod("read_schema") } -#' @export -read_schema.default <- function(stream, ...) { - stop("unsupported") -} - #' @export `read_schema.arrow::io::InputStream` <- function(stream, ...) { shared_ptr(`arrow::Schema`, ipc___ReadSchema_InputStream(stream)) @@ -62,10 +81,12 @@ read_schema.default <- function(stream, ...) { #' @export `read_schema.arrow::Buffer` <- function(stream, ...) { - read_schema(buffer_reader(stream), ...) + stream <- close_on_exit(BufferReader(stream)) + shared_ptr(`arrow::Schema`, ipc___ReadSchema_InputStream(stream)) } #' @export `read_schema.raw` <- function(stream, ...) { - read_schema(buffer(stream), ...) + stream <- close_on_exit(BufferReader(stream)) + shared_ptr(`arrow::Schema`, ipc___ReadSchema_InputStream(stream)) } diff --git a/r/R/Table.R b/r/R/Table.R index e7d4545c1f646..8972634d59f1d 100644 --- a/r/R/Table.R +++ b/r/R/Table.R @@ -16,12 +16,21 @@ # under the License. #' @include R6.R - +#' +#' @title class arrow::Table +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__Table +#' @name arrow__Table `arrow::Table` <- R6Class("arrow::Table", inherit = `arrow::Object`, public = list( - num_columns = function() Table__num_columns(self), - num_rows = function() Table__num_rows(self), - schema = function() shared_ptr(`arrow::Schema`, Table__schema(self)), column = function(i) shared_ptr(`arrow::Column`, Table__column(self, i)), serialize = function(output_stream, ...) write_table(self, output_stream, ...), @@ -29,9 +38,16 @@ cast = function(target_schema, safe = TRUE, options = cast_options(safe)) { assert_that(inherits(target_schema, "arrow::Schema")) assert_that(inherits(options, "arrow::compute::CastOptions")) - assert_that(identical(self$schema()$names, target_schema$names), msg = "incompatible schemas") + assert_that(identical(self$schema$names, target_schema$names), msg = "incompatible schemas") shared_ptr(`arrow::Table`, Table__cast(self, target_schema, options)) } + ), + + active = list( + num_columns = function() Table__num_columns(self), + num_rows = function() Table__num_rows(self), + schema = function() shared_ptr(`arrow::Schema`, Table__schema(self)), + columns = function() map(Table__columns(self), shared_ptr, class = `arrow::Column`) ) ) @@ -48,14 +64,3 @@ table <- function(.data){ `as_tibble.arrow::Table` <- function(x, ...){ Table__to_dataframe(x) } - -#' Read an tibble from an arrow::Table on disk -#' -#' @param stream input stream -#' -#' @return a [tibble::tibble] -#' -#' @export -read_arrow <- function(stream){ - as_tibble(read_table(stream)) -} diff --git a/r/R/array.R b/r/R/array.R index 2d434f9a2218c..63fdb4e0f6119 100644 --- a/r/R/array.R +++ b/r/R/array.R @@ -17,18 +17,65 @@ #' @include R6.R +#' @title class arrow::Array +#' +#' Array base type. Immutable data array with some logical type and some length. +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Usage: +#' +#' ``` +#' a <- array(...) 
+#'
+#' a$IsNull(i)
+#' a$IsValid(i)
+#' a$length() or length(a)
+#' a$offset
+#' a$null_count
+#' a$type
+#' a$type_id()
+#' a$Equals(b)
+#' a$ApproxEquals(b)
+#' a$as_vector()
+#' a$ToString()
+#' a$Slice(offset, length = NULL)
+#' a$RangeEquals(other, start_idx, end_idx, other_start_idx)
+#'
+#' print(a)
+#' a == a
+#' ```
+#'
+#' @section Methods:
+#'
+#' - `$IsNull(i)`: Return true if value at index is null. Does not boundscheck
+#' - `$IsValid(i)`: Return true if value at index is valid. Does not boundscheck
+#' - `$length()`: Size in the number of elements this array contains
+#' - `$offset`: A relative position into another array's data, to enable zero-copy slicing
+#' - `$null_count`: The number of null entries in the array
+#' - `$type`: logical type of data
+#' - `$type_id()`: type id
+#' - `$Equals(other)` : is this array equal to `other`
+#' - `$ApproxEquals(other)` : is this array approximately equal to `other`
+#' - `$data()`: return the underlying [arrow::ArrayData][arrow__ArrayData]
+#' - `$as_vector()`: convert to an R vector
+#' - `$ToString()`: string representation of the array
+#' - `$Slice(offset, length = NULL)` : Construct a zero-copy slice of the array with the indicated offset and length. If length is `NULL`, the slice goes until the end of the array.
+#' - `$RangeEquals(other, start_idx, end_idx, other_start_idx)` : compare the values in the slice `[start_idx, end_idx)` with the values of `other`, starting at `other_start_idx`
+#'
+#' @rdname arrow__Array
+#' @name arrow__Array
 `arrow::Array` <- R6Class("arrow::Array",
   inherit = `arrow::Object`,
   public = list(
     IsNull = function(i) Array__IsNull(self, i),
     IsValid = function(i) Array__IsValid(self, i),
     length = function() Array__length(self),
-    offset = function() Array__offset(self),
-    null_count = function() Array__null_count(self),
-    type = function() `arrow::DataType`$dispatch(Array__type(self)),
     type_id = function() Array__type_id(self),
     Equals = function(other) Array__Equals(self, other),
-    ApproxEquals = function(othet) Array__ApproxEquals(self, other),
+    ApproxEquals = function(other) Array__ApproxEquals(self, other),
     data = function() shared_ptr(`arrow::ArrayData`, Array__data(self)),
     as_vector = function() Array__as_vector(self),
     ToString = function() Array__ToString(self),
@@ -48,6 +95,11 @@
       assert_that(inherits(options, "arrow::compute::CastOptions"))
       `arrow::Array`$dispatch(Array__cast(self, target_type, options))
     }
+  ),
+  active = list(
+    null_count = function() Array__null_count(self),
+    offset = function() Array__offset(self),
+    type = function() `arrow::DataType`$dispatch(Array__type(self))
   )
 )
@@ -65,7 +117,7 @@
 #' @export
 `==.arrow::Array` <- function(x, y) x$Equals(y)

-#' create an arrow::Array from an R vector
+#' create an [arrow::Array][arrow__Array] from an R vector
 #'
 #' @param \dots Vectors to coerce
 #' @param type currently ignored
diff --git a/r/R/buffer.R b/r/R/buffer.R
index 9684a9729130f..2fecd0e4fc64b 100644
--- a/r/R/buffer.R
+++ b/r/R/buffer.R
@@ -18,21 +18,38 @@
 #' @include R6.R
 #' @include enums.R

+#' @title class arrow::Buffer
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' - `$is_mutable` : is this buffer mutable?
+#' - `$ZeroPadding()` : zero the padding bytes at the end of the buffer
+#' - `$size` : size of the buffer, in bytes
+#' - `$capacity`: possible capacity of the buffer, in bytes
+#'
+#' @rdname arrow__Buffer
+#' @name arrow__Buffer
 `arrow::Buffer` <- R6Class("arrow::Buffer",
   inherit = `arrow::Object`,
   public = list(
+    ZeroPadding = function() Buffer__ZeroPadding(self)
+  ),
+
+  active = list(
     is_mutable = function() Buffer__is_mutable(self),
-    ZeroPadding = function() Buffer__ZeroPadding(self),
     size = function() Buffer__size(self),
     capacity = function() Buffer__capacity(self)
   )
 )

-`arrow::MutableBuffer`
<- R6Class("arrow::Buffer", inherit = `arrow::Buffer`) - -#' Create a buffer from an R object +#' Create a [arrow::Buffer][arrow__Buffer] from an R object #' -#' @param x R object -#' @return an instance of `arrow::Buffer` that borrows memory from `x` +#' @param x R object. Only raw, numeric and integer vectors are currently supported +#' +#' @return an instance of [arrow::Buffer][arrow__Buffer] that borrows memory from `x` #' #' @export buffer <- function(x){ @@ -44,7 +61,6 @@ buffer.default <- function(x) { stop("cannot convert to Buffer") } - #' @export buffer.raw <- function(x) { shared_ptr(`arrow::Buffer`, r___RBuffer__initialize(x)) diff --git a/r/R/dictionary.R b/r/R/dictionary.R index d8a71d92a9f69..3c3758df303e8 100644 --- a/r/R/dictionary.R +++ b/r/R/dictionary.R @@ -17,15 +17,27 @@ #' @include R6.R +#' @title class arrow::DictionaryType +#' +#' @usage NULL +#' @format NULL +#' @docType class +#' +#' @section Methods: +#' +#' TODO +#' +#' @rdname arrow__DictionaryType +#' @name arrow__DictionaryType `arrow::DictionaryType` <- R6Class("arrow::DictionaryType", inherit = `arrow::FixedWidthType`, - public = list( + + active = list( index_type = function() `arrow::DataType`$dispatch(DictionaryType__index_type(self)), - name = function() DictionaryType__name(self), dictionary = function() shared_ptr(`arrow::Array`, DictionaryType__dictionary(self)), + name = function() DictionaryType__name(self), ordered = function() DictionaryType__ordered(self) ) - ) #' dictionary type factory @@ -34,6 +46,8 @@ #' @param values values array, typically an arrow array of strings #' @param ordered Is this an ordered dictionary #' +#' @return a [arrow::DictionaryType][arrow__DictionaryType] +#' #' @export dictionary <- function(type, values, ordered = FALSE) { assert_that( diff --git a/r/R/feather.R b/r/R/feather.R index c36c571bd4bd4..bae71d31bc1e5 100644 --- a/r/R/feather.R +++ b/r/R/feather.R @@ -100,7 +100,7 @@ write_feather_RecordBatch <- function(data, stream) { #' @export #' @method write_feather_RecordBatch fs_path `write_feather_RecordBatch.fs_path` <- function(data, stream) { - file_stream <- close_on_exit(file_output_stream(stream)) + file_stream <- close_on_exit(FileOutputStream(stream)) `write_feather_RecordBatch.arrow::io::OutputStream`(data, file_stream) } @@ -133,7 +133,7 @@ feather_table_reader.character <- function(file, mmap = TRUE, ...) { #' @export feather_table_reader.fs_path <- function(file, mmap = TRUE, ...) { - stream <- if(isTRUE(mmap)) mmap_open(file, ...) else file_open(file, ...) + stream <- if(isTRUE(mmap)) mmap_open(file, ...) else ReadableFile(file, ...) 
  feather_table_reader(stream)
 }
diff --git a/r/R/io.R b/r/R/io.R
index d4534927412bd..b772be30acf07 100644
--- a/r/R/io.R
+++ b/r/R/io.R
@@ -19,45 +19,151 @@
 #' @include enums.R
 #' @include buffer.R

-`arrow::io::Readable` <- R6Class("arrow::io::Readable", inherit = `arrow::Object`,
-  public = list(
-    Read = function(nbytes) shared_ptr(`arrow::Buffer`, io___Readable__Read(self, nbytes))
-  )
-)
-
-`arrow::io::InputStream` <- R6Class("arrow::io::InputStream", inherit = `arrow::io::Readable`,
-  public = list(
-    Close = function() io___InputStream__Close(self)
-  )
-)
+# OutputStream ------------------------------------------------------------

 `arrow::io::Writable` <- R6Class("arrow::io::Writable", inherit = `arrow::Object`)

+#' @title class arrow::io::OutputStream
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' - `$close()`: close the stream
+#'
+#' @rdname arrow__io__OutputStream
+#' @name arrow__io__OutputStream
 `arrow::io::OutputStream` <- R6Class("arrow::io::OutputStream", inherit = `arrow::io::Writable`,
   public = list(
-    Close = function() io___OutputStream__Close(self)
+    close = function() io___OutputStream__Close(self)
   )
 )

+#' @title class arrow::io::FileOutputStream
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__FileOutputStream
+#' @name arrow__io__FileOutputStream
 `arrow::io::FileOutputStream` <- R6Class("arrow::io::FileOutputStream", inherit = `arrow::io::OutputStream`)

+#' @title class arrow::io::MockOutputStream
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__MockOutputStream
+#' @name arrow__io__MockOutputStream
 `arrow::io::MockOutputStream` <- R6Class("arrow::io::MockOutputStream", inherit = `arrow::io::OutputStream`,
   public = list(
     GetExtentBytesWritten = function() io___MockOutputStream__GetExtentBytesWritten(self)
   )
 )

+#' @title class arrow::io::BufferOutputStream
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__BufferOutputStream
+#' @name arrow__io__BufferOutputStream
 `arrow::io::BufferOutputStream` <- R6Class("arrow::io::BufferOutputStream", inherit = `arrow::io::OutputStream`,
   public = list(
     capacity = function() io___BufferOutputStream__capacity(self),
-    Finish = function() shared_ptr(`arrow::Buffer`, io___BufferOutputStream__Finish(self)),
+    getvalue = function() shared_ptr(`arrow::Buffer`, io___BufferOutputStream__Finish(self)),
+
+    Write = function(bytes) io___BufferOutputStream__Write(self, bytes),
     Tell = function() io___BufferOutputStream__Tell(self)
   )
 )

+#' @title class arrow::io::FixedSizeBufferWriter
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__FixedSizeBufferWriter
+#' @name arrow__io__FixedSizeBufferWriter
 `arrow::io::FixedSizeBufferWriter` <- R6Class("arrow::io::FixedSizeBufferWriter", inherit = `arrow::io::OutputStream`)
+
+# InputStream -------------------------------------------------------------
+
+#' @title class arrow::io::Readable
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__Readable
+#' @name arrow__io__Readable
+`arrow::io::Readable` <- R6Class("arrow::io::Readable", inherit = `arrow::Object`,
+  public = list(
+    Read = function(nbytes) shared_ptr(`arrow::Buffer`, io___Readable__Read(self, nbytes))
+  )
+)
+
+#' @title class arrow::io::InputStream
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__InputStream
+#' @name arrow__io__InputStream
+`arrow::io::InputStream` <- R6Class("arrow::io::InputStream", inherit = `arrow::io::Readable`,
+  public = list(
+    close = function() io___InputStream__Close(self)
+  )
+)
+
+#' @title class arrow::io::RandomAccessFile
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__RandomAccessFile
+#' @name arrow__io__RandomAccessFile
 `arrow::io::RandomAccessFile` <- R6Class("arrow::io::RandomAccessFile", inherit = `arrow::io::InputStream`,
   public = list(
     GetSize = function() io___RandomAccessFile__GetSize(self),
@@ -67,94 +173,159 @@
   )
 )

+#' @title class arrow::io::MemoryMappedFile
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @seealso [mmap_open()], [mmap_create()]
+#'
+#'
+#' @rdname arrow__io__MemoryMappedFile
+#' @name arrow__io__MemoryMappedFile
 `arrow::io::MemoryMappedFile` <- R6Class("arrow::io::MemoryMappedFile", inherit = `arrow::io::RandomAccessFile`,
   public = list(
     Resize = function(size) io___MemoryMappedFile__Resize(self, size)
   )
 )

+#' @title class arrow::io::ReadableFile
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__ReadableFile
+#' @name arrow__io__ReadableFile
 `arrow::io::ReadableFile` <- R6Class("arrow::io::ReadableFile", inherit = `arrow::io::RandomAccessFile`)

-`arrow::io::BufferReader` <- R6Class("arrow::io::BufferReader", inherit = `arrow::io::RandomAccessFile`)
+#' @title class arrow::io::BufferReader
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__io__BufferReader
+#' @name arrow__io__BufferReader
+`arrow::io::BufferReader` <- R6Class("arrow::io::BufferReader", inherit = `arrow::io::RandomAccessFile`)

 #' Create a new read/write memory mapped file of a given size
 #'
 #' @param path file path
 #' @param size size in bytes
-#' @param mode file mode (read/write/readwrite)
-#' @param buffer an `arrow::Buffer`, typically created by [buffer()]
-#' @param initial_capacity initial capacity for the buffer output stream
 #'
-#' @rdname io
+#' @return a [arrow::io::MemoryMappedFile][arrow__io__MemoryMappedFile]
+#'
 #' @export
-mmap_create <- `arrow::io::MemoryMappedFile`$create <- function(path, size) {
+mmap_create <- function(path, size) {
   shared_ptr(`arrow::io::MemoryMappedFile`, io___MemoryMappedFile__Create(fs::path_abs(path), size))
 }

-#' @rdname io
+#' Open a memory mapped file
+#'
+#' @param path file path
+#' @param mode file mode (read/write/readwrite)
+#'
 #' @export
-mmap_open <- `arrow::io::MemoryMappedFile`$open <- function(path, mode = c("read", "write", "readwrite")) {
+mmap_open <- function(path, mode = c("read", "write", "readwrite")) {
   mode <- match(match.arg(mode), c("read", "write", "readwrite")) - 1L
   shared_ptr(`arrow::io::MemoryMappedFile`, io___MemoryMappedFile__Open(fs::path_abs(path), mode))
 }

-#' @rdname io
+#' Open a [arrow::io::ReadableFile][arrow__io__ReadableFile]
+#'
+#' @param path file path
+#'
+#' @return a [arrow::io::ReadableFile][arrow__io__ReadableFile]
+#'
 #' @export
-file_open <- `arrow::io::ReadableFile`$open <- function(path) {
+ReadableFile <- function(path) {
   shared_ptr(`arrow::io::ReadableFile`, io___ReadableFile__Open(fs::path_abs(path)))
 }

-#' @rdname io
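# Illustrative usage sketch (an assumption-laden example, not part of the
# patched sources): exercising the file factories defined above. `tf` is a
# hypothetical temporary path; every call used here (mmap_create(),
# ReadableFile(), Read(), close()) is defined in this file as patched.
tf <- tempfile()
mm <- mmap_create(tf, 64)   # create a 64-byte read/write memory map
mm$close()                  # close() is inherited from arrow::io::InputStream
f <- ReadableFile(tf)       # replaces the old file_open()
buf <- f$Read(16)           # returns an arrow::Buffer of 16 bytes
f$close()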
+#' Open a [arrow::io::FileOutputStream][arrow__io__FileOutputStream] +#' +#' @param path file path +#' +#' @return a [arrow::io::FileOutputStream][arrow__io__FileOutputStream] +#' #' @export -file_output_stream <- function(path) { +FileOutputStream <- function(path) { shared_ptr(`arrow::io::FileOutputStream`, io___FileOutputStream__Open(path)) } -#' @rdname io +#' Open a [arrow::io::MockOutputStream][arrow__io__MockOutputStream] +#' +#' @return a [arrow::io::MockOutputStream][arrow__io__MockOutputStream] +#' #' @export -mock_output_stream <- function() { +MockOutputStream <- function() { shared_ptr(`arrow::io::MockOutputStream`, io___MockOutputStream__initialize()) } -#' @rdname io +#' Open a [arrow::io::BufferOutputStream][arrow__io__BufferOutputStream] +#' +#' @param initial_capacity initial capacity +#' +#' @return a [arrow::io::BufferOutputStream][arrow__io__BufferOutputStream] +#' #' @export -buffer_output_stream <- function(initial_capacity = 0L) { +BufferOutputStream <- function(initial_capacity = 0L) { shared_ptr(`arrow::io::BufferOutputStream`, io___BufferOutputStream__Create(initial_capacity)) } -#' @rdname io +#' Open a [arrow::io::FixedSizeBufferWriter][arrow__io__FixedSizeBufferWriter] +#' +#' @param buffer [arrow::Buffer][arrow__Buffer] or something [buffer()] can handle +#' +#' @return a [arrow::io::BufferOutputStream][arrow__io__BufferOutputStream] +#' #' @export -fixed_size_buffer_writer <- function(buffer){ - UseMethod("fixed_size_buffer_writer") +FixedSizeBufferWriter <- function(buffer){ + UseMethod("FixedSizeBufferWriter") } #' @export -fixed_size_buffer_writer.default <- function(buffer){ - fixed_size_buffer_writer(buffer(buffer)) +FixedSizeBufferWriter.default <- function(buffer){ + FixedSizeBufferWriter(buffer(buffer)) } #' @export -`fixed_size_buffer_writer.arrow::Buffer` <- function(buffer){ - assert_that(buffer$is_mutable()) +`FixedSizeBufferWriter.arrow::Buffer` <- function(buffer){ + assert_that(buffer$is_mutable) shared_ptr(`arrow::io::FixedSizeBufferWriter`, io___FixedSizeBufferWriter__initialize(buffer)) } -#' Create a `arrow::BufferReader` +#' Create a [arrow::io::BufferReader][arrow__io__BufferReader] #' #' @param x R object to treat as a buffer or a buffer created by [buffer()] #' #' @export -buffer_reader <- function(x) { - UseMethod("buffer_reader") +BufferReader <- function(x) { + UseMethod("BufferReader") } #' @export -`buffer_reader.arrow::Buffer` <- function(x) { - shared_ptr(`arrow::io::BufferReader`, io___BufferReader__initialize(x)) +BufferReader.default <- function(x) { + BufferReader(buffer(x)) } #' @export -buffer_reader.default <- function(x) { - buffer_reader(buffer(x)) +`BufferReader.arrow::Buffer` <- function(x) { + shared_ptr(`arrow::io::BufferReader`, io___BufferReader__initialize(x)) } - diff --git a/r/R/memory_pool.R b/r/R/memory_pool.R index 49f65d2a1f397..88c2c7bc1980e 100644 --- a/r/R/memory_pool.R +++ b/r/R/memory_pool.R @@ -16,7 +16,19 @@ # under the License. 
 #' @include R6.R
-
+#'
+#' @title class arrow::MemoryPool
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__MemoryPool
+#' @name arrow__MemoryPool
 `arrow::MemoryPool` <- R6Class("arrow::MemoryPool",
   inherit = `arrow::Object`,
   public = list(
@@ -28,6 +40,10 @@
   )
 )

+#' default [arrow::MemoryPool][arrow__MemoryPool]
+#'
+#' @return the default [arrow::MemoryPool][arrow__MemoryPool]
+#' @export
 default_memory_pool <- function() {
   shared_ptr(`arrow::MemoryPool`, MemoryPool__default())
 }
diff --git a/r/R/message.R b/r/R/message.R
index f31fb9a53b7ab..93c90c097639a 100644
--- a/r/R/message.R
+++ b/r/R/message.R
@@ -17,6 +17,18 @@
 #' @include R6.R

+#' @title class arrow::ipc::Message
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__ipc__Message
+#' @name arrow__ipc__Message
 `arrow::ipc::Message` <- R6Class("arrow::ipc::Message", inherit = `arrow::Object`,
   public = list(
     Equals = function(other){
@@ -24,10 +36,10 @@
       ipc___Message__Equals(self, other)
     },
     body_length = function() ipc___Message__body_length(self),
-    Verify = function() ipc___Message__Verify(self),
-    type = function() ipc___Message__type(self)
+    Verify = function() ipc___Message__Verify(self)
   ),
   active = list(
+    type = function() ipc___Message__type(self),
     metadata = function() shared_ptr(`arrow::Buffer`, ipc___Message__metadata(self)),
     body = function() shared_ptr(`arrow::Buffer`, ipc___Message__body(self))
   )
@@ -36,51 +48,58 @@
 #' @export
 `==.arrow::ipc::Message` <- function(x, y) x$Equals(y)

+#' @title class arrow::ipc::MessageReader
+#'
+#' @usage NULL
+#' @format NULL
+#' @docType class
+#'
+#' @section Methods:
+#'
+#' TODO
+#'
+#' @rdname arrow__ipc__MessageReader
+#' @name arrow__ipc__MessageReader
 `arrow::ipc::MessageReader` <- R6Class("arrow::ipc::MessageReader", inherit = `arrow::Object`,
   public = list(
     ReadNextMessage = function() unique_ptr(`arrow::ipc::Message`, ipc___MessageReader__ReadNextMessage(self))
   )
 )

-#' Read a Message from a stream
+#' Open a MessageReader that reads from a stream
 #'
 #' @param stream an InputStream
 #'
 #' @export
-read_message <- function(stream) {
-  UseMethod("read_message")
+MessageReader <- function(stream) {
+  UseMethod("MessageReader")
 }

 #' @export
-read_message.default <- function(stream) {
-  stop("unsupported")
+MessageReader.default <- function(stream) {
+  MessageReader(BufferReader(stream))
 }

 #' @export
-`read_message.arrow::io::InputStream` <- function(stream) {
-  unique_ptr(`arrow::ipc::Message`, ipc___ReadMessage(stream) )
+`MessageReader.arrow::io::InputStream` <- function(stream) {
+  unique_ptr(`arrow::ipc::MessageReader`, ipc___MessageReader__Open(stream))
 }

-#' Open a MessageReader that reads from a stream
+#' Read a Message from a stream
 #'
 #' @param stream an InputStream
 #'
 #' @export
-message_reader <- function(stream) {
-  UseMethod("message_reader")
-}
-
-#' @export
-message_reader.default <- function(stream) {
-  stop("unsupported")
+read_message <- function(stream) {
+  UseMethod("read_message")
 }

 #' @export
-message_reader.raw <- function(stream) {
-  message_reader(buffer_reader(stream))
+`read_message.arrow::io::InputStream` <- function(stream) {
+  unique_ptr(`arrow::ipc::Message`, ipc___ReadMessage(stream))
 }

 #' @export
-`message_reader.arrow::io::InputStream` <- function(stream) {
-  unique_ptr(`arrow::ipc::MessageReader`, ipc___MessageReader__Open(stream))
+`read_message.arrow::ipc::MessageReader` <- function(stream) {
+  stream$ReadNextMessage()
 }
diff --git a/r/R/on_exit.R b/r/R/on_exit.R
index 9387169b8be9f..52b017404deb8 100644
--- a/r/R/on_exit.R
+++ b/r/R/on_exit.R
@@ -17,7 +17,7 @@
 #' @importFrom withr defer_parent
 close_on_exit <- function(x, ...){
-  defer_parent(x$Close(), ...)
+  defer_parent(x$close(), ...)
   x
 }
diff --git a/r/R/read_record_batch.R b/r/R/read_record_batch.R
new file mode 100644
index 0000000000000..967ac5b7650a9
--- /dev/null
+++ b/r/R/read_record_batch.R
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#' Read an [arrow::RecordBatch][arrow__RecordBatch] from an encapsulated IPC message, given a known [arrow::Schema][arrow__Schema]
+#'
+#' @param obj an [arrow::ipc::Message][arrow__ipc__Message], an [arrow::io::InputStream][arrow__io__InputStream], an [arrow::Buffer][arrow__Buffer], or a raw vector
+#' @param schema an [arrow::Schema][arrow__Schema]
+#'
+#' @return an [arrow::RecordBatch][arrow__RecordBatch]
+#'
+#' @export
+read_record_batch <- function(obj, schema){
+  UseMethod("read_record_batch")
+}
+
+#' @export
+`read_record_batch.arrow::ipc::Message` <- function(obj, schema) {
+  assert_that(inherits(schema, "arrow::Schema"))
+  shared_ptr(`arrow::RecordBatch`, ipc___ReadRecordBatch__Message__Schema(obj, schema))
+}
+
+#' @export
+`read_record_batch.arrow::io::InputStream` <- function(obj, schema) {
+  assert_that(inherits(schema, "arrow::Schema"))
+  shared_ptr(`arrow::RecordBatch`, ipc___ReadRecordBatch__InputStream__Schema(obj, schema))
+}
+
+#' @export
+read_record_batch.raw <- function(obj, schema){
+  stream <- close_on_exit(BufferReader(obj))
+  read_record_batch(stream, schema)
+}
+
+#' @export
+`read_record_batch.arrow::Buffer` <- function(obj, schema){
+  stream <- close_on_exit(BufferReader(obj))
+  read_record_batch(stream, schema)
+}
diff --git a/r/R/read_table.R b/r/R/read_table.R
new file mode 100644
index 0000000000000..a540a42173556
--- /dev/null
+++ b/r/R/read_table.R
@@ -0,0 +1,86 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
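# Illustrative usage sketch (an assumption-laden example, not part of the
# patched sources): an in-memory round trip through the streaming format,
# using only the generics defined below and in write_arrow.R. `df` is a
# hypothetical toy data frame.
df <- data.frame(x = 1:10, y = rnorm(10))
bytes <- write_arrow(df, raw())   # raw-vector method: returns serialized bytes
tab <- read_table(bytes)          # back as an arrow::Table
tib <- read_arrow(bytes)          # or directly as a tibble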
+
+#' Read an [arrow::Table][arrow__Table] from a stream
+#'
+#' @param stream where to read the table from. Can be:
+#'
+#' - an [arrow::ipc::RecordBatchFileReader][arrow__ipc__RecordBatchFileReader]:
+#'   read an [arrow::Table][arrow__Table]
+#'   from all the record batches in the reader
+#'
+#' - an [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader]:
+#'   read an [arrow::Table][arrow__Table] from the remaining record batches
+#'   in the reader
+#'
+#' - a string or [file path][fs::path_abs()]: interprets the file as the Arrow
+#'   binary file format, and uses an [arrow::ipc::RecordBatchFileReader][arrow__ipc__RecordBatchFileReader]
+#'   to process it.
+#'
+#' - a raw vector: read using an [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader]
+#'
+#' @return
+#'
+#' - `read_table` returns an [arrow::Table][arrow__Table]
+#' - `read_arrow` returns a [tibble::tibble()]
+#'
+#' @details
+#'
+#' The methods using [arrow::ipc::RecordBatchFileReader][arrow__ipc__RecordBatchFileReader] and
+#' [arrow::ipc::RecordBatchStreamReader][arrow__ipc__RecordBatchStreamReader] offer the most
+#' flexibility. The other methods are for convenience.
+#'
+#' @export
+read_table <- function(stream){
+  UseMethod("read_table")
+}
+
+#' @export
+`read_table.arrow::ipc::RecordBatchFileReader` <- function(stream) {
+  shared_ptr(`arrow::Table`, Table__from_RecordBatchFileReader(stream))
+}
+
+#' @export
+`read_table.arrow::ipc::RecordBatchStreamReader` <- function(stream) {
+  shared_ptr(`arrow::Table`, Table__from_RecordBatchStreamReader(stream))
+}
+
+#' @export
+read_table.character <- function(stream){
+  assert_that(length(stream) == 1L)
+  read_table(fs::path_abs(stream))
+}
+
+#' @export
+read_table.fs_path <- function(stream) {
+  stream <- close_on_exit(ReadableFile(stream))
+  batch_reader <- close_on_exit(RecordBatchFileReader(stream))
+  shared_ptr(`arrow::Table`, Table__from_RecordBatchFileReader(batch_reader))
+}
+
+#' @export
+`read_table.raw` <- function(stream) {
+  stream <- close_on_exit(BufferReader(stream))
+  batch_reader <- close_on_exit(RecordBatchStreamReader(stream))
+  shared_ptr(`arrow::Table`, Table__from_RecordBatchStreamReader(batch_reader))
+}
+
+#' @rdname read_table
+#' @export
+read_arrow <- function(stream){
+  as_tibble(read_table(stream))
+}
diff --git a/r/R/write_arrow.R b/r/R/write_arrow.R
new file mode 100644
index 0000000000000..5fc684771e5f2
--- /dev/null
+++ b/r/R/write_arrow.R
@@ -0,0 +1,94 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
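# Illustrative usage sketch (an assumption-laden example, not part of the
# patched sources): the explicit writer classes that write_arrow() wraps,
# as defined in RecordBatchWriter.R and io.R. `tf` is a hypothetical
# temporary file.
tf <- tempfile()
tab <- arrow::table(data.frame(x = 1:3))   # data.frame -> arrow::Table
writer <- RecordBatchFileWriter(FileOutputStream(tf), tab$schema)
writer$write_table(tab)   # binary file format on disk
writer$close()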
+
+to_arrow <- function(x) {
+  UseMethod("to_arrow")
+}
+
+`to_arrow.arrow::RecordBatch` <- function(x) x
+`to_arrow.arrow::Table` <- function(x) x
+`to_arrow.data.frame` <- function(x) table(x)
+
+#' Serialize an [arrow::Table][arrow__Table], an [arrow::RecordBatch][arrow__RecordBatch], or a
+#' data frame to either the streaming format or the binary file format
+#'
+#' @param x an [arrow::Table][arrow__Table], an [arrow::RecordBatch][arrow__RecordBatch] or a data.frame
+#'
+#' @param stream where to serialize to
+#'
+#' - A [arrow::ipc::RecordBatchWriter][arrow__ipc__RecordBatchWriter]: its `$write()`
+#'   method is used to write `x`. The stream is left open. This uses the streaming format
+#'   or the binary file format depending on the type of the writer.
+#'
+#' - A string or [file path][fs::path_abs()]: `x` is serialized with
+#'   a [arrow::ipc::RecordBatchFileWriter][arrow__ipc__RecordBatchFileWriter], i.e.
+#'   using the binary file format.
+#'
+#' - A raw vector: typically of length zero (its content is ignored; it is only used for
+#'   dispatch). `x` is serialized using the streaming format, i.e. using the
+#'   [arrow::ipc::RecordBatchStreamWriter][arrow__ipc__RecordBatchStreamWriter]
+#'
+#' @param ... extra parameters, currently ignored
+#'
+#' @details
+#' `write_arrow` is a convenience function; the classes [arrow::ipc::RecordBatchFileWriter][arrow__ipc__RecordBatchFileWriter]
+#' and [arrow::ipc::RecordBatchStreamWriter][arrow__ipc__RecordBatchStreamWriter] can be used for more flexibility.
+#'
+#' @export
+write_arrow <- function(x, stream, ...) {
+  UseMethod("write_arrow", stream)
+}
+
+#' @export
+`write_arrow.arrow::ipc::RecordBatchWriter` <- function(x, stream, ...){
+  stream$write(x)
+}
+
+#' @export
+`write_arrow.character` <- function(x, stream, ...) {
+  write_arrow(x, fs::path_abs(stream), ...)
+}
+
+#' @export
+`write_arrow.fs_path` <- function(x, stream, ...) {
+  assert_that(length(stream) == 1L)
+  x <- to_arrow(x)
+  file_stream <- close_on_exit(FileOutputStream(stream))
+  file_writer <- close_on_exit(RecordBatchFileWriter(file_stream, x$schema))
+  write_arrow(x, file_writer, ...)
+}
+
+#' @export
+`write_arrow.raw` <- function(x, stream, ...) {
+  x <- to_arrow(x)
+  schema <- x$schema
+
+  # how many bytes do we need
+  mock_stream <- MockOutputStream()
+  writer <- RecordBatchStreamWriter(mock_stream, schema)
+  writer$write(x)
+  writer$close()
+  n <- mock_stream$GetExtentBytesWritten()
+
+  # now that we know the size, stream in a buffer backed by an R raw vector
+  bytes <- raw(n)
+  buffer_writer <- FixedSizeBufferWriter(buffer(bytes))
+  writer <- RecordBatchStreamWriter(buffer_writer, schema)
+  writer$write(x)
+  writer$close()
+
+  bytes
+}
diff --git a/r/README.Rmd b/r/README.Rmd
index 204a9f9d566ed..2c51d01c0f00f 100644
--- a/r/README.Rmd
+++ b/r/README.Rmd
@@ -46,9 +46,9 @@ tf <- tempfile()
 # write arrow::Table to file
 (tib <- tibble(x = 1:10, y = rnorm(10)))
-arrow::write_arrow(tib, tf)
+# arrow::write_arrow(tib, tf)

-# read it back with pyarrow
-pa <- import("pyarrow")
-as_tibble(pa$open_file(tf)$read_pandas())
+# # read it back with pyarrow
+# pa <- import("pyarrow")
+# as_tibble(pa$open_file(tf)$read_pandas())
 ```
diff --git a/r/configure b/r/configure
index 69f04632a2f5b..28f6a73ac7ef5 100755
--- a/r/configure
+++ b/r/configure
@@ -91,7 +91,7 @@ if [ $?
-ne 0 ]; then fi # Write to Makevars -sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars +sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" -e "s|@visibility@|$C_VISIBILITY|" src/Makevars.in > src/Makevars # Success exit 0 diff --git a/r/data-raw/test.R b/r/data-raw/test.R deleted file mode 100644 index 516af58616ef9..0000000000000 --- a/r/data-raw/test.R +++ /dev/null @@ -1,85 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -library(tidyverse) -library(arrow) - -# meta data -(t1 <- int32()) -(t2 <- utf8()) -(t5 <- timestamp(unit = TimeUnit$MILLI)) - -# lists -list_of(t1) - -# shema -schema(x = int32(), y = float64()) - -# :scream_cat: -# -# pa.schema( -# [ -# pa.field('x', pa.int32()), -# pa.field('y', pa.float64()) -# ] -# ) -# - -schema(x = int32(), y = list_of(float64())) - -#------- arrays - -# arr = pa.array([1, 2, 3]) -arr <- array(1:3, 5:80) -arr -arr$as_vector() - -#------- read_arrow / stream -tbl <- tibble(x=1:10, y=rnorm(10)) -write_arrow(tbl, "/tmp/test.arrow") -readr::write_rds(tbl, "/tmp/test.rds") -fs::file_info(c("/tmp/test.arrow", "/tmp/test.rds")) - -(data <- read_arrow("/tmp/test.arrow")) - -# tibble <-> arrow::RecordBatch -(batch <- record_batch(tbl)) -batch$num_columns() -batch$num_rows() -write_arrow(batch, "/tmp/test") -readBin("/tmp/test", what = raw(), n = 1000) -batch$schema() -all.equal(tbl, data) - -batch <- read_record_batch("/tmp/test") -batch$schema() -batch$column(0) -batch$column(0)$as_vector() - -as_tibble(batch) - -# tibble <-> arrow::Table -tab <- arrow::table(tbl) -tab -tab$schema() -tab$num_columns() -tab$num_rows() - -# read_arrow, stream -tbl <- tibble(x = rnorm(20), y = seq_len(20)) -write_arrow(tbl, tf) - diff --git a/r/man/BufferOutputStream.Rd b/r/man/BufferOutputStream.Rd new file mode 100644 index 0000000000000..1776f995930fc --- /dev/null +++ b/r/man/BufferOutputStream.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{BufferOutputStream} +\alias{BufferOutputStream} +\title{Open a \link[=arrow__io__BufferOutputStream]{arrow::io::BufferOutputStream}} +\usage{ +BufferOutputStream(initial_capacity = 0L) +} +\arguments{ +\item{initial_capacity}{initial capacity} +} +\value{ +a \link[=arrow__io__BufferOutputStream]{arrow::io::BufferOutputStream} +} +\description{ +Open a \link[=arrow__io__BufferOutputStream]{arrow::io::BufferOutputStream} +} diff --git a/r/man/buffer_reader.Rd b/r/man/BufferReader.Rd similarity index 52% rename from r/man/buffer_reader.Rd rename to r/man/BufferReader.Rd index 3b814fb00b19f..ea5dd790cddcb 100644 --- a/r/man/buffer_reader.Rd +++ b/r/man/BufferReader.Rd @@ -1,14 +1,14 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/io.R 
-\name{buffer_reader}
-\alias{buffer_reader}
-\title{Create a \code{arrow::BufferReader}}
+\name{BufferReader}
+\alias{BufferReader}
+\title{Create a \link[=arrow__io__BufferReader]{arrow::io::BufferReader}}
 \usage{
-buffer_reader(x)
+BufferReader(x)
 }
 \arguments{
 \item{x}{R object to treat as a buffer or a buffer created by \code{\link[=buffer]{buffer()}}}
 }
 \description{
-Create a \code{arrow::BufferReader}
+Create a \link[=arrow__io__BufferReader]{arrow::io::BufferReader}
 }
diff --git a/r/man/FileOutputStream.Rd b/r/man/FileOutputStream.Rd
new file mode 100644
index 0000000000000..4155d349d1a64
--- /dev/null
+++ b/r/man/FileOutputStream.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{FileOutputStream}
+\alias{FileOutputStream}
+\title{Open a \link[=arrow__io__FileOutputStream]{arrow::io::FileOutputStream}}
+\usage{
+FileOutputStream(path)
+}
+\arguments{
+\item{path}{file path}
+}
+\value{
+a \link[=arrow__io__FileOutputStream]{arrow::io::FileOutputStream}
+}
+\description{
+Open a \link[=arrow__io__FileOutputStream]{arrow::io::FileOutputStream}
+}
diff --git a/r/man/FixedSizeBufferWriter.Rd b/r/man/FixedSizeBufferWriter.Rd
new file mode 100644
index 0000000000000..553d61b76e1f4
--- /dev/null
+++ b/r/man/FixedSizeBufferWriter.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{FixedSizeBufferWriter}
+\alias{FixedSizeBufferWriter}
+\title{Open a \link[=arrow__io__FixedSizeBufferWriter]{arrow::io::FixedSizeBufferWriter}}
+\usage{
+FixedSizeBufferWriter(buffer)
+}
+\arguments{
+\item{buffer}{\link[=arrow__Buffer]{arrow::Buffer} or something \code{\link[=buffer]{buffer()}} can handle}
+}
+\value{
+a \link[=arrow__io__FixedSizeBufferWriter]{arrow::io::FixedSizeBufferWriter}
+}
+\description{
+Open a \link[=arrow__io__FixedSizeBufferWriter]{arrow::io::FixedSizeBufferWriter}
+}
diff --git a/r/man/message_reader.Rd b/r/man/MessageReader.Rd
similarity index 79%
rename from r/man/message_reader.Rd
rename to r/man/MessageReader.Rd
index 0d8b1e7ff634e..01589f5d0780b 100644
--- a/r/man/message_reader.Rd
+++ b/r/man/MessageReader.Rd
@@ -1,10 +1,10 @@
 % Generated by roxygen2: do not edit by hand
 % Please edit documentation in R/message.R
-\name{message_reader}
-\alias{message_reader}
+\name{MessageReader}
+\alias{MessageReader}
 \title{Open a MessageReader that reads from a stream}
 \usage{
-message_reader(stream)
+MessageReader(stream)
 }
 \arguments{
 \item{stream}{an InputStream}
diff --git a/r/man/MockOutputStream.Rd b/r/man/MockOutputStream.Rd
new file mode 100644
index 0000000000000..2e3c0b6d3e378
--- /dev/null
+++ b/r/man/MockOutputStream.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{MockOutputStream}
+\alias{MockOutputStream}
+\title{Open a \link[=arrow__io__MockOutputStream]{arrow::io::MockOutputStream}}
+\usage{
+MockOutputStream()
+}
+\value{
+a \link[=arrow__io__MockOutputStream]{arrow::io::MockOutputStream}
+}
+\description{
+Open a \link[=arrow__io__MockOutputStream]{arrow::io::MockOutputStream}
+}
diff --git a/r/man/ReadableFile.Rd b/r/man/ReadableFile.Rd
new file mode 100644
index 0000000000000..11535321bfb6a
--- /dev/null
+++ b/r/man/ReadableFile.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/io.R
+\name{ReadableFile}
+\alias{ReadableFile}
+\title{Open a \link[=arrow__io__ReadableFile]{arrow::io::ReadableFile}}
+\usage{
+ReadableFile(path)
+}
+\arguments{
+\item{path}{file path}
+}
+\value{
+a \link[=arrow__io__ReadableFile]{arrow::io::ReadableFile}
+}
+\description{
+Open a \link[=arrow__io__ReadableFile]{arrow::io::ReadableFile}
+}
diff --git a/r/man/RecordBatchFileReader.Rd b/r/man/RecordBatchFileReader.Rd
new file mode 100644
index 0000000000000..3ea04817e0ee0
--- /dev/null
+++ b/r/man/RecordBatchFileReader.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RecordBatchReader.R
+\name{RecordBatchFileReader}
+\alias{RecordBatchFileReader}
+\title{Create an \link[=arrow__ipc__RecordBatchFileReader]{arrow::ipc::RecordBatchFileReader} from a file}
+\usage{
+RecordBatchFileReader(file)
+}
+\arguments{
+\item{file}{The file to read from. A file path, or an \link[=arrow__io__RandomAccessFile]{arrow::io::RandomAccessFile}}
+}
+\description{
+Create an \link[=arrow__ipc__RecordBatchFileReader]{arrow::ipc::RecordBatchFileReader} from a file
+}
diff --git a/r/man/RecordBatchFileWriter.Rd b/r/man/RecordBatchFileWriter.Rd
new file mode 100644
index 0000000000000..90858304b0ba3
--- /dev/null
+++ b/r/man/RecordBatchFileWriter.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RecordBatchWriter.R
+\name{RecordBatchFileWriter}
+\alias{RecordBatchFileWriter}
+\title{Create a record batch file writer}
+\usage{
+RecordBatchFileWriter(sink, schema)
+}
+\arguments{
+\item{sink}{Where to write. Can either be:
+\itemize{
+\item character vector of length one
+\item a \link[fs:path_abs]{file path}
+\item \link[=arrow__io__OutputStream]{arrow::io::OutputStream}
+}}
+
+\item{schema}{The \link[=arrow__Schema]{arrow::Schema} for data to be written.}
+}
+\value{
+an \link[=arrow__ipc__RecordBatchFileWriter]{arrow::ipc::RecordBatchFileWriter} object
+}
+\description{
+Create a record batch file writer
+}
diff --git a/r/man/RecordBatchStreamReader.Rd b/r/man/RecordBatchStreamReader.Rd
new file mode 100644
index 0000000000000..4bd0e8ccdc55d
--- /dev/null
+++ b/r/man/RecordBatchStreamReader.Rd
@@ -0,0 +1,14 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RecordBatchReader.R
+\name{RecordBatchStreamReader}
+\alias{RecordBatchStreamReader}
+\title{Create a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader} from an input stream}
+\usage{
+RecordBatchStreamReader(stream)
+}
+\arguments{
+\item{stream}{input stream, an \link[=arrow__io__InputStream]{arrow::io::InputStream} or a raw vector}
+}
+\description{
+Create a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader} from an input stream
+}
diff --git a/r/man/RecordBatchStreamWriter.Rd b/r/man/RecordBatchStreamWriter.Rd
new file mode 100644
index 0000000000000..b9183a80719cf
--- /dev/null
+++ b/r/man/RecordBatchStreamWriter.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/RecordBatchWriter.R
+\name{RecordBatchStreamWriter}
+\alias{RecordBatchStreamWriter}
+\title{Writer for the Arrow streaming binary format}
+\usage{
+RecordBatchStreamWriter(sink, schema)
+}
+\arguments{
+\item{sink}{Where to write. Can either be:
+\itemize{
+\item a string (used as a file path, passed to \code{\link[fs:path_abs]{fs::path_abs()}})
+\item a \link[fs:path_abs]{file path} (\code{fs_path} object)
+\item an \link[=arrow__io__OutputStream]{arrow::io::OutputStream}
+}}
+
+\item{schema}{The \link[=arrow__Schema]{arrow::Schema} for data to be written.}
+}
+\value{
+a \link[=arrow__ipc__RecordBatchStreamWriter]{arrow::ipc::RecordBatchStreamWriter}
+}
+\description{
+Writer for the Arrow streaming binary format
+}
diff --git a/r/man/array.Rd b/r/man/array.Rd
index 38bd773be926d..ccdba181db823 100644
--- a/r/man/array.Rd
+++ b/r/man/array.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/array.R
 \name{array}
 \alias{array}
-\title{create an arrow::Array from an R vector}
+\title{create an \link[=arrow__Array]{arrow::Array} from an R vector}
 \usage{
 array(..., type)
 }
@@ -12,5 +12,5 @@ array(..., type)
 \item{type}{currently ignored}
 }
 \description{
-create an arrow::Array from an R vector
+create an \link[=arrow__Array]{arrow::Array} from an R vector
 }
diff --git a/r/man/arrow__Array.Rd b/r/man/arrow__Array.Rd
new file mode 100644
index 0000000000000..b11373d26b368
--- /dev/null
+++ b/r/man/arrow__Array.Rd
@@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/array.R
+\docType{class}
+\name{arrow__Array}
+\alias{arrow__Array}
+\alias{arrow::Array}
+\title{class arrow::Array
+
+Array base type. Immutable data array with some logical type and some length.}
+\description{
+class arrow::Array
+
+Array base type. Immutable data array with some logical type and some length.
+}
+\section{Usage}{
+\preformatted{a <- array(...)
+
+a$IsNull(i)
+a$IsValid(i)
+a$length() or length(a)
+a$offset
+a$null_count
+a$type
+a$type_id()
+a$Equals(b)
+a$ApproxEquals(b)
+a$as_vector()
+a$ToString()
+a$Slice(offset, length = NULL)
+a$RangeEquals(other, start_idx, end_idx, other_start_idx)
+
+print(a)
+a == a
+}
+}
+
+\section{Methods}{
+
+\itemize{
+\item \code{$IsNull(i)}: Return true if value at index is null. Does not boundscheck
+\item \code{$IsValid(i)}: Return true if value at index is valid. Does not boundscheck
+\item \code{$length()}: Size in the number of elements this array contains
+\item \code{$offset}: A relative position into another array's data, to enable zero-copy slicing
+\item \code{$null_count}: The number of null entries in the array
+\item \code{$type}: logical type of data
+\item \code{$type_id()}: type id
+\item \code{$Equals(other)} : is this array equal to \code{other}
+\item \code{$ApproxEquals(other)} : is this array approximately equal to \code{other}
+\item \code{$data()}: return the underlying \link[=arrow__ArrayData]{arrow::ArrayData}
+\item \code{$as_vector()}: convert to an R vector
+\item \code{$ToString()}: string representation of the array
+\item \code{$Slice(offset, length = NULL)} : Construct a zero-copy slice of the array with the indicated offset and length. If length is \code{NULL}, the slice goes until the end of the array.
+\item \code{$RangeEquals(other, start_idx, end_idx, other_start_idx)} : compare the values in the slice \code{[start_idx, end_idx)} with the values of \code{other}, starting at \code{other_start_idx}
+}
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__ArrayData.Rd b/r/man/arrow__ArrayData.Rd
new file mode 100644
index 0000000000000..bdf996605c532
--- /dev/null
+++ b/r/man/arrow__ArrayData.Rd
@@ -0,0 +1,28 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ArrayData.R
+\docType{class}
+\name{arrow__ArrayData}
+\alias{arrow__ArrayData}
+\alias{arrow::ArrayData}
+\title{class arrow::ArrayData}
+\description{
+class arrow::ArrayData
+}
+\section{Usage}{
+\preformatted{data <- array(...)$data()
+
+data$type()
+data$length()
+data$null_count()
+data$offset()
+data$buffers()
+}
+}
+
+\section{Methods}{
+
+
+...
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__Buffer.Rd b/r/man/arrow__Buffer.Rd
new file mode 100644
index 0000000000000..135da7a20e788
--- /dev/null
+++ b/r/man/arrow__Buffer.Rd
@@ -0,0 +1,21 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/buffer.R
+\docType{class}
+\name{arrow__Buffer}
+\alias{arrow__Buffer}
+\alias{arrow::Buffer}
+\title{class arrow::Buffer}
+\description{
+class arrow::Buffer
+}
+\section{Methods}{
+
+\itemize{
+\item \code{$is_mutable} : is this buffer mutable?
+\item \code{$ZeroPadding()} : zero the padding bytes at the end of the buffer
+\item \code{$size} : size of the buffer, in bytes
+\item \code{$capacity}: possible capacity of the buffer, in bytes
+}
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__ChunkedArray.Rd b/r/man/arrow__ChunkedArray.Rd
new file mode 100644
index 0000000000000..a87bf1c0dcc1d
--- /dev/null
+++ b/r/man/arrow__ChunkedArray.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ChunkedArray.R
+\docType{class}
+\name{arrow__ChunkedArray}
+\alias{arrow__ChunkedArray}
+\alias{arrow::ChunkedArray}
+\title{class arrow::ChunkedArray}
+\description{
+class arrow::ChunkedArray
+}
+\section{Methods}{
+
+
+TODO
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__Column.Rd b/r/man/arrow__Column.Rd
new file mode 100644
index 0000000000000..6a0ee6a40a5a9
--- /dev/null
+++ b/r/man/arrow__Column.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/Column.R
+\docType{class}
+\name{arrow__Column}
+\alias{arrow__Column}
+\alias{arrow::Column}
+\title{class arrow::Column}
+\description{
+class arrow::Column
+}
+\section{Methods}{
+
+
+TODO
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__DataType.Rd b/r/man/arrow__DataType.Rd
new file mode 100644
index 0000000000000..53bd6327d9175
--- /dev/null
+++ b/r/man/arrow__DataType.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/R6.R
+\docType{class}
+\name{arrow__DataType}
+\alias{arrow__DataType}
+\alias{arrow::DataType}
+\title{class arrow::DataType}
+\description{
+class arrow::DataType
+}
+\section{Methods}{
+
+
+TODO
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__DictionaryType.Rd b/r/man/arrow__DictionaryType.Rd
new file mode 100644
index 0000000000000..ba462ee011497
--- /dev/null
+++ b/r/man/arrow__DictionaryType.Rd
@@ -0,0 +1,17 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/dictionary.R
+\docType{class}
+\name{arrow__DictionaryType}
+\alias{arrow__DictionaryType}
+\alias{arrow::DictionaryType}
+\title{class arrow::DictionaryType}
+\description{
+class arrow::DictionaryType
+}
+\section{Methods}{
+
+
+TODO
+}
+
+\keyword{datasets}
diff --git a/r/man/arrow__Field.Rd b/r/man/arrow__Field.Rd
new file mode 100644
index 0000000000000..893a65aa08e43
--- /dev/null
+++ b/r/man/arrow__Field.Rd
@@ -0,0 +1,17 @@
+%
Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Field.R +\docType{class} +\name{arrow__Field} +\alias{arrow__Field} +\alias{arrow::Field} +\title{class arrow::Field} +\description{ +class arrow::Field +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__FixedWidthType.Rd b/r/man/arrow__FixedWidthType.Rd new file mode 100644 index 0000000000000..610a40034290f --- /dev/null +++ b/r/man/arrow__FixedWidthType.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/R6.R +\docType{class} +\name{arrow__FixedWidthType} +\alias{arrow__FixedWidthType} +\alias{arrow::FixedWidthType} +\title{class arrow::FixedWidthType} +\description{ +class arrow::FixedWidthType +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__RecordBatch.Rd b/r/man/arrow__RecordBatch.Rd new file mode 100644 index 0000000000000..40ba6323ee0a9 --- /dev/null +++ b/r/man/arrow__RecordBatch.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatch.R +\docType{class} +\name{arrow__RecordBatch} +\alias{arrow__RecordBatch} +\alias{arrow::RecordBatch} +\title{class arrow::RecordBatch} +\description{ +class arrow::RecordBatch +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__RecordBatchReader.Rd b/r/man/arrow__RecordBatchReader.Rd new file mode 100644 index 0000000000000..b3ccd3f174944 --- /dev/null +++ b/r/man/arrow__RecordBatchReader.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatchReader.R +\docType{class} +\name{arrow__RecordBatchReader} +\alias{arrow__RecordBatchReader} +\alias{arrow::RecordBatchReader} +\title{class arrow::RecordBatchReader} +\description{ +class arrow::RecordBatchReader +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__Schema.Rd b/r/man/arrow__Schema.Rd new file mode 100644 index 0000000000000..b657ff2c4a8cf --- /dev/null +++ b/r/man/arrow__Schema.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Schema.R +\docType{class} +\name{arrow__Schema} +\alias{arrow__Schema} +\alias{arrow::Schema} +\title{class arrow::Schema} +\description{ +class arrow::Schema +} +\section{Usage}{ +\preformatted{s <- schema(...) 
+ +s$ToString() +s$num_fields() +s$field(i) +} +} + +\section{Methods}{ + +\itemize{ +\item \code{$ToString()}: convert to a string +\item \code{$num_fields()}: returns the number of fields +\item \code{$field(i)}: returns the field at index \code{i} (0-based) +} +} + +\keyword{datasets} diff --git a/r/man/arrow__Table.Rd b/r/man/arrow__Table.Rd new file mode 100644 index 0000000000000..139db980acf4e --- /dev/null +++ b/r/man/arrow__Table.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Table.R +\docType{class} +\name{arrow__Table} +\alias{arrow__Table} +\alias{arrow::Table} +\title{class arrow::Table} +\description{ +class arrow::Table +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow___MemoryPool.Rd b/r/man/arrow___MemoryPool.Rd new file mode 100644 index 0000000000000..9189e8be4a33c --- /dev/null +++ b/r/man/arrow___MemoryPool.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/memory_pool.R +\docType{class} +\name{arrow__MemoryPool} +\alias{arrow__MemoryPool} +\alias{arrow::MemoryPool} +\title{class arrow::MemoryPool} +\description{ +class arrow::MemoryPool +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__BufferOutputStream.Rd b/r/man/arrow__io__BufferOutputStream.Rd new file mode 100644 index 0000000000000..e90d1cc0ed87c --- /dev/null +++ b/r/man/arrow__io__BufferOutputStream.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__BufferOutputStream} +\alias{arrow__io__BufferOutputStream} +\alias{arrow::io::BufferOutputStream} +\title{class arrow::io::BufferOutputStream} +\format{An object of class \code{R6ClassGenerator} of length 24.} +\description{ +class arrow::io::BufferOutputStream +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__BufferReader.Rd b/r/man/arrow__io__BufferReader.Rd new file mode 100644 index 0000000000000..609fec5b6d4c8 --- /dev/null +++ b/r/man/arrow__io__BufferReader.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__BufferReader} +\alias{arrow__io__BufferReader} +\alias{arrow::io::BufferReader} +\title{class arrow::io::BufferReader} +\description{ +class arrow::io::BufferReader +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__FileOutputStream.Rd b/r/man/arrow__io__FileOutputStream.Rd new file mode 100644 index 0000000000000..92eaac13c9fd0 --- /dev/null +++ b/r/man/arrow__io__FileOutputStream.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__FileOutputStream} +\alias{arrow__io__FileOutputStream} +\alias{arrow::io::FileOutputStream} +\title{class arrow::io::FileOutputStream} +\description{ +class arrow::io::FileOutputStream +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__FixedSizeBufferWriter.Rd b/r/man/arrow__io__FixedSizeBufferWriter.Rd new file mode 100644 index 0000000000000..39d8bb69c25ff --- /dev/null +++ b/r/man/arrow__io__FixedSizeBufferWriter.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__FixedSizeBufferWriter} +\alias{arrow__io__FixedSizeBufferWriter} +\alias{arrow::io::FixedSizeBufferWriter} +\title{class 
arrow::io::FixedSizeBufferWriter} +\description{ +class arrow::io::FixedSizeBufferWriter +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__InputStream.Rd b/r/man/arrow__io__InputStream.Rd new file mode 100644 index 0000000000000..37f83308b6424 --- /dev/null +++ b/r/man/arrow__io__InputStream.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__InputStream} +\alias{arrow__io__InputStream} +\alias{arrow::io::InputStream} +\title{class arrow::io::InputStream} +\description{ +class arrow::io::InputStream +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__MemoryMappedFile.Rd b/r/man/arrow__io__MemoryMappedFile.Rd new file mode 100644 index 0000000000000..409bb17302abd --- /dev/null +++ b/r/man/arrow__io__MemoryMappedFile.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__MemoryMappedFile} +\alias{arrow__io__MemoryMappedFile} +\alias{arrow::io::MemoryMappedFile} +\title{class arrow::io::MemoryMappedFile} +\description{ +class arrow::io::MemoryMappedFile +} +\section{Methods}{ + + +TODO +} + +\seealso{ +\code{\link[=mmap_open]{mmap_open()}}, \code{\link[=mmap_create]{mmap_create()}} +} +\keyword{datasets} diff --git a/r/man/arrow__io__MockOutputStream.Rd b/r/man/arrow__io__MockOutputStream.Rd new file mode 100644 index 0000000000000..f0b2c06d7a55c --- /dev/null +++ b/r/man/arrow__io__MockOutputStream.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__MockOutputStream} +\alias{arrow__io__MockOutputStream} +\alias{arrow::io::MockOutputStream} +\title{class arrow::io::MockOutputStream} +\description{ +class arrow::io::MockOutputStream +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__OutputStream.Rd b/r/man/arrow__io__OutputStream.Rd new file mode 100644 index 0000000000000..c41b815c0217b --- /dev/null +++ b/r/man/arrow__io__OutputStream.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__OutputStream} +\alias{arrow__io__OutputStream} +\alias{arrow::io::OutputStream} +\title{class arrow::io::OutputStream} +\description{ +class arrow::io::OutputStream +} +\section{Methods}{ + +\itemize{ +\item \code{$tell()}: return the current position in the stream +\item \code{$close()}: close the stream +} +} + +\keyword{datasets} diff --git a/r/man/arrow__io__RandomAccessFile.Rd b/r/man/arrow__io__RandomAccessFile.Rd new file mode 100644 index 0000000000000..f8cb86abda6d7 --- /dev/null +++ b/r/man/arrow__io__RandomAccessFile.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__RandomAccessFile} +\alias{arrow__io__RandomAccessFile} +\alias{arrow::io::RandomAccessFile} +\title{class arrow::io::RandomAccessFile} +\description{ +class arrow::io::RandomAccessFile +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__Readable.Rd b/r/man/arrow__io__Readable.Rd new file mode 100644 index 0000000000000..b0b30a42302bd --- /dev/null +++ b/r/man/arrow__io__Readable.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__Readable} +\alias{arrow__io__Readable}
+\alias{arrow::io::Readable} +\title{class arrow::io::Readable} +\description{ +class arrow::io::Readable +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__io__ReadableFile.Rd b/r/man/arrow__io__ReadableFile.Rd new file mode 100644 index 0000000000000..440149fbbb4c4 --- /dev/null +++ b/r/man/arrow__io__ReadableFile.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\docType{class} +\name{arrow__io__ReadableFile} +\alias{arrow__io__ReadableFile} +\alias{arrow::io::ReadableFile} +\title{class arrow::io::ReadableFile} +\description{ +class arrow::io::ReadableFile +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__Message.Rd b/r/man/arrow__ipc__Message.Rd new file mode 100644 index 0000000000000..d3811f8f4c10f --- /dev/null +++ b/r/man/arrow__ipc__Message.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/message.R +\docType{class} +\name{arrow__ipc__Message} +\alias{arrow__ipc__Message} +\alias{arrow::ipc::Message} +\title{class arrow::ipc::Message} +\description{ +class arrow::ipc::Message +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__MessageReader.Rd b/r/man/arrow__ipc__MessageReader.Rd new file mode 100644 index 0000000000000..883e9e0618b66 --- /dev/null +++ b/r/man/arrow__ipc__MessageReader.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/message.R +\docType{class} +\name{arrow__ipc__MessageReader} +\alias{arrow__ipc__MessageReader} +\alias{arrow::ipc::MessageReader} +\title{class arrow::ipc::MessageReader} +\description{ +class arrow::ipc::MessageReader +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__RecordBatchFileReader.Rd b/r/man/arrow__ipc__RecordBatchFileReader.Rd new file mode 100644 index 0000000000000..675f636b365bf --- /dev/null +++ b/r/man/arrow__ipc__RecordBatchFileReader.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatchReader.R +\docType{class} +\name{arrow__ipc__RecordBatchFileReader} +\alias{arrow__ipc__RecordBatchFileReader} +\alias{arrow::ipc::RecordBatchFileReader} +\title{class arrow::ipc::RecordBatchFileReader} +\description{ +class arrow::ipc::RecordBatchFileReader +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__RecordBatchFileWriter.Rd b/r/man/arrow__ipc__RecordBatchFileWriter.Rd new file mode 100644 index 0000000000000..a80b55941fb9e --- /dev/null +++ b/r/man/arrow__ipc__RecordBatchFileWriter.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatchWriter.R +\docType{class} +\name{arrow__ipc__RecordBatchFileWriter} +\alias{arrow__ipc__RecordBatchFileWriter} +\alias{arrow::ipc::RecordBatchFileWriter} +\title{class arrow::ipc::RecordBatchFileWriter + +Writer for the Arrow binary file format} +\description{ +class arrow::ipc::RecordBatchFileWriter + +Writer for the Arrow binary file format +} +\section{Usage}{ +\preformatted{writer <- RecordBatchFileWriter(sink, schema) + +writer$write_batch(batch) +writer$write_table(table) +writer$close() +} +} + +\section{Factory}{ + + +The \code{\link[=RecordBatchFileWriter]{RecordBatchFileWriter()}} function creates a record batch file writer.
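+
+A minimal sketch, assuming \code{sink} is an open
+\link[=arrow__io__OutputStream]{arrow::io::OutputStream} created elsewhere:
+
+\preformatted{writer <- RecordBatchFileWriter(sink, schema(x = int32()))
+writer$write_batch(record_batch(data.frame(x = 1:10)))
+writer$close()
+}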
+} + +\section{Methods}{ + +inherited from \link[=arrow__ipc__RecordBatchWriter]{arrow::ipc::RecordBatchWriter} +\itemize{ +\item \code{$write_batch(batch)}: Write a record batch to the stream +\item \code{$write_table(table)}: Write a table to the stream +\item \code{$close()}: Close the stream +} +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__RecordBatchStreamReader.Rd b/r/man/arrow__ipc__RecordBatchStreamReader.Rd new file mode 100644 index 0000000000000..49f57cce057d9 --- /dev/null +++ b/r/man/arrow__ipc__RecordBatchStreamReader.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatchReader.R +\docType{class} +\name{arrow__ipc__RecordBatchStreamReader} +\alias{arrow__ipc__RecordBatchStreamReader} +\alias{arrow::ipc::RecordBatchStreamReader} +\title{class arrow::ipc::RecordBatchStreamReader} +\description{ +class arrow::ipc::RecordBatchStreamReader +} +\section{Methods}{ + + +TODO +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__RecordBatchStreamWriter.Rd b/r/man/arrow__ipc__RecordBatchStreamWriter.Rd new file mode 100644 index 0000000000000..3d2030287d1b5 --- /dev/null +++ b/r/man/arrow__ipc__RecordBatchStreamWriter.Rd @@ -0,0 +1,40 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatchWriter.R +\docType{class} +\name{arrow__ipc__RecordBatchStreamWriter} +\alias{arrow__ipc__RecordBatchStreamWriter} +\alias{arrow::ipc::RecordBatchStreamWriter} +\title{class arrow::ipc::RecordBatchStreamWriter + +Writer for the Arrow streaming binary format} +\description{ +class arrow::ipc::RecordBatchStreamWriter + +Writer for the Arrow streaming binary format +} +\section{Usage}{ +\preformatted{writer <- RecordBatchStreamWriter(sink, schema) + +writer$write_batch(batch) +writer$write_table(table) +writer$close() +} +} + +\section{Factory}{ + + +The \code{\link[=RecordBatchStreamWriter]{RecordBatchStreamWriter()}} function creates a record batch stream writer.
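+
+A minimal sketch for the streaming format, again assuming \code{sink} is an
+open \link[=arrow__io__OutputStream]{arrow::io::OutputStream} created elsewhere:
+
+\preformatted{writer <- RecordBatchStreamWriter(sink, schema(x = float64()))
+writer$write_batch(record_batch(data.frame(x = c(1, 2, 3))))
+writer$close()
+}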
+} + +\section{Methods}{ + +inherited from \link[=arrow__ipc__RecordBatchWriter]{arrow::ipc::RecordBatchWriter} +\itemize{ +\item \code{$write_batch(batch)}: Write a record batch to the stream +\item \code{$write_table(table)}: Write a table to the stream +\item \code{$close()}: Close the stream +} +} + +\keyword{datasets} diff --git a/r/man/arrow__ipc__RecordBatchWriter.Rd b/r/man/arrow__ipc__RecordBatchWriter.Rd new file mode 100644 index 0000000000000..08593df852436 --- /dev/null +++ b/r/man/arrow__ipc__RecordBatchWriter.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RecordBatchWriter.R +\docType{class} +\name{arrow__ipc__RecordBatchWriter} +\alias{arrow__ipc__RecordBatchWriter} +\alias{arrow::ipc::RecordBatchWriter} +\title{class arrow::ipc::RecordBatchWriter} +\description{ +class arrow::ipc::RecordBatchWriter +} +\section{Methods}{ + +\itemize{ +\item \code{$write_batch(batch)}: Write a record batch to the stream +\item \code{$write_table(table)}: Write a table to the stream +\item \code{$close()}: Close the stream +} +} + +\section{Derived classes}{ + +\itemize{ +\item \link[=arrow__ipc__RecordBatchStreamWriter]{arrow::ipc::RecordBatchStreamWriter} implements the streaming binary format +\item \link[=arrow__ipc__RecordBatchFileWriter]{arrow::ipc::RecordBatchFileWriter} implements the binary file format +} +} + +\keyword{datasets} diff --git a/r/man/buffer.Rd b/r/man/buffer.Rd index 4d4e97e47d8c2..60fd25d4bf159 100644 --- a/r/man/buffer.Rd +++ b/r/man/buffer.Rd @@ -2,16 +2,16 @@ % Please edit documentation in R/buffer.R \name{buffer} \alias{buffer} -\title{Create a buffer from an R object} +\title{Create a \link[=arrow__Buffer]{arrow::Buffer} from an R object} \usage{ buffer(x) } \arguments{ -\item{x}{R object} +\item{x}{R object.
Only raw, numeric and integer vectors are currently supported} } \value{ -an instance of \code{arrow::Buffer} that borrows memory from \code{x} +an instance of \link[=arrow__Buffer]{arrow::Buffer} that borrows memory from \code{x} } \description{ -Create a buffer from an R object +Create a \link[=arrow__Buffer]{arrow::Buffer} from an R object } diff --git a/r/man/chunked_array.Rd b/r/man/chunked_array.Rd index 1f4fb836143db..c6973be721014 100644 --- a/r/man/chunked_array.Rd +++ b/r/man/chunked_array.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/ChunkedArray.R \name{chunked_array} \alias{chunked_array} -\title{create an arrow::Array from an R vector} +\title{create an \link[=arrow__ChunkedArray]{arrow::ChunkedArray} from various R vectors} \usage{ chunked_array(..., type) } @@ -12,5 +12,5 @@ chunked_array(..., type) \item{type}{currently ignored} } \description{ -create an arrow::Array from an R vector +create an \link[=arrow__ChunkedArray]{arrow::ChunkedArray} from various R vectors } diff --git a/r/man/default_memory_pool.Rd b/r/man/default_memory_pool.Rd new file mode 100644 index 0000000000000..1725ff0e10a37 --- /dev/null +++ b/r/man/default_memory_pool.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/memory_pool.R +\name{default_memory_pool} +\alias{default_memory_pool} +\title{default \link[=arrow__MemoryPool]{arrow::MemoryPool}} +\usage{ +default_memory_pool() +} +\value{ +the default \link[=arrow__MemoryPool]{arrow::MemoryPool} +} +\description{ +default \link[=arrow__MemoryPool]{arrow::MemoryPool} +} diff --git a/r/man/dictionary.Rd b/r/man/dictionary.Rd index 2a7989648b01b..340283ec4dafc 100644 --- a/r/man/dictionary.Rd +++ b/r/man/dictionary.Rd @@ -13,6 +13,9 @@ dictionary(type, values, ordered = FALSE) \item{ordered}{Is this an ordered dictionary} } +\value{ +a \link[=arrow__DictionaryType]{arrow::DictionaryType} +} \description{ dictionary type factory } diff --git a/r/man/field.Rd b/r/man/field.Rd index e7af66db2905f..5cbd803387560 100644 --- a/r/man/field.Rd +++ b/r/man/field.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/Field.R \name{field} \alias{field} -\title{Factor for a \code{arrow::Field}} +\title{Factory for a \code{arrow::Field}} \usage{ field(name, type, metadata) } @@ -14,7 +14,7 @@ field(name, type, metadata) \item{metadata}{currently ignored} } \description{ -Factor for a \code{arrow::Field} +Factory for a \code{arrow::Field} } \examples{ field("x", int32()) diff --git a/r/man/io.Rd b/r/man/io.Rd deleted file mode 100644 index 74817bf88a394..0000000000000 --- a/r/man/io.Rd +++ /dev/null @@ -1,40 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/io.R -\name{mmap_create} -\alias{mmap_create} -\alias{mmap_open} -\alias{file_open} -\alias{file_output_stream} -\alias{mock_output_stream} -\alias{buffer_output_stream} -\alias{fixed_size_buffer_writer} -\title{Create a new read/write memory mapped file of a given size} -\usage{ -mmap_create(path, size) - -mmap_open(path, mode = c("read", "write", "readwrite")) - -file_open(path) - -file_output_stream(path) - -mock_output_stream() - -buffer_output_stream(initial_capacity = 0L) - -fixed_size_buffer_writer(buffer) -} -\arguments{ -\item{path}{file path} - -\item{size}{size in bytes} - -\item{mode}{file mode (read/write/readwrite)} - -\item{initial_capacity}{initial capacity for the buffer output stream} - -\item{buffer}{an \code{arrow::Buffer}, typically created by \code{\link[=buffer]{buffer()}}} -} -\description{ -Create a 
new read/write memory mapped file of a given size -} diff --git a/r/man/mmap_create.Rd b/r/man/mmap_create.Rd new file mode 100644 index 0000000000000..050ae18c76f3b --- /dev/null +++ b/r/man/mmap_create.Rd @@ -0,0 +1,19 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{mmap_create} +\alias{mmap_create} +\title{Create a new read/write memory mapped file of a given size} +\usage{ +mmap_create(path, size) +} +\arguments{ +\item{path}{file path} + +\item{size}{size in bytes} +} +\value{ +a \link[=arrow__io__MemoryMappedFile]{arrow::io::MemoryMappedFile} +} +\description{ +Create a new read/write memory mapped file of a given size +} diff --git a/r/man/mmap_open.Rd b/r/man/mmap_open.Rd new file mode 100644 index 0000000000000..d0047a72c38ce --- /dev/null +++ b/r/man/mmap_open.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{mmap_open} +\alias{mmap_open} +\title{Open a memory mapped file} +\usage{ +mmap_open(path, mode = c("read", "write", "readwrite")) +} +\arguments{ +\item{path}{file path} + +\item{mode}{file mode (read/write/readwrite)} +} +\description{ +Open a memory mapped file +} diff --git a/r/man/read_arrow.Rd b/r/man/read_arrow.Rd deleted file mode 100644 index 362ee7adc1a8e..0000000000000 --- a/r/man/read_arrow.Rd +++ /dev/null @@ -1,17 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/Table.R -\name{read_arrow} -\alias{read_arrow} -\title{Read an tibble from an arrow::Table on disk} -\usage{ -read_arrow(stream) -} -\arguments{ -\item{stream}{input stream} -} -\value{ -a \link[tibble:tibble]{tibble::tibble} -} -\description{ -Read an tibble from an arrow::Table on disk -} diff --git a/r/man/read_record_batch.Rd b/r/man/read_record_batch.Rd index 4ca048f28ec71..fef12cbac4a88 100644 --- a/r/man/read_record_batch.Rd +++ b/r/man/read_record_batch.Rd @@ -1,19 +1,19 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchReader.R +% Please edit documentation in R/read_record_batch.R \name{read_record_batch} \alias{read_record_batch} -\title{Read a single record batch from a stream} +\title{Read an \link[=arrow__RecordBatch]{arrow::RecordBatch} from an encapsulated IPC message, given a known \link[=arrow__Schema]{arrow::Schema}} \usage{ -read_record_batch(stream, ...) +read_record_batch(obj, schema) } \arguments{ -\item{stream}{input stream} +\item{obj}{a \link[=arrow__ipc__Message]{arrow::ipc::Message}, a \link[=arrow__io__InputStream]{arrow::io::InputStream}, a \link[=arrow__Buffer]{arrow::Buffer}, or a raw vector} -\item{...}{additional parameters} +\item{schema}{a \link[=arrow__Schema]{arrow::Schema}} } -\description{ -Read a single record batch from a stream +\value{ +a \link[=arrow__RecordBatch]{arrow::RecordBatch} } -\details{ -\code{stream} can be a \code{arrow::io::RandomAccessFile} stream as created by \code{\link[=file_open]{file_open()}} or \code{\link[=mmap_open]{mmap_open()}} or a path.
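+\section{Example}{
+
+A minimal sketch, assuming \code{bytes} is a raw vector holding one
+encapsulated IPC record batch message for the given schema (produced
+elsewhere, e.g. by serializing a single record batch):
+
+\preformatted{batch <- read_record_batch(bytes, schema(x = int32()))
+batch$num_rows()
+}
+}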
+\description{ +Read an \link[=arrow__RecordBatch]{arrow::RecordBatch} from an encapsulated IPC message, given a known \link[=arrow__Schema]{arrow::Schema} } diff --git a/r/man/read_table.Rd b/r/man/read_table.Rd index f851057e8a7d0..3231b26da267b 100644 --- a/r/man/read_table.Rd +++ b/r/man/read_table.Rd @@ -1,14 +1,40 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchReader.R +% Please edit documentation in R/read_table.R \name{read_table} \alias{read_table} -\title{Read an arrow::Table from a stream} +\alias{read_arrow} +\title{Read an \link[=arrow__Table]{arrow::Table} from a stream} \usage{ read_table(stream) + +read_arrow(stream) } \arguments{ -\item{stream}{stream. Either a stream created by \code{\link[=file_open]{file_open()}} or \code{\link[=mmap_open]{mmap_open()}} or a file path.} +\item{stream}{stream. +\itemize{ +\item a \link[=arrow__ipc__RecordBatchFileReader]{arrow::ipc::RecordBatchFileReader}: +read an \link[=arrow__Table]{arrow::Table} +from all the record batches in the reader +\item a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader}: +read an \link[=arrow__Table]{arrow::Table} from the remaining record batches +in the reader +\item a string or \link[fs:path_abs]{file path}: interprets the file as the Arrow +binary file format and uses a \link[=arrow__ipc__RecordBatchFileReader]{arrow::ipc::RecordBatchFileReader} +to process it. +\item a raw vector: read using a \link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader} +}} } \value{ \itemize{ \item \code{read_table} returns an \link[=arrow__Table]{arrow::Table} \item \code{read_arrow} returns a \code{\link[tibble:tibble]{tibble::tibble()}} } } \description{ -Read an arrow::Table from a stream +Read an \link[=arrow__Table]{arrow::Table} from a stream +} +\details{ +The methods using \link[=arrow__ipc__RecordBatchFileReader]{arrow::ipc::RecordBatchFileReader} and +\link[=arrow__ipc__RecordBatchStreamReader]{arrow::ipc::RecordBatchStreamReader} offer the most +flexibility. The other methods are for convenience.
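+
+A minimal sketch of the convenience forms ("batches.arrow" is a
+hypothetical path to a file written in the binary file format):
+
+\preformatted{tab <- read_table("batches.arrow")
+df <- read_arrow("batches.arrow")
+}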
} diff --git a/r/man/record_batch.Rd b/r/man/record_batch.Rd index e108d64b46a47..4567a9ab763f9 100644 --- a/r/man/record_batch.Rd +++ b/r/man/record_batch.Rd @@ -2,13 +2,16 @@ % Please edit documentation in R/RecordBatch.R \name{record_batch} \alias{record_batch} -\title{Create an arrow::RecordBatch from a data frame} +\title{Create an \link[=arrow__RecordBatch]{arrow::RecordBatch} from a data frame} \usage{ record_batch(.data) } \arguments{ \item{.data}{a data frame} } +\value{ +a \link[=arrow__RecordBatch]{arrow::RecordBatch} +} \description{ -Create an arrow::RecordBatch from a data frame +Create an \link[=arrow__RecordBatch]{arrow::RecordBatch} from a data frame } diff --git a/r/man/record_batch_file_reader.Rd b/r/man/record_batch_file_reader.Rd deleted file mode 100644 index b7e211dfbc23e..0000000000000 --- a/r/man/record_batch_file_reader.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchReader.R -\name{record_batch_file_reader} -\alias{record_batch_file_reader} -\title{Create an \code{arrow::ipc::RecordBatchFileReader} from a file} -\usage{ -record_batch_file_reader(file) -} -\arguments{ -\item{file}{The file to read from} -} -\description{ -Create an \code{arrow::ipc::RecordBatchFileReader} from a file -} diff --git a/r/man/record_batch_file_writer.Rd b/r/man/record_batch_file_writer.Rd deleted file mode 100644 index b7dcb0c39e47b..0000000000000 --- a/r/man/record_batch_file_writer.Rd +++ /dev/null @@ -1,19 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchWriter.R -\name{record_batch_file_writer} -\alias{record_batch_file_writer} -\title{Create a record batch file writer from a stream} -\usage{ -record_batch_file_writer(stream, schema) -} -\arguments{ -\item{stream}{a stream} - -\item{schema}{the schema of the batches} -} -\value{ -an \code{arrow::ipc::RecordBatchWriter} object -} -\description{ -Create a record batch file writer from a stream -} diff --git a/r/man/record_batch_stream_reader.Rd b/r/man/record_batch_stream_reader.Rd deleted file mode 100644 index 018045f6a3272..0000000000000 --- a/r/man/record_batch_stream_reader.Rd +++ /dev/null @@ -1,14 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchReader.R -\name{record_batch_stream_reader} -\alias{record_batch_stream_reader} -\title{Create a \code{arrow::ipc::RecordBatchStreamReader} from an input stream} -\usage{ -record_batch_stream_reader(stream) -} -\arguments{ -\item{stream}{input stream} -} -\description{ -Create a \code{arrow::ipc::RecordBatchStreamReader} from an input stream -} diff --git a/r/man/record_batch_stream_writer.Rd b/r/man/record_batch_stream_writer.Rd deleted file mode 100644 index d720d50d3a749..0000000000000 --- a/r/man/record_batch_stream_writer.Rd +++ /dev/null @@ -1,16 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchWriter.R -\name{record_batch_stream_writer} -\alias{record_batch_stream_writer} -\title{Create a record batch stream writer} -\usage{ -record_batch_stream_writer(stream, schema) -} -\arguments{ -\item{stream}{a stream} - -\item{schema}{a schema} -} -\description{ -Create a record batch stream writer -} diff --git a/r/man/schema.Rd b/r/man/schema.Rd index 9b77d47b61352..ad3bcb1f4e0eb 100644 --- a/r/man/schema.Rd +++ b/r/man/schema.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/Schema.R \name{schema} \alias{schema} -\title{Schema functions} +\title{Schema 
factory} \usage{ schema(...) } @@ -10,8 +10,8 @@ schema(...) \item{...}{named list of data types} } \value{ -a Schema +a \link[=arrow__Schema]{schema} } \description{ -Schema functions +Schema factory } diff --git a/r/man/write_arrow.Rd b/r/man/write_arrow.Rd index 42b39f1d051fb..4296bcbd899da 100644 --- a/r/man/write_arrow.Rd +++ b/r/man/write_arrow.Rd @@ -1,18 +1,34 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchWriter.R +% Please edit documentation in R/write_arrow.R \name{write_arrow} \alias{write_arrow} -\title{Write an object to a stream} +\title{Serialize an \link[=arrow__Table]{arrow::Table}, an \link[=arrow__RecordBatch]{arrow::RecordBatch}, or a +data frame to either the streaming format or the binary file format} \usage{ write_arrow(x, stream, ...) } \arguments{ -\item{x}{An object to stream} +\item{x}{an \link[=arrow__Table]{arrow::Table}, an \link[=arrow__RecordBatch]{arrow::RecordBatch} or a data.frame} -\item{stream}{A stream} +\item{stream}{where to serialize to +\itemize{ +\item A \link[=arrow__ipc__RecordBatchWriter]{arrow::ipc::RecordBatchWriter}: the \code{$write()} +method of the writer is used on \code{x}. The stream is left open. This uses the streaming format +or the binary file format depending on the type of the writer. +\item A string or \link[fs:path_abs]{file path}: \code{x} is serialized with +a \link[=arrow__ipc__RecordBatchFileWriter]{arrow::ipc::RecordBatchFileWriter}, i.e. +using the binary file format. +\item A raw vector: typically of length zero (its data is ignored and is only used for +dispatch). \code{x} is serialized using the streaming format, i.e. using the +\link[=arrow__ipc__RecordBatchStreamWriter]{arrow::ipc::RecordBatchStreamWriter} +}} -\item{...}{additional parameters} +\item{...}{extra parameters, currently ignored + +\code{write_arrow} is a convenience function; the classes \link[=arrow__ipc__RecordBatchFileWriter]{arrow::ipc::RecordBatchFileWriter} +and \link[=arrow__ipc__RecordBatchStreamWriter]{arrow::ipc::RecordBatchStreamWriter} can be used for more flexibility.} } \description{ -Write an object to a stream +Serialize an \link[=arrow__Table]{arrow::Table}, an \link[=arrow__RecordBatch]{arrow::RecordBatch}, or a +data frame to either the streaming format or the binary file format } diff --git a/r/man/write_record_batch.Rd b/r/man/write_record_batch.Rd deleted file mode 100644 index afc3363f0df14..0000000000000 --- a/r/man/write_record_batch.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchWriter.R -\name{write_record_batch} -\alias{write_record_batch} -\title{write a record batch} -\usage{ -write_record_batch(x, stream, ...) -} -\arguments{ -\item{x}{a \code{arrow::RecordBatch}} - -\item{stream}{where to stream the record batch} - -\item{...}{extra parameters} -} -\description{ -write a record batch -} diff --git a/r/man/write_table.Rd b/r/man/write_table.Rd deleted file mode 100644 index a247870ec0190..0000000000000 --- a/r/man/write_table.Rd +++ /dev/null @@ -1,18 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/RecordBatchWriter.R -\name{write_table} -\alias{write_table} -\title{write an arrow::Table} -\usage{ -write_table(x, stream, ...)
-} -\arguments{ -\item{x}{an \code{arrow::Table}} - -\item{stream}{where to stream the record batch} - -\item{...}{extra parameters} -} -\description{ -write an arrow::Table -} diff --git a/r/src/Makevars.in b/r/src/Makevars.in index 5e285518f24af..a0d5fed10bab8 100644 --- a/r/src/Makevars.in +++ b/r/src/Makevars.in @@ -16,7 +16,7 @@ # under the License. PKG_CPPFLAGS=@cflags@ -PKG_CXXFLAGS+=$(C_VISIBILITY) +PKG_CXXFLAGS=@visibility@ CXX_STD=CXX11 PKG_LIBS=@libs@ -Wl,-rpath,/usr/local/lib #CXXFLAGS="-D_GLIBCXX_USE_CXX11_ABI=0" diff --git a/r/src/RcppExports.cpp b/r/src/RcppExports.cpp index 2c549ad1b90ed..bca4eafdee4ce 100644 --- a/r/src/RcppExports.cpp +++ b/r/src/RcppExports.cpp @@ -1753,6 +1753,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// RecordBatch__columns +arrow::ArrayVector RecordBatch__columns(const std::shared_ptr& batch); +RcppExport SEXP _arrow_RecordBatch__columns(SEXP batchSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type batch(batchSEXP); + rcpp_result_gen = Rcpp::wrap(RecordBatch__columns(batch)); + return rcpp_result_gen; +END_RCPP +} // RecordBatch__column std::shared_ptr RecordBatch__column(const std::shared_ptr& batch, int i); RcppExport SEXP _arrow_RecordBatch__column(SEXP batchSEXP, SEXP iSEXP) { @@ -1859,6 +1870,29 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// ipc___SerializeRecordBatch__Raw +RawVector ipc___SerializeRecordBatch__Raw(const std::shared_ptr& batch); +RcppExport SEXP _arrow_ipc___SerializeRecordBatch__Raw(SEXP batchSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type batch(batchSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___SerializeRecordBatch__Raw(batch)); + return rcpp_result_gen; +END_RCPP +} +// ipc___ReadRecordBatch__InputStream__Schema +std::shared_ptr ipc___ReadRecordBatch__InputStream__Schema(const std::shared_ptr& stream, const std::shared_ptr& schema); +RcppExport SEXP _arrow_ipc___ReadRecordBatch__InputStream__Schema(SEXP streamSEXP, SEXP schemaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type stream(streamSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type schema(schemaSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___ReadRecordBatch__InputStream__Schema(stream, schema)); + return rcpp_result_gen; +END_RCPP +} // RecordBatchReader__schema std::shared_ptr RecordBatchReader__schema(const std::shared_ptr& reader); RcppExport SEXP _arrow_RecordBatchReader__schema(SEXP readerSEXP) { @@ -1892,6 +1926,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// ipc___RecordBatchStreamReader__batches +std::vector> ipc___RecordBatchStreamReader__batches(const std::shared_ptr& reader); +RcppExport SEXP _arrow_ipc___RecordBatchStreamReader__batches(SEXP readerSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type reader(readerSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___RecordBatchStreamReader__batches(reader)); + return rcpp_result_gen; +END_RCPP +} // ipc___RecordBatchFileReader__schema std::shared_ptr ipc___RecordBatchFileReader__schema(const std::shared_ptr& reader); RcppExport SEXP _arrow_ipc___RecordBatchFileReader__schema(SEXP readerSEXP) { @@ -1959,39 +2004,25 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } -// 
ipc___RecordBatchFileWriter__Open -std::shared_ptr ipc___RecordBatchFileWriter__Open(const std::shared_ptr& stream, const std::shared_ptr& schema); -RcppExport SEXP _arrow_ipc___RecordBatchFileWriter__Open(SEXP streamSEXP, SEXP schemaSEXP) { +// ipc___RecordBatchFileReader__batches +std::vector> ipc___RecordBatchFileReader__batches(const std::shared_ptr& reader); +RcppExport SEXP _arrow_ipc___RecordBatchFileReader__batches(SEXP readerSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type stream(streamSEXP); - Rcpp::traits::input_parameter< const std::shared_ptr& >::type schema(schemaSEXP); - rcpp_result_gen = Rcpp::wrap(ipc___RecordBatchFileWriter__Open(stream, schema)); - return rcpp_result_gen; -END_RCPP -} -// ipc___RecordBatchStreamWriter__Open -std::shared_ptr ipc___RecordBatchStreamWriter__Open(const std::shared_ptr& stream, const std::shared_ptr& schema); -RcppExport SEXP _arrow_ipc___RecordBatchStreamWriter__Open(SEXP streamSEXP, SEXP schemaSEXP) { -BEGIN_RCPP - Rcpp::RObject rcpp_result_gen; - Rcpp::RNGScope rcpp_rngScope_gen; - Rcpp::traits::input_parameter< const std::shared_ptr& >::type stream(streamSEXP); - Rcpp::traits::input_parameter< const std::shared_ptr& >::type schema(schemaSEXP); - rcpp_result_gen = Rcpp::wrap(ipc___RecordBatchStreamWriter__Open(stream, schema)); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type reader(readerSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___RecordBatchFileReader__batches(reader)); return rcpp_result_gen; END_RCPP } // ipc___RecordBatchWriter__WriteRecordBatch -void ipc___RecordBatchWriter__WriteRecordBatch(const std::shared_ptr& batch_writer, const std::shared_ptr& batch, bool allow_64bit); -RcppExport SEXP _arrow_ipc___RecordBatchWriter__WriteRecordBatch(SEXP batch_writerSEXP, SEXP batchSEXP, SEXP allow_64bitSEXP) { +void ipc___RecordBatchWriter__WriteRecordBatch(const std::shared_ptr& batch_writer, const std::shared_ptr& batch); +RcppExport SEXP _arrow_ipc___RecordBatchWriter__WriteRecordBatch(SEXP batch_writerSEXP, SEXP batchSEXP) { BEGIN_RCPP Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const std::shared_ptr& >::type batch_writer(batch_writerSEXP); Rcpp::traits::input_parameter< const std::shared_ptr& >::type batch(batchSEXP); - Rcpp::traits::input_parameter< bool >::type allow_64bit(allow_64bitSEXP); - ipc___RecordBatchWriter__WriteRecordBatch(batch_writer, batch, allow_64bit); + ipc___RecordBatchWriter__WriteRecordBatch(batch_writer, batch); return R_NilValue; END_RCPP } @@ -2016,6 +2047,30 @@ BEGIN_RCPP return R_NilValue; END_RCPP } +// ipc___RecordBatchFileWriter__Open +std::shared_ptr ipc___RecordBatchFileWriter__Open(const std::shared_ptr& stream, const std::shared_ptr& schema); +RcppExport SEXP _arrow_ipc___RecordBatchFileWriter__Open(SEXP streamSEXP, SEXP schemaSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type stream(streamSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type schema(schemaSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___RecordBatchFileWriter__Open(stream, schema)); + return rcpp_result_gen; +END_RCPP +} +// ipc___RecordBatchStreamWriter__Open +std::shared_ptr ipc___RecordBatchStreamWriter__Open(const std::shared_ptr& stream, const std::shared_ptr& schema); +RcppExport SEXP _arrow_ipc___RecordBatchStreamWriter__Open(SEXP streamSEXP, SEXP schemaSEXP) { +BEGIN_RCPP + 
Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type stream(streamSEXP); + Rcpp::traits::input_parameter< const std::shared_ptr& >::type schema(schemaSEXP); + rcpp_result_gen = Rcpp::wrap(ipc___RecordBatchStreamWriter__Open(stream, schema)); + return rcpp_result_gen; +END_RCPP +} // Table__from_dataframe std::shared_ptr Table__from_dataframe(DataFrame tbl); RcppExport SEXP _arrow_Table__from_dataframe(SEXP tblSEXP) { @@ -2083,6 +2138,17 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// Table__columns +std::vector> Table__columns(const std::shared_ptr& table); +RcppExport SEXP _arrow_Table__columns(SEXP tableSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const std::shared_ptr& >::type table(tableSEXP); + rcpp_result_gen = Rcpp::wrap(Table__columns(table)); + return rcpp_result_gen; +END_RCPP +} static const R_CallMethodDef CallEntries[] = { {"_arrow_Array__from_vector", (DL_FUNC) &_arrow_Array__from_vector, 1}, @@ -2242,6 +2308,7 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_RecordBatch__num_columns", (DL_FUNC) &_arrow_RecordBatch__num_columns, 1}, {"_arrow_RecordBatch__num_rows", (DL_FUNC) &_arrow_RecordBatch__num_rows, 1}, {"_arrow_RecordBatch__schema", (DL_FUNC) &_arrow_RecordBatch__schema, 1}, + {"_arrow_RecordBatch__columns", (DL_FUNC) &_arrow_RecordBatch__columns, 1}, {"_arrow_RecordBatch__column", (DL_FUNC) &_arrow_RecordBatch__column, 2}, {"_arrow_RecordBatch__to_dataframe", (DL_FUNC) &_arrow_RecordBatch__to_dataframe, 1}, {"_arrow_RecordBatch__from_dataframe", (DL_FUNC) &_arrow_RecordBatch__from_dataframe, 1}, @@ -2251,26 +2318,31 @@ static const R_CallMethodDef CallEntries[] = { {"_arrow_RecordBatch__names", (DL_FUNC) &_arrow_RecordBatch__names, 1}, {"_arrow_RecordBatch__Slice1", (DL_FUNC) &_arrow_RecordBatch__Slice1, 2}, {"_arrow_RecordBatch__Slice2", (DL_FUNC) &_arrow_RecordBatch__Slice2, 3}, + {"_arrow_ipc___SerializeRecordBatch__Raw", (DL_FUNC) &_arrow_ipc___SerializeRecordBatch__Raw, 1}, + {"_arrow_ipc___ReadRecordBatch__InputStream__Schema", (DL_FUNC) &_arrow_ipc___ReadRecordBatch__InputStream__Schema, 2}, {"_arrow_RecordBatchReader__schema", (DL_FUNC) &_arrow_RecordBatchReader__schema, 1}, {"_arrow_RecordBatchReader__ReadNext", (DL_FUNC) &_arrow_RecordBatchReader__ReadNext, 1}, {"_arrow_ipc___RecordBatchStreamReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__Open, 1}, + {"_arrow_ipc___RecordBatchStreamReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchStreamReader__batches, 1}, {"_arrow_ipc___RecordBatchFileReader__schema", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__schema, 1}, {"_arrow_ipc___RecordBatchFileReader__num_record_batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__num_record_batches, 1}, {"_arrow_ipc___RecordBatchFileReader__ReadRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__ReadRecordBatch, 2}, {"_arrow_ipc___RecordBatchFileReader__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__Open, 1}, {"_arrow_Table__from_RecordBatchFileReader", (DL_FUNC) &_arrow_Table__from_RecordBatchFileReader, 1}, {"_arrow_Table__from_RecordBatchStreamReader", (DL_FUNC) &_arrow_Table__from_RecordBatchStreamReader, 1}, - {"_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 2}, - {"_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 2}, - 
{"_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 3}, + {"_arrow_ipc___RecordBatchFileReader__batches", (DL_FUNC) &_arrow_ipc___RecordBatchFileReader__batches, 1}, + {"_arrow_ipc___RecordBatchWriter__WriteRecordBatch", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteRecordBatch, 2}, {"_arrow_ipc___RecordBatchWriter__WriteTable", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__WriteTable, 2}, {"_arrow_ipc___RecordBatchWriter__Close", (DL_FUNC) &_arrow_ipc___RecordBatchWriter__Close, 1}, + {"_arrow_ipc___RecordBatchFileWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchFileWriter__Open, 2}, + {"_arrow_ipc___RecordBatchStreamWriter__Open", (DL_FUNC) &_arrow_ipc___RecordBatchStreamWriter__Open, 2}, {"_arrow_Table__from_dataframe", (DL_FUNC) &_arrow_Table__from_dataframe, 1}, {"_arrow_Table__num_columns", (DL_FUNC) &_arrow_Table__num_columns, 1}, {"_arrow_Table__num_rows", (DL_FUNC) &_arrow_Table__num_rows, 1}, {"_arrow_Table__schema", (DL_FUNC) &_arrow_Table__schema, 1}, {"_arrow_Table__to_dataframe", (DL_FUNC) &_arrow_Table__to_dataframe, 1}, {"_arrow_Table__column", (DL_FUNC) &_arrow_Table__column, 2}, + {"_arrow_Table__columns", (DL_FUNC) &_arrow_Table__columns, 1}, {NULL, NULL, 0} }; diff --git a/r/src/array.cpp b/r/src/array.cpp index 038d7862f41f7..901f2b69bedb4 100644 --- a/r/src/array.cpp +++ b/r/src/array.cpp @@ -534,7 +534,7 @@ struct Converter_SimpleArray { if (n == null_count) { std::fill_n(data.begin() + start, n, default_value()); } else { - auto p_values = GetValuesSafely(array->data(), 1, array->offset()); + auto p_values = array->data()->GetValues(1); STOP_IF_NULL(p_values); // first copy all the data @@ -566,9 +566,9 @@ struct Converter_String { if (null_count == n) { std::fill_n(data.begin(), n, NA_STRING); } else { - auto p_offset = GetValuesSafely(array->data(), 1, array->offset()); + auto p_offset = array->data()->GetValues(1); STOP_IF_NULL(p_offset); - auto p_data = GetValuesSafely(array->data(), 2, *p_offset); + auto p_data = array->data()->GetValues(2, *p_offset); if (!p_data) { // There is an offset buffer, but the data buffer is null // There is at least one value in the array and not all the values are null @@ -615,7 +615,7 @@ struct Converter_Boolean { std::fill_n(data.begin() + start, n, NA_LOGICAL); } else { // process the data - auto p_data = GetValuesSafely(array->data(), 1, 0); + auto p_data = array->data()->GetValues(1, 0); STOP_IF_NULL(p_data); arrow::internal::BitmapReader data_reader(p_data, array->offset(), n); @@ -661,7 +661,7 @@ struct Converter_Dictionary_Int32Indices { std::fill_n(data.begin() + start, n, NA_INTEGER); } else { std::shared_ptr indices = dict_array->indices(); - auto p_array = GetValuesSafely(indices->data(), 1, indices->offset()); + auto p_array = indices->data()->GetValues(1); STOP_IF_NULL(p_array); if (array->null_count()) { @@ -692,7 +692,7 @@ struct Converter_Date64 { if (null_count == n) { std::fill_n(data.begin() + start, n, NA_REAL); } else { - auto p_values = GetValuesSafely(array->data(), 1, array->offset()); + auto p_values = array->data()->GetValues(1); STOP_IF_NULL(p_values); auto p_vec = data.begin() + start; @@ -726,7 +726,7 @@ struct Converter_Promotion { if (null_count == n) { std::fill_n(data.begin() + start, n, default_value()); } else { - auto p_values = GetValuesSafely(array->data(), 1, array->offset()); + auto p_values = array->data()->GetValues(1); STOP_IF_NULL(p_values); auto value_convert = [](value_type value) { @@ -766,7 +766,7 @@ struct Converter_Time 
{ if (n == null_count) { std::fill_n(data.begin() + start, n, NA_REAL); } else { - auto p_values = GetValuesSafely(array->data(), 1, array->offset()); + auto p_values = array->data()->GetValues(1); STOP_IF_NULL(p_values); auto p_vec = data.begin() + start; auto convert = [this](value_type value) { @@ -803,7 +803,7 @@ struct Converter_Int64 { if (null_count == n) { std::fill_n(reinterpret_cast(data.begin()) + start, n, NA_INT64); } else { - auto p_values = GetValuesSafely(array->data(), 1, array->offset()); + auto p_values = array->data()->GetValues(1); STOP_IF_NULL(p_values); auto p_vec = reinterpret_cast(data.begin()) + start; diff --git a/r/src/arrow_types.h b/r/src/arrow_types.h index 419705f9fcde0..dba7a91c21e33 100644 --- a/r/src/arrow_types.h +++ b/r/src/arrow_types.h @@ -152,6 +152,7 @@ using LogicalVector_ = Rcpp::Vector; using StringVector_ = Rcpp::Vector; using CharacterVector_ = StringVector_; using RawVector_ = Rcpp::Vector; +using List_ = Rcpp::Vector; template inline typename Rcpp::Vector::stored_type default_value() { @@ -172,17 +173,6 @@ std::shared_ptr RecordBatch__from_dataframe(Rcpp::DataFrame namespace arrow { namespace r { -template -inline const T* GetValuesSafely(const std::shared_ptr& data, int i, - int64_t offset) { - auto buffer = data->buffers[i]; - if (!buffer) { - return nullptr; - } else { - return reinterpret_cast(buffer->data()) + offset; - } -} - template > class RBuffer : public MutableBuffer { public: diff --git a/r/src/recordbatch.cpp b/r/src/recordbatch.cpp index 829ad45eadbfc..b6bee7ae53927 100644 --- a/r/src/recordbatch.cpp +++ b/r/src/recordbatch.cpp @@ -40,6 +40,17 @@ std::shared_ptr RecordBatch__schema( return x->schema(); } +// [[Rcpp::export]] +arrow::ArrayVector RecordBatch__columns( + const std::shared_ptr& batch) { + auto nc = batch->num_columns(); + ArrayVector res(nc); + for (int i = 0; i < nc; i++) { + res[i] = batch->column(i); + } + return res; +} + // [[Rcpp::export]] std::shared_ptr RecordBatch__column( const std::shared_ptr& batch, int i) { @@ -120,3 +131,32 @@ std::shared_ptr RecordBatch__Slice2( const std::shared_ptr& self, int offset, int length) { return self->Slice(offset, length); } + +// [[Rcpp::export]] +RawVector ipc___SerializeRecordBatch__Raw( + const std::shared_ptr& batch) { + // how many bytes do we need ? 
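+  // GetRecordBatchSize computes the exact number of bytes the serialized
+  // batch will occupy, so the raw vector below is allocated once, up front.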
+ int64_t size; + STOP_IF_NOT_OK(arrow::ipc::GetRecordBatchSize(*batch, &size)); + + // allocate the result raw vector + RawVector out(no_init(size)); + + // serialize into the bytes of the raw vector + auto buffer = std::make_shared>(out); + arrow::io::FixedSizeBufferWriter stream(buffer); + STOP_IF_NOT_OK( + arrow::ipc::SerializeRecordBatch(*batch, arrow::default_memory_pool(), &stream)); + STOP_IF_NOT_OK(stream.Close()); + + return out; +} + +// [[Rcpp::export]] +std::shared_ptr ipc___ReadRecordBatch__InputStream__Schema( + const std::shared_ptr& stream, + const std::shared_ptr& schema) { + std::shared_ptr batch; + STOP_IF_NOT_OK(arrow::ipc::ReadRecordBatch(schema, stream.get(), &batch)); + return batch; +} diff --git a/r/src/recordbatchreader.cpp b/r/src/recordbatchreader.cpp index 65a1c9baf3b95..f3e90228d3ce0 100644 --- a/r/src/recordbatchreader.cpp +++ b/r/src/recordbatchreader.cpp @@ -41,6 +41,22 @@ std::shared_ptr ipc___RecordBatchStreamReader__Open( return reader; } +// [[Rcpp::export]] +std::vector> ipc___RecordBatchStreamReader__batches( + const std::shared_ptr& reader) { + std::vector> res; + + while (true) { + std::shared_ptr batch; + STOP_IF_NOT_OK(reader->ReadNext(&batch)); + if (!batch) break; + + res.push_back(batch); + } + + return res; +} + // -------- RecordBatchFileReader // [[Rcpp::export]] @@ -104,3 +120,16 @@ std::shared_ptr Table__from_RecordBatchStreamReader( return table; } + +// [[Rcpp::export]] +std::vector> ipc___RecordBatchFileReader__batches( + const std::shared_ptr& reader) { + auto n = reader->num_record_batches(); + std::vector> res(n); + + for (int i = 0; i < n; i++) { + STOP_IF_NOT_OK(reader->ReadRecordBatch(i, &res[i])); + } + + return res; +} diff --git a/r/src/recordbatchwriter.cpp b/r/src/recordbatchwriter.cpp index f86c474fec311..d4dd212a9bd11 100644 --- a/r/src/recordbatchwriter.cpp +++ b/r/src/recordbatchwriter.cpp @@ -17,6 +17,26 @@ #include "arrow_types.h" +// [[Rcpp::export]] +void ipc___RecordBatchWriter__WriteRecordBatch( + const std::shared_ptr& batch_writer, + const std::shared_ptr& batch) { + STOP_IF_NOT_OK(batch_writer->WriteRecordBatch(*batch, true)); +} + +// [[Rcpp::export]] +void ipc___RecordBatchWriter__WriteTable( + const std::shared_ptr& batch_writer, + const std::shared_ptr& table) { + STOP_IF_NOT_OK(batch_writer->WriteTable(*table)); +} + +// [[Rcpp::export]] +void ipc___RecordBatchWriter__Close( + const std::shared_ptr& batch_writer) { + STOP_IF_NOT_OK(batch_writer->Close()); +} + // [[Rcpp::export]] std::shared_ptr ipc___RecordBatchFileWriter__Open( const std::shared_ptr& stream, @@ -36,23 +56,3 @@ std::shared_ptr ipc___RecordBatchStreamWriter__Op arrow::ipc::RecordBatchStreamWriter::Open(stream.get(), schema, &stream_writer)); return stream_writer; } - -// [[Rcpp::export]] -void ipc___RecordBatchWriter__WriteRecordBatch( - const std::shared_ptr& batch_writer, - const std::shared_ptr& batch, bool allow_64bit) { - STOP_IF_NOT_OK(batch_writer->WriteRecordBatch(*batch, allow_64bit)); -} - -// [[Rcpp::export]] -void ipc___RecordBatchWriter__WriteTable( - const std::shared_ptr& batch_writer, - const std::shared_ptr& table) { - STOP_IF_NOT_OK(batch_writer->WriteTable(*table)); -} - -// [[Rcpp::export]] -void ipc___RecordBatchWriter__Close( - const std::shared_ptr& batch_writer) { - STOP_IF_NOT_OK(batch_writer->Close()); -} diff --git a/r/src/table.cpp b/r/src/table.cpp index 4bdff167db9c9..f4ebd0466b918 100644 --- a/r/src/table.cpp +++ b/r/src/table.cpp @@ -67,3 +67,14 @@ std::shared_ptr Table__column(const std::shared_ptr int i) { 
return table->column(i); } + +// [[Rcpp::export]] +std::vector> Table__columns( + const std::shared_ptr& table) { + auto nc = table->num_columns(); + std::vector> res(nc); + for (int i = 0; i < nc; i++) { + res[i] = table->column(i); + } + return res; +} diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index cbf67e711d1b8..e456fe8865496 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -19,35 +19,35 @@ context("arrow::Array") test_that("Array", { x <- array(1:10, 1:10, 1:5) - expect_equal(x$type(), int32()) + expect_equal(x$type, int32()) expect_equal(x$length(), 25L) expect_equal(x$as_vector(), c(1:10, 1:10, 1:5)) y <- x$Slice(10) - expect_equal(y$type(), int32()) + expect_equal(y$type, int32()) expect_equal(y$length(), 15L) expect_equal(y$as_vector(), c(1:10, 1:5)) expect_true(x$RangeEquals(y, 10, 24, 0)) z <- x$Slice(10, 5) - expect_equal(z$type(), int32()) + expect_equal(z$type, int32()) expect_equal(z$length(), 5L) expect_equal(z$as_vector(), c(1:5)) expect_true(x$RangeEquals(z, 10, 15, 0)) x_dbl <- array(c(1,2,3), c(4,5,6)) - expect_equal(x_dbl$type(), float64()) + expect_equal(x_dbl$type, float64()) expect_equal(x_dbl$length(), 6L) expect_equal(x_dbl$as_vector(), as.numeric(1:6)) y_dbl <- x_dbl$Slice(3) - expect_equal(y_dbl$type(), float64()) + expect_equal(y_dbl$type, float64()) expect_equal(y_dbl$length(), 3L) - expect_equal(y_dbl$offset(), 3L) + expect_equal(y_dbl$offset, 3L) expect_equal(y_dbl$as_vector(), as.numeric(4:6)) z_dbl <- x_dbl$Slice(3, 2) - expect_equal(z_dbl$type(), float64()) + expect_equal(z_dbl$type, float64()) expect_equal(z_dbl$length(), 2L) expect_equal(z_dbl$as_vector(), as.numeric(4:5)) }) @@ -138,7 +138,7 @@ test_that("Array supports unordered factors (ARROW-3355)", { f <- factor(c("itsy", "bitsy", "spider", "spider")) arr_fac <- array(f) expect_equal(arr_fac$length(), 4L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_identical(arr_fac$as_vector(), f) expect_true(arr_fac$IsValid(0)) expect_true(arr_fac$IsValid(1)) @@ -147,7 +147,7 @@ test_that("Array supports unordered factors (ARROW-3355)", { sl <- arr_fac$Slice(1) expect_equal(sl$length(), 3L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_equal(sl$as_vector(), f[2:4]) # with NA @@ -155,7 +155,7 @@ test_that("Array supports unordered factors (ARROW-3355)", { # TODO: rm the suppressWarnings when https://github.com/r-lib/vctrs/issues/109 arr_fac <- suppressWarnings(array(f)) expect_equal(arr_fac$length(), 5L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_identical(arr_fac$as_vector(), f) expect_true(arr_fac$IsValid(0)) expect_true(arr_fac$IsValid(1)) @@ -165,7 +165,7 @@ test_that("Array supports unordered factors (ARROW-3355)", { sl <- arr_fac$Slice(1) expect_equal(sl$length(), 4L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_equal(sl$as_vector(), f[2:5]) }) @@ -174,7 +174,7 @@ test_that("Array supports ordered factors (ARROW-3355)", { f <- ordered(c("itsy", "bitsy", "spider", "spider")) arr_fac <- array(f) expect_equal(arr_fac$length(), 4L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_identical(arr_fac$as_vector(), f) expect_true(arr_fac$IsValid(0)) expect_true(arr_fac$IsValid(1)) @@ -183,7 +183,7 @@ test_that("Array supports ordered factors 
(ARROW-3355)", { sl <- arr_fac$Slice(1) expect_equal(sl$length(), 3L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_equal(sl$as_vector(), f[2:4]) # with NA @@ -191,7 +191,7 @@ test_that("Array supports ordered factors (ARROW-3355)", { # TODO: rm the suppressWarnings when https://github.com/r-lib/vctrs/issues/109 arr_fac <- suppressWarnings(array(f)) expect_equal(arr_fac$length(), 5L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_identical(arr_fac$as_vector(), f) expect_true(arr_fac$IsValid(0)) expect_true(arr_fac$IsValid(1)) @@ -201,27 +201,27 @@ test_that("Array supports ordered factors (ARROW-3355)", { sl <- arr_fac$Slice(1) expect_equal(sl$length(), 4L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_equal(sl$as_vector(), f[2:5]) }) test_that("array supports Date (ARROW-3340)", { d <- Sys.Date() + 1:10 a <- array(d) - expect_equal(a$type(), date32()) + expect_equal(a$type, date32()) expect_equal(a$length(), 10L) expect_equal(a$as_vector(), d) d[5] <- NA a <- array(d) - expect_equal(a$type(), date32()) + expect_equal(a$type, date32()) expect_equal(a$length(), 10L) expect_equal(a$as_vector(), d) expect_true(a$IsNull(4)) d2 <- d + .5 a <- array(d2) - expect_equal(a$type(), date32()) + expect_equal(a$type, date32()) expect_equal(a$length(), 10L) expect_equal(a$as_vector(), d) expect_true(a$IsNull(4)) @@ -230,15 +230,15 @@ test_that("array supports Date (ARROW-3340)", { test_that("array supports POSIXct (ARROW-3340)", { times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 a <- array(times) - expect_equal(a$type()$name(), "timestamp") - expect_equal(a$type()$unit(), unclass(TimeUnit$MICRO)) + expect_equal(a$type$name, "timestamp") + expect_equal(a$type$unit(), unclass(TimeUnit$MICRO)) expect_equal(a$length(), 10L) expect_equal(as.numeric(a$as_vector()), as.numeric(times)) times[5] <- NA a <- array(times) - expect_equal(a$type()$name(), "timestamp") - expect_equal(a$type()$unit(), unclass(TimeUnit$MICRO)) + expect_equal(a$type$name, "timestamp") + expect_equal(a$type$unit(), unclass(TimeUnit$MICRO)) expect_equal(a$length(), 10L) expect_equal(as.numeric(a$as_vector()), as.numeric(times)) expect_true(a$IsNull(4)) @@ -247,13 +247,13 @@ test_that("array supports POSIXct (ARROW-3340)", { test_that("array supports integer64", { x <- bit64::as.integer64(1:10) a <- array(x) - expect_equal(a$type(), int64()) + expect_equal(a$type, int64()) expect_equal(a$length(), 10L) expect_equal(a$as_vector(), x) x[4] <- NA a <- array(x) - expect_equal(a$type(), int64()) + expect_equal(a$type, int64()) expect_equal(a$length(), 10L) expect_equal(a$as_vector(), x) expect_true(a$IsNull(3L)) @@ -268,12 +268,12 @@ test_that("array$as_vector() correctly handles all NA inte64 (ARROW-3795)", { test_that("array supports difftime", { time <- hms::hms(56, 34, 12) a <- array(time, time) - expect_equal(a$type(), time32(unit = TimeUnit$SECOND)) + expect_equal(a$type, time32(unit = TimeUnit$SECOND)) expect_equal(a$length(), 2L) expect_equal(a$as_vector(), c(time, time)) a <- array(time, NA) - expect_equal(a$type(), time32(unit = TimeUnit$SECOND)) + expect_equal(a$type, time32(unit = TimeUnit$SECOND)) expect_equal(a$length(), 2L) expect_true(a$IsNull(1)) expect_equal(a$as_vector()[1], time) @@ -284,7 +284,7 @@ test_that("support for NaN (ARROW-3615)", { x <- c(1, NA, NaN, -1) y <- array(x) expect_true(y$IsValid(2)) - expect_equal(y$null_count(), 1L) 
+ expect_equal(y$null_count, 1L) }) test_that("array ignores the type argument (ARROW-3784)", { @@ -300,10 +300,10 @@ test_that("integer types casts (ARROW-3741)", { a_int32 <- a$cast(int32()) a_int64 <- a$cast(int64()) - expect_equal(a_int8$type(), int8()) - expect_equal(a_int16$type(), int16()) - expect_equal(a_int32$type(), int32()) - expect_equal(a_int64$type(), int64()) + expect_equal(a_int8$type, int8()) + expect_equal(a_int16$type, int16()) + expect_equal(a_int32$type, int32()) + expect_equal(a_int64$type, int64()) expect_true(a_int8$IsNull(10L)) expect_true(a_int16$IsNull(10L)) expect_true(a_int32$IsNull(10L)) @@ -314,10 +314,10 @@ test_that("integer types casts (ARROW-3741)", { a_uint32 <- a$cast(uint32()) a_uint64 <- a$cast(uint64()) - expect_equal(a_uint8$type(), uint8()) - expect_equal(a_uint16$type(), uint16()) - expect_equal(a_uint32$type(), uint32()) - expect_equal(a_uint64$type(), uint64()) + expect_equal(a_uint8$type, uint8()) + expect_equal(a_uint16$type, uint16()) + expect_equal(a_uint32$type, uint32()) + expect_equal(a_uint64$type, uint64()) expect_true(a_uint8$IsNull(10L)) expect_true(a_uint16$IsNull(10L)) expect_true(a_uint32$IsNull(10L)) @@ -345,8 +345,8 @@ test_that("float types casts (ARROW-3741)", { a_f32 <- a$cast(float32()) a_f64 <- a$cast(float64()) - expect_equal(a_f32$type(), float32()) - expect_equal(a_f64$type(), float64()) + expect_equal(a_f32$type, float32()) + expect_equal(a_f64$type, float64()) expect_true(a_f32$IsNull(3L)) expect_true(a_f64$IsNull(3L)) @@ -359,5 +359,5 @@ test_that("cast to half float works", { skip("until https://issues.apache.org/jira/browse/ARROW-3802") a <- array(1:4) a_f16 <- a$cast(float16()) - expect_equal(a_16$type(), float16()) + expect_equal(a_f16$type, float16()) }) diff --git a/r/tests/testthat/test-DataType.R b/r/tests/testthat/test-DataType.R index b479e5a3f6798..fc9fc896eaee8 100644 --- a/r/tests/testthat/test-DataType.R +++ b/r/tests/testthat/test-DataType.R @@ -19,8 +19,8 @@ context("arrow::DataType") test_that("null type works as expected",{ x <- null() - expect_equal(x$id(), 0L) - expect_equal(x$name(), "null") + expect_equal(x$id, 0L) + expect_equal(x$name, "null") expect_equal(x$ToString(), "null") expect_true(x == x) expect_false(x == int8()) @@ -30,134 +30,134 @@ test_that("null type works as expected",{ test_that("boolean type work as expected",{ x <- boolean() - expect_equal(x$id(), 1L) - expect_equal(x$name(), "bool") + expect_equal(x$id, 1L) + expect_equal(x$name, "bool") expect_equal(x$ToString(), "bool") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 1L) + expect_equal(x$bit_width, 1L) }) test_that("int types works as expected",{ x <- uint8() - expect_equal(x$id(), 2L) - expect_equal(x$name(), "uint8") + expect_equal(x$id, 2L) + expect_equal(x$name, "uint8") expect_equal(x$ToString(), "uint8") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 8L) + expect_equal(x$bit_width, 8L) x <- int8() - expect_equal(x$id(), 3L) - expect_equal(x$name(), "int8") + expect_equal(x$id, 3L) + expect_equal(x$name, "int8") expect_equal(x$ToString(), "int8") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 8L) + expect_equal(x$bit_width, 8L) x <- uint16() - expect_equal(x$id(), 4L) - expect_equal(x$name(), "uint16") + expect_equal(x$id, 4L) + 
expect_equal(x$name, "uint16") expect_equal(x$ToString(), "uint16") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 16L) + expect_equal(x$bit_width, 16L) x <- int16() - expect_equal(x$id(), 5L) - expect_equal(x$name(), "int16") + expect_equal(x$id, 5L) + expect_equal(x$name, "int16") expect_equal(x$ToString(), "int16") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 16L) + expect_equal(x$bit_width, 16L) x <- uint32() - expect_equal(x$id(), 6L) - expect_equal(x$name(), "uint32") + expect_equal(x$id, 6L) + expect_equal(x$name, "uint32") expect_equal(x$ToString(), "uint32") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 32L) + expect_equal(x$bit_width, 32L) x <- int32() - expect_equal(x$id(), 7L) - expect_equal(x$name(), "int32") + expect_equal(x$id, 7L) + expect_equal(x$name, "int32") expect_equal(x$ToString(), "int32") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 32L) + expect_equal(x$bit_width, 32L) x <- uint64() - expect_equal(x$id(), 8L) - expect_equal(x$name(), "uint64") + expect_equal(x$id, 8L) + expect_equal(x$name, "uint64") expect_equal(x$ToString(), "uint64") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) x <- int64() - expect_equal(x$id(), 9L) - expect_equal(x$name(), "int64") + expect_equal(x$id, 9L) + expect_equal(x$name, "int64") expect_equal(x$ToString(), "int64") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) }) test_that("float types work as expected",{ x <- float16() - expect_equal(x$id(), 10L) - expect_equal(x$name(), "halffloat") + expect_equal(x$id, 10L) + expect_equal(x$name, "halffloat") expect_equal(x$ToString(), "halffloat") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 16L) + expect_equal(x$bit_width, 16L) x <- float32() - expect_equal(x$id(), 11L) - expect_equal(x$name(), "float") + expect_equal(x$id, 11L) + expect_equal(x$name, "float") expect_equal(x$ToString(), "float") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 32L) + expect_equal(x$bit_width, 32L) x <- float64() - expect_equal(x$id(), 12L) - expect_equal(x$name(), "double") + expect_equal(x$id, 12L) + expect_equal(x$name, "double") expect_equal(x$ToString(), "double") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) }) test_that("utf8 type works as expected",{ x <- utf8() - expect_equal(x$id(), 13L) - expect_equal(x$name(), "utf8") + expect_equal(x$id, 13L) + expect_equal(x$name, "utf8") expect_equal(x$ToString(), "string") expect_true(x == x) expect_false(x == null()) @@ -167,8 +167,8 @@ test_that("utf8 type works as expected",{ test_that("date types work as expected", { x <- date32() - expect_equal(x$id(), 16L) - 
expect_equal(x$name(), "date32") + expect_equal(x$id, 16L) + expect_equal(x$name, "date32") expect_equal(x$ToString(), "date32[day]") expect_true(x == x) expect_false(x == null()) @@ -177,8 +177,8 @@ test_that("date types work as expected", { expect_equal(x$unit(), unclass(DateUnit$DAY)) x <- date64() - expect_equal(x$id(), 17L) - expect_equal(x$name(), "date64") + expect_equal(x$id, 17L) + expect_equal(x$name, "date64") expect_equal(x$ToString(), "date64[ms]") expect_true(x == x) expect_false(x == null()) @@ -189,106 +189,106 @@ test_that("date types work as expected", { test_that("timestamp type works as expected", { x <- timestamp(TimeUnit$SECOND) - expect_equal(x$id(), 18L) - expect_equal(x$name(), "timestamp") + expect_equal(x$id, 18L) + expect_equal(x$name, "timestamp") expect_equal(x$ToString(), "timestamp[s]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) expect_equal(x$timezone(), "") expect_equal(x$unit(), unclass(TimeUnit$SECOND)) x <- timestamp(TimeUnit$MILLI) - expect_equal(x$id(), 18L) - expect_equal(x$name(), "timestamp") + expect_equal(x$id, 18L) + expect_equal(x$name, "timestamp") expect_equal(x$ToString(), "timestamp[ms]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) expect_equal(x$timezone(), "") expect_equal(x$unit(), unclass(TimeUnit$MILLI)) x <- timestamp(TimeUnit$MICRO) - expect_equal(x$id(), 18L) - expect_equal(x$name(), "timestamp") + expect_equal(x$id, 18L) + expect_equal(x$name, "timestamp") expect_equal(x$ToString(), "timestamp[us]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) expect_equal(x$timezone(), "") expect_equal(x$unit(), unclass(TimeUnit$MICRO)) x <- timestamp(TimeUnit$NANO) - expect_equal(x$id(), 18L) - expect_equal(x$name(), "timestamp") + expect_equal(x$id, 18L) + expect_equal(x$name, "timestamp") expect_equal(x$ToString(), "timestamp[ns]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) expect_equal(x$timezone(), "") expect_equal(x$unit(), unclass(TimeUnit$NANO)) }) test_that("time32 types work as expected", { x <- time32(TimeUnit$SECOND) - expect_equal(x$id(), 19L) - expect_equal(x$name(), "time32") + expect_equal(x$id, 19L) + expect_equal(x$name, "time32") expect_equal(x$ToString(), "time32[s]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 32L) + expect_equal(x$bit_width, 32L) expect_equal(x$unit(), unclass(TimeUnit$SECOND)) x <- time32(TimeUnit$MILLI) - expect_equal(x$id(), 19L) - expect_equal(x$name(), "time32") + expect_equal(x$id, 19L) + expect_equal(x$name, "time32") expect_equal(x$ToString(), "time32[ms]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 32L) + expect_equal(x$bit_width, 32L) expect_equal(x$unit(), unclass(TimeUnit$MILLI)) }) test_that("time64 types work as expected", { x <- time64(TimeUnit$MICRO) - expect_equal(x$id(), 20L) - expect_equal(x$name(), "time64") + expect_equal(x$id, 20L) + 
expect_equal(x$name, "time64") expect_equal(x$ToString(), "time64[us]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) expect_equal(x$unit(), unclass(TimeUnit$MICRO)) x <- time64(TimeUnit$NANO) - expect_equal(x$id(), 20L) - expect_equal(x$name(), "time64") + expect_equal(x$id, 20L) + expect_equal(x$name, "time64") expect_equal(x$ToString(), "time64[ns]") expect_true(x == x) expect_false(x == null()) expect_equal(x$num_children(), 0L) expect_equal(x$children(), list()) - expect_equal(x$bit_width(), 64L) + expect_equal(x$bit_width, 64L) expect_equal(x$unit(), unclass(TimeUnit$NANO)) }) test_that("list type works as expected", { x <- list_of(int32()) - expect_equal(x$id(), 23L) - expect_equal(x$name(), "list") + expect_equal(x$id, 23L) + expect_equal(x$name, "list") expect_equal(x$ToString(), "list") expect_true(x == x) expect_false(x == null()) @@ -301,8 +301,8 @@ test_that("list type works as expected", { test_that("struct type works as expected", { x <- struct(x = int32(), y = boolean()) - expect_equal(x$id(), 24L) - expect_equal(x$name(), "struct") + expect_equal(x$id, 24L) + expect_equal(x$name, "struct") expect_equal(x$ToString(), "struct") expect_true(x == x) expect_false(x == null()) @@ -318,9 +318,9 @@ test_that("DictionaryType works as expected (ARROW-3355)", { expect_equal(d, d) expect_true(d == d) expect_false(d == int32()) - expect_equal(d$id(), Type$DICTIONARY) - expect_equal(d$bit_width(), 32L) + expect_equal(d$id, Type$DICTIONARY) + expect_equal(d$bit_width, 32L) expect_equal(d$ToString(), "dictionary") - expect_equal(d$index_type(), int32()) - expect_equal(d$dictionary(), array(c("foo", "bar", "baz"))) + expect_equal(d$index_type, int32()) + expect_equal(d$dictionary, array(c("foo", "bar", "baz"))) }) diff --git a/r/tests/testthat/test-RecordBatch.R b/r/tests/testthat/test-RecordBatch.R index 348327783fda8..f40bd8387ad74 100644 --- a/r/tests/testthat/test-RecordBatch.R +++ b/r/tests/testthat/test-RecordBatch.R @@ -28,15 +28,15 @@ test_that("RecordBatch", { expect_true(batch == batch) expect_equal( - batch$schema(), + batch$schema, schema( int = int32(), dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int32(), array(letters[1:10])) ) ) - expect_equal(batch$num_columns(), 5L) - expect_equal(batch$num_rows(), 10L) + expect_equal(batch$num_columns, 5L) + expect_equal(batch$num_rows, 10L) expect_equal(batch$column_name(0), "int") expect_equal(batch$column_name(1), "dbl") expect_equal(batch$column_name(2), "lgl") @@ -47,32 +47,32 @@ test_that("RecordBatch", { col_int <- batch$column(0) expect_true(inherits(col_int, 'arrow::Array')) expect_equal(col_int$as_vector(), tbl$int) - expect_equal(col_int$type(), int32()) + expect_equal(col_int$type, int32()) col_dbl <- batch$column(1) expect_true(inherits(col_dbl, 'arrow::Array')) expect_equal(col_dbl$as_vector(), tbl$dbl) - expect_equal(col_dbl$type(), float64()) + expect_equal(col_dbl$type, float64()) col_lgl <- batch$column(2) expect_true(inherits(col_dbl, 'arrow::Array')) expect_equal(col_lgl$as_vector(), tbl$lgl) - expect_equal(col_lgl$type(), boolean()) + expect_equal(col_lgl$type, boolean()) col_chr <- batch$column(3) expect_true(inherits(col_chr, 'arrow::Array')) expect_equal(col_chr$as_vector(), tbl$chr) - expect_equal(col_chr$type(), utf8()) + expect_equal(col_chr$type, utf8()) col_fct <- batch$column(4) expect_true(inherits(col_fct, 'arrow::Array')) 
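The hunks above all make the same mechanical change in the R bindings: scalar accessors move from R6 methods to active bindings. A minimal sketch of the resulting usage, assuming only the API these tests exercise (values taken from the hunks above):

```r
library(arrow)

# After this patch, scalar attributes are active bindings and are read
# without parentheses; operations such as ToString() remain methods.
x <- int32()
x$id          # 7L,      was x$id()
x$name        # "int32", was x$name()
x$bit_width   # 32L,     was x$bit_width()
x$ToString()  # "int32", still a method call
```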
expect_equal(col_fct$as_vector(), tbl$fct) - expect_equal(col_fct$type(), dictionary(int32(), array(letters[1:10]))) + expect_equal(col_fct$type, dictionary(int32(), array(letters[1:10]))) batch2 <- batch$RemoveColumn(0) expect_equal( - batch2$schema(), + batch2$schema, schema(dbl = float64(), lgl = boolean(), chr = utf8(), fct = dictionary(int32(), array(letters[1:10]))) ) expect_equal(batch2$column(0), batch$column(1)) @@ -95,10 +95,10 @@ test_that("RecordBatch with 0 rows are supported", { ) batch <- record_batch(tbl) - expect_equal(batch$num_columns(), 5L) - expect_equal(batch$num_rows(), 0L) + expect_equal(batch$num_columns, 5L) + expect_equal(batch$num_rows, 0L) expect_equal( - batch$schema(), + batch$schema, schema( int = int32(), dbl = float64(), @@ -107,67 +107,6 @@ test_that("RecordBatch with 0 rows are supported", { fct = dictionary(int32(), array(c("a", "b"))) ) ) - - tf <- local_tempfile() - write_record_batch(batch, tf) - res <- read_record_batch(tf) - expect_equal(res, batch) -}) - -test_that("read_record_batch handles various streams (ARROW-3450, ARROW-3505)", { - tbl <- tibble::tibble( - int = 1:10, dbl = as.numeric(1:10), - lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), - chr = letters[1:10] - ) - batch <- record_batch(tbl) - tf <- local_tempfile() - write_record_batch(batch, tf) - - bytes <- write_record_batch(batch, raw()) - buf_reader <- buffer_reader(bytes) - - batch1 <- read_record_batch(tf) - batch2 <- read_record_batch(fs::path_abs(tf)) - - readable_file <- close_on_exit(file_open(tf)) - batch3 <- read_record_batch(readable_file) - - mmap_file <- close_on_exit(mmap_open(tf)) - batch4 <- read_record_batch(mmap_file) - batch5 <- read_record_batch(bytes) - batch6 <- read_record_batch(buf_reader) - - stream_reader <- record_batch_stream_reader(bytes) - batch7 <- read_record_batch(stream_reader) - expect_null(read_record_batch(stream_reader)) - - file_reader <- record_batch_file_reader(tf) - batch8 <- read_record_batch(file_reader) - expect_null(read_record_batch(file_reader, i = 2)) - - expect_equal(batch, batch1) - expect_equal(batch, batch2) - expect_equal(batch, batch3) - expect_equal(batch, batch4) - expect_equal(batch, batch5) - expect_equal(batch, batch6) - expect_equal(batch, batch7) - expect_equal(batch, batch8) -}) - -test_that("read_record_batch can handle Message, Schema parameters (ARROW-3499)", { - batch <- record_batch(tibble::tibble(x = 1:10)) - stream <- buffer_reader(write_record_batch(batch, raw())) - - # schema - message <- read_message(stream) - - # batch - message <- read_message(stream) - schema <- batch$schema() - batch2 <- read_record_batch(message, schema) - expect_equal(batch, batch2) }) test_that("RecordBatch cast (ARROW-3741)", { @@ -178,7 +117,7 @@ test_that("RecordBatch cast (ARROW-3741)", { s2 <- schema(x = int16(), y = int64()) batch2 <- batch$cast(s2) - expect_equal(batch2$schema(), s2) - expect_equal(batch2$column(0L)$type(), int16()) - expect_equal(batch2$column(1L)$type(), int64()) + expect_equal(batch2$schema, s2) + expect_equal(batch2$column(0L)$type, int16()) + expect_equal(batch2$column(1L)$type, int64()) }) diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index d5db9de24069d..ec1be9b234886 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -24,29 +24,28 @@ test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", { chr = letters[1:10] ) tab <- arrow::table(tbl) + tf <- local_tempfile() - write_table(tab, tf) + write_arrow(tab, tf) - bytes <- 
write_table(tab, raw()) - buf_reader <- buffer_reader(bytes) + bytes <- write_arrow(tab, raw()) tab1 <- read_table(tf) tab2 <- read_table(fs::path_abs(tf)) - readable_file <- close_on_exit(file_open(tf)) - tab3 <- read_table(readable_file) + readable_file <- close_on_exit(ReadableFile(tf)) + tab3 <- read_table(close_on_exit(RecordBatchFileReader(readable_file))) mmap_file <- close_on_exit(mmap_open(tf)) - tab4 <- read_table(mmap_file) + tab4 <- read_table(close_on_exit(RecordBatchFileReader(mmap_file))) tab5 <- read_table(bytes) - tab6 <- read_table(buf_reader) - stream_reader <- record_batch_stream_reader(bytes) - tab7 <- read_table(stream_reader) + stream_reader <- RecordBatchStreamReader(bytes) + tab6 <- read_table(stream_reader) - file_reader <- record_batch_file_reader(tf) - tab8 <- read_table(file_reader) + file_reader <- RecordBatchFileReader(tf) + tab7 <- read_table(file_reader) expect_equal(tab, tab1) expect_equal(tab, tab2) @@ -55,7 +54,6 @@ test_that("read_table handles various input streams (ARROW-3450, ARROW-3505)", { expect_equal(tab, tab5) expect_equal(tab, tab6) expect_equal(tab, tab7) - expect_equal(tab, tab8) }) test_that("Table cast (ARROW-3741)", { @@ -66,7 +64,7 @@ test_that("Table cast (ARROW-3741)", { s2 <- schema(x = int16(), y = int64()) tab2 <- tab$cast(s2) - expect_equal(tab2$schema(), s2) - expect_equal(tab2$column(0L)$type(), int16()) - expect_equal(tab2$column(1L)$type(), int64()) + expect_equal(tab2$schema, s2) + expect_equal(tab2$column(0L)$type, int16()) + expect_equal(tab2$column(1L)$type, int64()) }) diff --git a/r/tests/testthat/test-arraydata.R b/r/tests/testthat/test-arraydata.R index 5d8f8f1dcaa3c..02ca9b8562595 100644 --- a/r/tests/testthat/test-arraydata.R +++ b/r/tests/testthat/test-arraydata.R @@ -24,5 +24,5 @@ test_that("string vectors with only empty strings and nulls don't allocate a dat buffers <- a$data()$buffers expect_null(buffers[[1]]) expect_null(buffers[[3]]) - expect_equal(buffers[[2]]$size(), 8L) + expect_equal(buffers[[2]]$size, 8L) }) diff --git a/r/tests/testthat/test-buffer.R b/r/tests/testthat/test-buffer.R index aa712b026803c..26ec8dfde0a9b 100644 --- a/r/tests/testthat/test-buffer.R +++ b/r/tests/testthat/test-buffer.R @@ -21,26 +21,26 @@ test_that("arrow::Buffer can be created from raw vector", { vec <- raw(123) buf <- buffer(vec) expect_is(buf, "arrow::Buffer") - expect_equal(buf$size(), 123) + expect_equal(buf$size, 123) }) test_that("arrow::Buffer can be created from integer vector", { vec <- integer(17) buf <- buffer(vec) expect_is(buf, "arrow::Buffer") - expect_equal(buf$size(), 17 * 4) + expect_equal(buf$size, 17 * 4) }) test_that("arrow::Buffer can be created from numeric vector", { vec <- numeric(17) buf <- buffer(vec) expect_is(buf, "arrow::Buffer") - expect_equal(buf$size(), 17 * 8) + expect_equal(buf$size, 17 * 8) }) test_that("arrow::Buffer can be created from complex vector", { vec <- complex(3) buf <- buffer(vec) expect_is(buf, "arrow::Buffer") - expect_equal(buf$size(), 3 * 16) + expect_equal(buf$size, 3 * 16) }) diff --git a/r/tests/testthat/test-bufferreader.R b/r/tests/testthat/test-bufferreader.R index e7680a493fc0f..72d257101fa56 100644 --- a/r/tests/testthat/test-bufferreader.R +++ b/r/tests/testthat/test-bufferreader.R @@ -18,9 +18,9 @@ context("arrow::BufferReader") test_that("BufferReader can be created from R objects", { - num <- buffer_reader(numeric(13)) - int <- buffer_reader(integer(13)) - raw <- buffer_reader(raw(16)) + num <- BufferReader(numeric(13)) + int <- BufferReader(integer(13)) + raw <- 
BufferReader(raw(16)) expect_is(num, "arrow::io::BufferReader") expect_is(int, "arrow::io::BufferReader") @@ -33,7 +33,7 @@ test_that("BufferReader can be created from R objects", { test_that("BufferReader can be created from Buffer", { buf <- buffer(raw(76)) - reader <- buffer_reader(buf) + reader <- BufferReader(buf) expect_is(reader, "arrow::io::BufferReader") expect_equal(reader$GetSize(), 76) diff --git a/r/tests/testthat/test-chunkedarray.R b/r/tests/testthat/test-chunkedarray.R index 8bca62014777a..11a196d039d5f 100644 --- a/r/tests/testthat/test-chunkedarray.R +++ b/r/tests/testthat/test-chunkedarray.R @@ -19,38 +19,38 @@ context("arrow::ChunkedArray") test_that("ChunkedArray", { x <- chunked_array(1:10, 1:10, 1:5) - expect_equal(x$type(), int32()) - expect_equal(x$num_chunks(), 3L) + expect_equal(x$type, int32()) + expect_equal(x$num_chunks, 3L) expect_equal(x$length(), 25L) expect_equal(x$as_vector(), c(1:10, 1:10, 1:5)) y <- x$Slice(8) - expect_equal(y$type(), int32()) - expect_equal(y$num_chunks(), 3L) + expect_equal(y$type, int32()) + expect_equal(y$num_chunks, 3L) expect_equal(y$length(), 17L) expect_equal(y$as_vector(), c(9:10, 1:10, 1:5)) z <- x$Slice(8, 5) - expect_equal(z$type(), int32()) - expect_equal(z$num_chunks(), 2L) + expect_equal(z$type, int32()) + expect_equal(z$num_chunks, 2L) expect_equal(z$length(), 5L) expect_equal(z$as_vector(), c(9:10, 1:3)) x_dbl <- chunked_array(c(1,2,3), c(4,5,6)) - expect_equal(x_dbl$type(), float64()) - expect_equal(x_dbl$num_chunks(), 2L) + expect_equal(x_dbl$type, float64()) + expect_equal(x_dbl$num_chunks, 2L) expect_equal(x_dbl$length(), 6L) expect_equal(x_dbl$as_vector(), as.numeric(1:6)) y_dbl <- x_dbl$Slice(2) - expect_equal(y_dbl$type(), float64()) - expect_equal(y_dbl$num_chunks(), 2L) + expect_equal(y_dbl$type, float64()) + expect_equal(y_dbl$num_chunks, 2L) expect_equal(y_dbl$length(), 4L) expect_equal(y_dbl$as_vector(), as.numeric(3:6)) z_dbl <- x_dbl$Slice(2, 2) - expect_equal(z_dbl$type(), float64()) - expect_equal(z_dbl$num_chunks(), 2L) + expect_equal(z_dbl$type, float64()) + expect_equal(z_dbl$num_chunks, 2L) expect_equal(z_dbl$length(), 2L) expect_equal(z_dbl$as_vector(), as.numeric(3:4)) }) @@ -58,19 +58,19 @@ test_that("ChunkedArray", { test_that("ChunkedArray handles !!! 
splicing", { data <- list(1, 2, 3) x <- chunked_array(!!!data) - expect_equal(x$type(), float64()) - expect_equal(x$num_chunks(), 3L) + expect_equal(x$type, float64()) + expect_equal(x$num_chunks, 3L) }) test_that("ChunkedArray handles NA", { data <- list(1:10, c(NA, 2:10), c(1:3, NA, 5L)) x <- chunked_array(!!!data) - expect_equal(x$type(), int32()) - expect_equal(x$num_chunks(), 3L) + expect_equal(x$type, int32()) + expect_equal(x$num_chunks, 3L) expect_equal(x$length(), 25L) expect_equal(x$as_vector(), c(1:10, c(NA, 2:10), c(1:3, NA, 5))) - chunks <- x$chunks() + chunks <- x$chunks expect_equal(Array__Mask(chunks[[1]]), !is.na(data[[1]])) expect_equal(Array__Mask(chunks[[2]]), !is.na(data[[2]])) expect_equal(Array__Mask(chunks[[3]]), !is.na(data[[3]])) @@ -81,10 +81,10 @@ test_that("ChunkedArray supports logical vectors (ARROW-3341)", { data <- purrr::rerun(3, sample(c(TRUE, FALSE, NA), 100, replace = TRUE)) arr_lgl <- chunked_array(!!!data) expect_equal(arr_lgl$length(), 300L) - expect_equal(arr_lgl$null_count(), sum(unlist(map(data, is.na)))) + expect_equal(arr_lgl$null_count, sum(unlist(map(data, is.na)))) expect_identical(arr_lgl$as_vector(), purrr::flatten_lgl(data)) - chunks <- arr_lgl$chunks() + chunks <- arr_lgl$chunks expect_identical(data[[1]], chunks[[1]]$as_vector()) expect_identical(data[[2]], chunks[[2]]$as_vector()) expect_identical(data[[3]], chunks[[3]]$as_vector()) @@ -94,10 +94,10 @@ test_that("ChunkedArray supports logical vectors (ARROW-3341)", { data <- purrr::rerun(3, sample(c(TRUE, FALSE), 100, replace = TRUE)) arr_lgl <- chunked_array(!!!data) expect_equal(arr_lgl$length(), 300L) - expect_equal(arr_lgl$null_count(), sum(unlist(map(data, is.na)))) + expect_equal(arr_lgl$null_count, sum(unlist(map(data, is.na)))) expect_identical(arr_lgl$as_vector(), purrr::flatten_lgl(data)) - chunks <- arr_lgl$chunks() + chunks <- arr_lgl$chunks expect_identical(data[[1]], chunks[[1]]$as_vector()) expect_identical(data[[2]], chunks[[2]]$as_vector()) expect_identical(data[[3]], chunks[[3]]$as_vector()) @@ -112,10 +112,10 @@ test_that("ChunkedArray supports character vectors (ARROW-3339)", { ) arr_chr <- chunked_array(!!!data) expect_equal(arr_chr$length(), length(unlist(data))) - expect_equal(arr_chr$null_count(), 1L) + expect_equal(arr_chr$null_count, 1L) expect_equal(arr_chr$as_vector(), purrr::flatten_chr(data)) - chunks <- arr_chr$chunks() + chunks <- arr_chr$chunks expect_equal(data, purrr::map(chunks, ~.$as_vector())) }) @@ -123,14 +123,14 @@ test_that("ChunkedArray supports factors (ARROW-3716)", { f <- factor(c("itsy", "bitsy", "spider", "spider")) arr_fac <- chunked_array(f, f, f) expect_equal(arr_fac$length(), 12L) - expect_equal(arr_fac$type()$index_type(), int8()) + expect_equal(arr_fac$type$index_type, int8()) expect_identical(arr_fac$as_vector(), vctrs::vec_c(f, f, f)) }) test_that("ChunkedArray supports dates (ARROW-3716)", { d <- Sys.Date() + 1:10 a <- chunked_array(d, d) - expect_equal(a$type(), date32()) + expect_equal(a$type, date32()) expect_equal(a$length(), 20L) expect_equal(a$as_vector(), c(d, d)) }) @@ -138,8 +138,8 @@ test_that("ChunkedArray supports dates (ARROW-3716)", { test_that("ChunkedArray supports POSIXct (ARROW-3716)", { times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 a <- chunked_array(times, times) - expect_equal(a$type()$name(), "timestamp") - expect_equal(a$type()$unit(), unclass(TimeUnit$MICRO)) + expect_equal(a$type$name, "timestamp") + expect_equal(a$type$unit(), unclass(TimeUnit$MICRO)) expect_equal(a$length(), 20L) 
expect_equal(as.numeric(a$as_vector()), as.numeric(c(times, times))) }) @@ -147,7 +147,7 @@ test_that("ChunkedArray supports POSIXct (ARROW-3716)", { test_that("ChunkedArray supports integer64 (ARROW-3716)", { x <- bit64::as.integer64(1:10) a <- chunked_array(x, x) - expect_equal(a$type(), int64()) + expect_equal(a$type, int64()) expect_equal(a$length(), 20L) expect_equal(a$as_vector(), c(x,x)) }) @@ -155,7 +155,7 @@ test_that("ChunkedArray supports integer64 (ARROW-3716)", { test_that("ChunkedArray supports difftime", { time <- hms::hms(56, 34, 12) a <- chunked_array(time, time) - expect_equal(a$type(), time32(unit = TimeUnit$SECOND)) + expect_equal(a$type, time32(unit = TimeUnit$SECOND)) expect_equal(a$length(), 2L) expect_equal(a$as_vector(), c(time, time)) }) @@ -177,10 +177,10 @@ test_that("integer types casts for ChunkedArray (ARROW-3741)", { expect_is(a_int16, "arrow::ChunkedArray") expect_is(a_int32, "arrow::ChunkedArray") expect_is(a_int64, "arrow::ChunkedArray") - expect_equal(a_int8$type(), int8()) - expect_equal(a_int16$type(), int16()) - expect_equal(a_int32$type(), int32()) - expect_equal(a_int64$type(), int64()) + expect_equal(a_int8$type, int8()) + expect_equal(a_int16$type, int16()) + expect_equal(a_int32$type, int32()) + expect_equal(a_int64$type, int64()) a_uint8 <- a$cast(uint8()) a_uint16 <- a$cast(uint16()) @@ -192,8 +192,8 @@ test_that("integer types casts for ChunkedArray (ARROW-3741)", { expect_is(a_uint32, "arrow::ChunkedArray") expect_is(a_uint64, "arrow::ChunkedArray") - expect_equal(a_uint8$type(), uint8()) - expect_equal(a_uint16$type(), uint16()) - expect_equal(a_uint32$type(), uint32()) - expect_equal(a_uint64$type(), uint64()) + expect_equal(a_uint8$type, uint8()) + expect_equal(a_uint16$type, uint16()) + expect_equal(a_uint32$type, uint32()) + expect_equal(a_uint64$type, uint64()) }) diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index f6d9bee581d66..715017fb5865c 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -29,7 +29,7 @@ test_that("feather read/write round trip", { expect_true(fs::file_exists(tf2)) tf3 <- local_tempfile() - stream <- close_on_exit(file_output_stream(tf3)) + stream <- close_on_exit(FileOutputStream(tf3)) write_feather(tib, stream) expect_true(fs::file_exists(tf3)) @@ -47,7 +47,7 @@ test_that("feather read/write round trip", { expect_is(tab4, "arrow::Table") # reading directly from arrow::io::ReadableFile - tab5 <- read_feather(file_open(tf3)) + tab5 <- read_feather(ReadableFile(tf3)) expect_is(tab5, "arrow::Table") expect_equal(tib, as_tibble(tab1)) diff --git a/r/tests/testthat/test-field.R b/r/tests/testthat/test-field.R index 08bf4db36a51b..aaa2875510a15 100644 --- a/r/tests/testthat/test-field.R +++ b/r/tests/testthat/test-field.R @@ -19,8 +19,8 @@ context("arrow::Field") test_that("field() factory", { x <- field("x", int32()) - expect_equal(x$type(), int32()) - expect_equal(x$name(), "x") + expect_equal(x$type, int32()) + expect_equal(x$name, "x") expect_true(x == x) expect_false(x == field("x", int64())) }) diff --git a/r/tests/testthat/test-message.R b/r/tests/testthat/test-message.R index fd05b86056808..3fe5829f86919 100644 --- a/r/tests/testthat/test-message.R +++ b/r/tests/testthat/test-message.R @@ -19,16 +19,12 @@ context("arrow::ipc::Message") test_that("read_message can read from input stream", { batch <- record_batch(tibble::tibble(x = 1:10)) - bytes <- write_record_batch(batch, raw()) - stream <- buffer_reader(bytes) + bytes <- batch$serialize() + 
stream <- BufferReader(bytes) message <- read_message(stream) - expect_equal(message$type(), MessageType$SCHEMA) - expect_is(message$body, "arrow::Buffer") - expect_is(message$metadata, "arrow::Buffer") - - message <- read_message(stream) - expect_equal(message$type(), MessageType$RECORD_BATCH) + expect_is(message, "arrow::ipc::Message") + expect_equal(message$type, MessageType$RECORD_BATCH) expect_is(message$body, "arrow::Buffer") expect_is(message$metadata, "arrow::Buffer") diff --git a/r/tests/testthat/test-messagereader.R b/r/tests/testthat/test-messagereader.R index 4527a2882f022..5ff8277625ddb 100644 --- a/r/tests/testthat/test-messagereader.R +++ b/r/tests/testthat/test-messagereader.R @@ -19,16 +19,13 @@ context("arrow::ipc::MessageReader") test_that("MessageReader can be created from raw vectors", { batch <- record_batch(tibble::tibble(x = 1:10)) - bytes <- write_record_batch(batch, raw()) + bytes <- batch$serialize() - reader <- message_reader(bytes) - message <- reader$ReadNextMessage() - expect_equal(message$type(), MessageType$SCHEMA) - expect_is(message$body, "arrow::Buffer") - expect_is(message$metadata, "arrow::Buffer") + reader <- MessageReader(bytes) message <- reader$ReadNextMessage() - expect_equal(message$type(), MessageType$RECORD_BATCH) + expect_is(message, "arrow::ipc::Message") + expect_equal(message$type, MessageType$RECORD_BATCH) expect_is(message$body, "arrow::Buffer") expect_is(message$metadata, "arrow::Buffer") @@ -38,17 +35,17 @@ test_that("MessageReader can be created from raw vectors", { test_that("MessageReader can be created from input stream", { batch <- record_batch(tibble::tibble(x = 1:10)) - bytes <- write_record_batch(batch, raw()) - stream <- buffer_reader(bytes) + bytes <- batch$serialize() - reader <- message_reader(stream) - message <- reader$ReadNextMessage() - expect_equal(message$type(), MessageType$SCHEMA) - expect_is(message$body, "arrow::Buffer") - expect_is(message$metadata, "arrow::Buffer") + stream <- BufferReader(bytes) + expect_is(stream, "arrow::io::BufferReader") + + reader <- MessageReader(stream) + expect_is(reader, "arrow::ipc::MessageReader") message <- reader$ReadNextMessage() - expect_equal(message$type(), MessageType$RECORD_BATCH) + expect_is(message, "arrow::ipc::Message") + expect_equal(message$type, MessageType$RECORD_BATCH) expect_is(message$body, "arrow::Buffer") expect_is(message$metadata, "arrow::Buffer") diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index 2af718ebe565e..ffc14eba72bdb 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -25,24 +25,24 @@ test_that("arrow::table round trip", { ) tab <- arrow::table(tbl) - expect_equal(tab$num_columns(), 3L) - expect_equal(tab$num_rows(), 10L) + expect_equal(tab$num_columns, 3L) + expect_equal(tab$num_rows, 10L) # arrow::Column col_int <- tab$column(0) expect_equal(col_int$length(), 10L) - expect_equal(col_int$null_count(), 0L) - expect_equal(col_int$type(), int32()) + expect_equal(col_int$null_count, 0L) + expect_equal(col_int$type, int32()) # arrow::ChunkedArray chunked_array_int <- col_int$data() expect_equal(chunked_array_int$length(), 10L) - expect_equal(chunked_array_int$null_count(), 0L) + expect_equal(chunked_array_int$null_count, 0L) expect_equal(chunked_array_int$as_vector(), tbl$int) # arrow::Array - chunks_int <- chunked_array_int$chunks() - expect_equal(length(chunks_int), chunked_array_int$num_chunks()) + chunks_int <- chunked_array_int$chunks + expect_equal(length(chunks_int), 
chunked_array_int$num_chunks) for( i in seq_along(chunks_int)){ expect_equal(chunked_array_int$chunk(i-1L), chunks_int[[i]]) } @@ -50,18 +50,18 @@ test_that("arrow::table round trip", { # arrow::Column col_dbl <- tab$column(1) expect_equal(col_dbl$length(), 10L) - expect_equal(col_dbl$null_count(), 0L) - expect_equal(col_dbl$type(), float64()) + expect_equal(col_dbl$null_count, 0L) + expect_equal(col_dbl$type, float64()) # arrow::ChunkedArray chunked_array_dbl <- col_dbl$data() expect_equal(chunked_array_dbl$length(), 10L) - expect_equal(chunked_array_dbl$null_count(), 0L) + expect_equal(chunked_array_dbl$null_count, 0L) expect_equal(chunked_array_dbl$as_vector(), tbl$dbl) # arrow::Array - chunks_dbl <- chunked_array_dbl$chunks() - expect_equal(length(chunks_dbl), chunked_array_dbl$num_chunks()) + chunks_dbl <- chunked_array_dbl$chunks + expect_equal(length(chunks_dbl), chunked_array_dbl$num_chunks) for( i in seq_along(chunks_dbl)){ expect_equal(chunked_array_dbl$chunk(i-1L), chunks_dbl[[i]]) } @@ -69,18 +69,18 @@ test_that("arrow::table round trip", { # arrow::Colmumn col_raw <- tab$column(2) expect_equal(col_raw$length(), 10L) - expect_equal(col_raw$null_count(), 0L) - expect_equal(col_raw$type(), int8()) + expect_equal(col_raw$null_count, 0L) + expect_equal(col_raw$type, int8()) # arrow::ChunkedArray chunked_array_raw <- col_raw$data() expect_equal(chunked_array_raw$length(), 10L) - expect_equal(chunked_array_raw$null_count(), 0L) + expect_equal(chunked_array_raw$null_count, 0L) expect_equal(chunked_array_raw$as_vector(), tbl$raw) # arrow::Array - chunks_raw <- chunked_array_raw$chunks() - expect_equal(length(chunks_raw), chunked_array_raw$num_chunks()) + chunks_raw <- chunked_array_raw$chunks + expect_equal(length(chunks_raw), chunked_array_raw$num_chunks) for( i in seq_along(chunks_raw)){ expect_equal(chunked_array_raw$chunk(i-1L), chunks_raw[[i]]) } @@ -99,20 +99,20 @@ test_that("arrow::table round trip handles NA in integer and numeric", { ) tab <- arrow::table(tbl) - expect_equal(tab$num_columns(), 3L) - expect_equal(tab$num_rows(), 10L) + expect_equal(tab$num_columns, 3L) + expect_equal(tab$num_rows, 10L) expect_equal(tab$column(0)$length(), 10L) expect_equal(tab$column(1)$length(), 10L) expect_equal(tab$column(2)$length(), 10L) - expect_equal(tab$column(0)$null_count(), 1L) - expect_equal(tab$column(1)$null_count(), 2L) - expect_equal(tab$column(2)$null_count(), 0L) + expect_equal(tab$column(0)$null_count, 1L) + expect_equal(tab$column(1)$null_count, 2L) + expect_equal(tab$column(2)$null_count, 0L) - expect_equal(tab$column(0)$type(), int32()) - expect_equal(tab$column(1)$type(), float64()) - expect_equal(tab$column(2)$type(), int8()) + expect_equal(tab$column(0)$type, int32()) + expect_equal(tab$column(1)$type, float64()) + expect_equal(tab$column(2)$type, int8()) tf <- local_tempfile() write_arrow(tbl, tf) diff --git a/r/tests/testthat/test-read_record_batch.R b/r/tests/testthat/test-read_record_batch.R new file mode 100644 index 0000000000000..8477b7a4c3ddf --- /dev/null +++ b/r/tests/testthat/test-read_record_batch.R @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +context("read_record_batch()") + +test_that("RecordBatchFileWriter / RecordBatchFileReader roundtrips", { + tab <- table(tibble::tibble( + int = 1:10, dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10] + )) + tf <- local_tempfile() + + writer <- RecordBatchFileWriter(tf, tab$schema) + expect_is(writer, "arrow::ipc::RecordBatchFileWriter") + writer$write_table(tab) + writer$close() + tab2 <- read_table(tf) + expect_equal(tab, tab2) + + stream <- FileOutputStream(tf) + writer <- RecordBatchFileWriter(stream, tab$schema) + expect_is(writer, "arrow::ipc::RecordBatchFileWriter") + writer$write_table(tab) + writer$close() + tab3 <- read_table(tf) + expect_equal(tab, tab3) +}) + +test_that("read_record_batch() handles (raw|Buffer|InputStream, Schema) (ARROW-3450, ARROW-3505)", { + tbl <- tibble::tibble( + int = 1:10, dbl = as.numeric(1:10), + lgl = sample(c(TRUE, FALSE, NA), 10, replace = TRUE), + chr = letters[1:10] + ) + batch <- record_batch(tbl) + schema <- batch$schema + + raw <- batch$serialize() + batch2 <- read_record_batch(raw, schema) + batch3 <- read_record_batch(buffer(raw), schema) + batch4 <- read_record_batch(close_on_exit(BufferReader(raw)), schema) + + expect_equal(batch, batch2) + expect_equal(batch, batch3) + expect_equal(batch, batch4) +}) + +test_that("read_record_batch() can handle (Message, Schema) parameters (ARROW-3499)", { + batch <- record_batch(tibble::tibble(x = 1:10)) + schema <- batch$schema + + raw <- batch$serialize() + stream <- close_on_exit(BufferReader(raw)) + + message <- read_message(stream) + batch2 <- read_record_batch(message, schema) + expect_equal(batch, batch2) +}) diff --git a/r/tests/testthat/test-recordbatchreader.R b/r/tests/testthat/test-recordbatchreader.R new file mode 100644 index 0000000000000..d2b6a09c37b24 --- /dev/null +++ b/r/tests/testthat/test-recordbatchreader.R @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
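The new read_record_batch() tests above pin down the reworked contract: batch$serialize() produces only the batch body as a raw vector, with no embedded schema, so the reader must be given one explicitly. A minimal round-trip sketch under the same assumed API:

```r
library(arrow)

# The schema travels separately from the serialized batch body; the
# payload may be a raw vector, a Buffer, or an InputStream.
batch <- record_batch(tibble::tibble(x = 1:10))
schema <- batch$schema

bytes <- batch$serialize()                  # raw vector, no schema inside
batch2 <- read_record_batch(bytes, schema)  # schema supplied by the caller
stopifnot(batch2$num_rows == 10L)
```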
+ +context("arrow::RecordBatch.*(Reader|Writer)") + +test_that("RecordBatchStreamReader / Writer", { + batch <- record_batch(tibble::tibble( + x = 1:10, + y = letters[1:10] + )) + + sink <- BufferOutputStream() + writer <- RecordBatchStreamWriter(sink, batch$schema) + expect_is(writer, "arrow::ipc::RecordBatchStreamWriter") + writer$write_batch(batch) + writer$close() + + buf <- sink$getvalue() + expect_is(buf, "arrow::Buffer") + + reader <- RecordBatchStreamReader(buf) + expect_is(reader, "arrow::ipc::RecordBatchStreamReader") + + batch1 <- reader$read_next_batch() + expect_is(batch1, "arrow::RecordBatch") + expect_equal(batch, batch1) + + expect_null(reader$read_next_batch()) +}) + +test_that("RecordBatchFileReader / Writer", { + batch <- record_batch(tibble::tibble( + x = 1:10, + y = letters[1:10] + )) + + sink <- BufferOutputStream() + writer <- RecordBatchFileWriter(sink, batch$schema) + expect_is(writer, "arrow::ipc::RecordBatchFileWriter") + writer$write_batch(batch) + writer$close() + + buf <- sink$getvalue() + expect_is(buf, "arrow::Buffer") + + reader <- RecordBatchFileReader(buf) + expect_is(reader, "arrow::ipc::RecordBatchFileReader") + + batch1 <- reader$get_batch(0L) + expect_is(batch1, "arrow::RecordBatch") + expect_equal(batch, batch1) + + expect_equal(reader$num_record_batches, 1L) +}) diff --git a/r/tests/testthat/test-schema.R b/r/tests/testthat/test-schema.R index d40fbfa36bc18..2f2d3ee84e731 100644 --- a/r/tests/testthat/test-schema.R +++ b/r/tests/testthat/test-schema.R @@ -17,18 +17,30 @@ context("arrow::Schema") -test_that("reading schema from raw vector", { +test_that("reading schema from Buffer", { + # TODO: this uses the streaming format, i.e. from RecordBatchStreamWriter + # maybe there is an easier way to serialize a schema batch <- record_batch(tibble::tibble(x = 1:10)) - bytes <- write_record_batch(batch, raw()) - schema <- read_schema(bytes) - expect_equal(schema, batch$schema()) -}) + expect_is(batch, "arrow::RecordBatch") -test_that("reading schema from streams", { - batch <- record_batch(tibble::tibble(x = 1:10)) - bytes <- write_record_batch(batch, raw()) - stream <- buffer_reader(bytes) + stream <- BufferOutputStream() + writer <- RecordBatchStreamWriter(stream, batch$schema) + expect_is(writer, "arrow::ipc::RecordBatchStreamWriter") + writer$close() + + buffer <- stream$getvalue() + expect_is(buffer, "arrow::Buffer") + + reader <- MessageReader(buffer) + expect_is(reader, "arrow::ipc::MessageReader") + + message <- reader$ReadNextMessage() + expect_is(message, "arrow::ipc::Message") + expect_equal(message$type, MessageType$SCHEMA) - schema <- read_schema(stream) - expect_equal(schema, batch$schema()) + stream <- BufferReader(buffer) + expect_is(stream, "arrow::io::BufferReader") + message <- read_message(stream) + expect_is(message, "arrow::ipc::Message") + expect_equal(message$type, MessageType$SCHEMA) }) diff --git a/ruby/README.md b/ruby/README.md index aac714e537841..42486588cf9c6 100644 --- a/ruby/README.md +++ b/ruby/README.md @@ -23,4 +23,12 @@ There are the official Ruby bindings for Apache Arrow. [Red Arrow](https://github.com/apache/arrow/tree/master/ruby/red-arrow) is the base Apache Arrow bindings. -[Red Arrow GPU](https://github.com/apache/arrow/tree/master/ruby/red-arrow-gpu) is the Apache Arrow bindings of GPU part. +[Red Arrow CUDA](https://github.com/apache/arrow/tree/master/ruby/red-arrow-cuda) is the Apache Arrow bindings of CUDA part. 
+ +[Red Gandiva](https://github.com/apache/arrow/tree/master/ruby/red-gandiva) is the Gandiva bindings. + +[Red Plasma](https://github.com/apache/arrow/tree/master/ruby/red-plasma) is the Plasma bindings. + +[Red Parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) is the Parquet bindings. + + diff --git a/ruby/red-arrow-gpu/.gitignore b/ruby/red-arrow-cuda/.gitignore similarity index 96% rename from ruby/red-arrow-gpu/.gitignore rename to ruby/red-arrow-cuda/.gitignore index 161ac0553533c..3ec5511596306 100644 --- a/ruby/red-arrow-gpu/.gitignore +++ b/ruby/red-arrow-cuda/.gitignore @@ -15,6 +15,6 @@ # specific language governing permissions and limitations # under the License. -/lib/arrow-gpu/version.rb +/lib/arrow-cuda/version.rb /pkg/ diff --git a/ruby/red-arrow-gpu/Gemfile b/ruby/red-arrow-cuda/Gemfile similarity index 100% rename from ruby/red-arrow-gpu/Gemfile rename to ruby/red-arrow-cuda/Gemfile diff --git a/ruby/red-arrow-gpu/LICENSE.txt b/ruby/red-arrow-cuda/LICENSE.txt similarity index 100% rename from ruby/red-arrow-gpu/LICENSE.txt rename to ruby/red-arrow-cuda/LICENSE.txt diff --git a/ruby/red-arrow-gpu/NOTICE.txt b/ruby/red-arrow-cuda/NOTICE.txt similarity index 100% rename from ruby/red-arrow-gpu/NOTICE.txt rename to ruby/red-arrow-cuda/NOTICE.txt diff --git a/ruby/red-arrow-cuda/README.md b/ruby/red-arrow-cuda/README.md new file mode 100644 index 0000000000000..76fa51c9b136c --- /dev/null +++ b/ruby/red-arrow-cuda/README.md @@ -0,0 +1,62 @@ + + +# Red Arrow CUDA - Apache Arrow CUDA Ruby + +Red Arrow CUDA is the Ruby bindings of Apache Arrow CUDA. Red Arrow CUDA is based on GObject Introspection. + +[Apache Arrow CUDA](https://arrow.apache.org/) is an in-memory columnar data store on GPU. + +[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is middleware for language bindings of C libraries. GObject Introspection can generate language bindings automatically at runtime. + +Red Arrow CUDA uses [Apache Arrow CUDA GLib](https://github.com/apache/arrow/tree/master/c_glib) and the [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings of Apache Arrow CUDA. + +Apache Arrow CUDA GLib is a C wrapper for [Apache Arrow CUDA C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't use Apache Arrow CUDA C++ directly. Apache Arrow CUDA GLib is a bridge between Apache Arrow CUDA C++ and GObject Introspection. + +The gobject-introspection gem is the Ruby bindings of GObject Introspection. Red Arrow CUDA uses GObject Introspection via the gobject-introspection gem. + +## Install + +Install Apache Arrow CUDA GLib before installing Red Arrow CUDA. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow CUDA GLib. + +Note that the Apache Arrow CUDA GLib packages are "unofficial". "Official" packages will be released in the future. + +Install Red Arrow CUDA after you install Apache Arrow CUDA GLib: + +```text +% gem install red-arrow-cuda +``` + +## Usage + +```ruby +require "arrow-cuda" + +manager = ArrowCUDA::DeviceManager.new +if manager.n_devices.zero?
+ raise "No GPU is found" +end + +context = manager[0] +buffer = ArrowCUDA::Buffer.new(context, 128) +ArrowCUDA::BufferOutputStream.open(buffer) do |stream| + stream.write("Hello World") +end +puts buffer.copy_to_host(0, 11) # => "Hello World" +``` diff --git a/ruby/red-arrow-gpu/Rakefile b/ruby/red-arrow-cuda/Rakefile similarity index 100% rename from ruby/red-arrow-gpu/Rakefile rename to ruby/red-arrow-cuda/Rakefile diff --git a/ruby/red-arrow-gpu/dependency-check/Rakefile b/ruby/red-arrow-cuda/dependency-check/Rakefile similarity index 88% rename from ruby/red-arrow-gpu/dependency-check/Rakefile rename to ruby/red-arrow-cuda/dependency-check/Rakefile index 0c2284811d95d..c057a1df2c1a3 100644 --- a/ruby/red-arrow-gpu/dependency-check/Rakefile +++ b/ruby/red-arrow-cuda/dependency-check/Rakefile @@ -33,9 +33,9 @@ end namespace :dependency do desc "Check dependency" task :check do - unless PKGConfig.check_version?("arrow-gpu-glib") - unless NativePackageInstaller.install(:debian => "libarrow-gpu-glib-dev", - :redhat => "arrow-gpu-glib-devel") + unless PKGConfig.check_version?("arrow-cuda-glib") + unless NativePackageInstaller.install(:debian => "libarrow-cuda-glib-dev", + :redhat => "arrow-cuda-glib-devel") exit(false) end end diff --git a/ruby/red-arrow-gpu/lib/arrow-gpu.rb b/ruby/red-arrow-cuda/lib/arrow-cuda.rb similarity index 92% rename from ruby/red-arrow-gpu/lib/arrow-gpu.rb rename to ruby/red-arrow-cuda/lib/arrow-cuda.rb index 10fdcc3c6cbb3..1fc13d0a053b7 100644 --- a/ruby/red-arrow-gpu/lib/arrow-gpu.rb +++ b/ruby/red-arrow-cuda/lib/arrow-cuda.rb @@ -17,11 +17,11 @@ require "arrow" -require "arrow-gpu/version" +require "arrow-cuda/version" -require "arrow-gpu/loader" +require "arrow-cuda/loader" -module ArrowGPU +module ArrowCUDA class Error < StandardError end diff --git a/ruby/red-arrow-gpu/lib/arrow-gpu/cuda-device-manager.rb b/ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb similarity index 95% rename from ruby/red-arrow-gpu/lib/arrow-gpu/cuda-device-manager.rb rename to ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb index 163128b208022..bbef749721e6c 100644 --- a/ruby/red-arrow-gpu/lib/arrow-gpu/cuda-device-manager.rb +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/device-manager.rb @@ -15,8 +15,8 @@ # specific language governing permissions and limitations # under the License. -module ArrowGPU - class CUDADeviceManager +module ArrowCUDA + class DeviceManager # Experimental. # # Can we think device manager is a container of contexts? diff --git a/ruby/red-arrow-gpu/lib/arrow-gpu/loader.rb b/ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb similarity index 91% rename from ruby/red-arrow-gpu/lib/arrow-gpu/loader.rb rename to ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb index b9dc57cc81207..6b2afc4040e0e 100644 --- a/ruby/red-arrow-gpu/lib/arrow-gpu/loader.rb +++ b/ruby/red-arrow-cuda/lib/arrow-cuda/loader.rb @@ -15,11 +15,11 @@ # specific language governing permissions and limitations # under the License. 
-module ArrowGPU +module ArrowCUDA class Loader < GObjectIntrospection::Loader class << self def load - super("ArrowGPU", ArrowGPU) + super("ArrowCUDA", ArrowCUDA) end end @@ -29,7 +29,7 @@ def post_load(repository, namespace) end def require_libraries - require "arrow-gpu/cuda-device-manager" + require "arrow-cuda/device-manager" end end end diff --git a/ruby/red-arrow-gpu/red-arrow-gpu.gemspec b/ruby/red-arrow-cuda/red-arrow-cuda.gemspec similarity index 84% rename from ruby/red-arrow-gpu/red-arrow-gpu.gemspec rename to ruby/red-arrow-cuda/red-arrow-cuda.gemspec index 340d41e8f7680..b2ee982945605 100644 --- a/ruby/red-arrow-gpu/red-arrow-gpu.gemspec +++ b/ruby/red-arrow-cuda/red-arrow-cuda.gemspec @@ -20,11 +20,11 @@ require_relative "version" Gem::Specification.new do |spec| - spec.name = "red-arrow-gpu" + spec.name = "red-arrow-cuda" version_components = [ - ArrowGPU::Version::MAJOR.to_s, - ArrowGPU::Version::MINOR.to_s, - ArrowGPU::Version::MICRO.to_s, + ArrowCUDA::Version::MAJOR.to_s, + ArrowCUDA::Version::MINOR.to_s, + ArrowCUDA::Version::MICRO.to_s, # "beta1", ] spec.version = version_components.join(".") @@ -32,9 +32,9 @@ Gem::Specification.new do |spec| spec.authors = ["Apache Arrow Developers"] spec.email = ["dev@arrow.apache.org"] - spec.summary = "Red Arrow GPU is the Ruby bindings of Apache Arrow GPU" + spec.summary = "Red Arrow CUDA is the Ruby bindings of Apache Arrow CUDA" spec.description = - "Apache Arrow GPU is a common in-memory columnar data store on GPU. " + + "Apache Arrow CUDA is a common in-memory columnar data store on CUDA. " + "It's useful to share and process large data." spec.license = "Apache-2.0" spec.files = ["README.md", "Rakefile", "Gemfile", "#{spec.name}.gemspec"] diff --git a/ruby/red-arrow-gpu/test/helper.rb b/ruby/red-arrow-cuda/test/helper.rb similarity index 97% rename from ruby/red-arrow-gpu/test/helper.rb rename to ruby/red-arrow-cuda/test/helper.rb index 772636ab3cd75..4d018332677ec 100644 --- a/ruby/red-arrow-gpu/test/helper.rb +++ b/ruby/red-arrow-cuda/test/helper.rb @@ -18,6 +18,6 @@ require_relative "../../red-arrow/version" require_relative "../version" -require "arrow-gpu" +require "arrow-cuda" require "test-unit" diff --git a/ruby/red-arrow-gpu/test/run-test.rb b/ruby/red-arrow-cuda/test/run-test.rb similarity index 100% rename from ruby/red-arrow-gpu/test/run-test.rb rename to ruby/red-arrow-cuda/test/run-test.rb diff --git a/ruby/red-arrow-gpu/test/test-cuda.rb b/ruby/red-arrow-cuda/test/test-cuda.rb similarity index 87% rename from ruby/red-arrow-gpu/test/test-cuda.rb rename to ruby/red-arrow-cuda/test/test-cuda.rb index 05fd6cc155398..a48b687d36e0d 100644 --- a/ruby/red-arrow-gpu/test/test-cuda.rb +++ b/ruby/red-arrow-cuda/test/test-cuda.rb @@ -17,7 +17,7 @@ class TestCUDA < Test::Unit::TestCase def setup - @manager = ArrowGPU::CUDADeviceManager.new + @manager = ArrowCUDA::DeviceManager.new omit("At least one GPU is required") if @manager.n_devices.zero? 
@context = @manager[0] end @@ -25,11 +25,11 @@ def setup sub_test_case("BufferOutputStream") do def setup super - @buffer = ArrowGPU::CUDABuffer.new(@context, 128) + @buffer = ArrowCUDA::Buffer.new(@context, 128) end def test_new - ArrowGPU::CUDABufferOutputStream.open(@buffer) do |stream| + ArrowCUDA::BufferOutputStream.open(@buffer) do |stream| stream.write("Hello World") end assert_equal("Hello World", @buffer.copy_to_host(0, 11).to_s) diff --git a/ruby/red-arrow-gpu/version.rb b/ruby/red-arrow-cuda/version.rb similarity index 94% rename from ruby/red-arrow-gpu/version.rb rename to ruby/red-arrow-cuda/version.rb index fc0d37e6bae6b..c8bbbc7165f29 100644 --- a/ruby/red-arrow-gpu/version.rb +++ b/ruby/red-arrow-cuda/version.rb @@ -20,7 +20,7 @@ version_rb_path = Pathname.new(__FILE__) base_dir = version_rb_path.dirname pom_xml_path = base_dir.join("..", "..", "java", "pom.xml") -lib_version_rb_path = base_dir.join("lib", "arrow-gpu", "version.rb") +lib_version_rb_path = base_dir.join("lib", "arrow-cuda", "version.rb") need_update = false if not lib_version_rb_path.exist? @@ -53,7 +53,7 @@ # specific language governing permissions and limitations # under the License. -module ArrowGPU +module ArrowCUDA module Version MAJOR = #{major} MINOR = #{minor} @@ -68,4 +68,4 @@ module Version end end -require_relative "lib/arrow-gpu/version" +require_relative "lib/arrow-cuda/version" diff --git a/ruby/red-arrow-gpu/README.md b/ruby/red-arrow-gpu/README.md deleted file mode 100644 index ad76c13011f79..0000000000000 --- a/ruby/red-arrow-gpu/README.md +++ /dev/null @@ -1,62 +0,0 @@ - - -# Red Arrow GPU - Apache Arrow GPU Ruby - -Red Arrow GPU is the Ruby bindings of Apache Arrow GPU. Red Arrow GPU is based on GObject Introspection. - -[Apache Arrow GPU](https://arrow.apache.org/) is an in-memory columnar data store on GPU. - -[GObject Introspection](https://wiki.gnome.org/action/show/Projects/GObjectIntrospection) is a middleware for language bindings of C library. GObject Introspection can generate language bindings automatically at runtime. - -Red Arrow GPU uses [Apache Arrow GPU GLib](https://github.com/apache/arrow/tree/master/c_glib) and [gobject-introspection gem](https://rubygems.org/gems/gobject-introspection) to generate Ruby bindings of Apache Arrow GPU. - -Apache Arrow GPU GLib is a C wrapper for [Apache Arrow GPU C++](https://github.com/apache/arrow/tree/master/cpp). GObject Introspection can't use Apache Arrow GPU C++ directly. Apache Arrow GPU GLib is a bridge between Apache Arrow GPU C++ and GObject Introspection. - -gobject-introspection gem is a Ruby bindings of GObject Introspection. Red Arrow GPU uses GObject Introspection via gobject-introspection gem. - -## Install - -Install Apache Arrow GPU GLib before install Red Arrow GPU. Use [packages.red-data-tools.org](https://github.com/red-data-tools/packages.red-data-tools.org) for installing Apache Arrow GPU GLib. - -Note that the Apache Arrow GPU GLib packages are "unofficial". "Official" packages will be released in the future. - -Install Red Arrow GPU after you install Apache Arrow GPU GLib: - -```text -% gem install red-arrow-gpu -``` - -## Usage - -```ruby -require "arrow-gpu" - -manager = ArrowGPU::CUDADeviceManager.new -if manager.n_devices.zero? 
- raise "No GPU is found" -end - -context = manager[0] -buffer = ArrowGPU::CUDABuffer.new(context, 128) -ArrowGPU::CUDABufferOutputStream.open(buffer) do |stream| - stream.write("Hello World") -end -puts buffer.copy_to_host(0, 11) # => "Hello World" -``` diff --git a/ruby/red-arrow/lib/arrow/array.rb b/ruby/red-arrow/lib/arrow/array.rb index 7a0d053901d97..049224154dca3 100644 --- a/ruby/red-arrow/lib/arrow/array.rb +++ b/ruby/red-arrow/lib/arrow/array.rb @@ -20,11 +20,13 @@ class Array include Enumerable class << self - def new(values) + def new(*args) + return super if args.size != 1 + builder_class_name = "#{name}Builder" if const_defined?(builder_class_name) builder_class = const_get(builder_class_name) - builder_class.build(values) + builder_class.build(*args) else super end diff --git a/rust/Cargo.toml b/rust/Cargo.toml index b8750945fb162..b56cd6fb30091 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -28,6 +28,7 @@ include = [ "src/**/*.rs", "Cargo.toml", ] +edition = "2018" [lib] name = "arrow" diff --git a/rust/Dockerfile b/rust/Dockerfile index c63dcda79d55b..17661fcb5d8e5 100644 --- a/rust/Dockerfile +++ b/rust/Dockerfile @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -FROM rust +FROM rustlang/rust:nightly # sadly cargo doesn't have a command to fetch and build the # dependencies without building the library itself diff --git a/rust/README.md b/rust/README.md index 51beb46bab51f..f8908f8e6e64d 100644 --- a/rust/README.md +++ b/rust/README.md @@ -24,20 +24,19 @@ ## Status -This is a starting point for a native Rust implementation of Arrow. - -The current code demonstrates arrays of primitive types and structs. - -## Creating an Array from a Vec - -```rust -// create a memory-aligned Arrow array from an existing Vec -let array = PrimitiveArray::from(vec![1, 2, 3, 4, 5]); - -println!("array contents: {:?}", array.iter().collect::>()); -``` - -## Run Examples +This is a native Rust implementation of Apache Arrow. The current status is: + +- [x] Primitive Arrays +- [x] List Arrays +- [x] Struct Arrays +- [x] CSV Reader +- [ ] CSV Writer +- [ ] Parquet Reader +- [ ] Parquet Writer +- [ ] Arrow IPC +- [ ] Interop tests with other implementations + +## Examples The examples folder shows how to construct some different types of Arrow arrays, including dynamic arrays created at runtime. 
diff --git a/rust/src/array.rs b/rust/src/array.rs index 264aa50121f6c..ca1d2a5cdb1e7 100644 --- a/rust/src/array.rs +++ b/rust/src/array.rs @@ -22,12 +22,12 @@ use std::io::Write; use std::mem; use std::sync::Arc; -use array_data::{ArrayData, ArrayDataRef}; -use buffer::{Buffer, MutableBuffer}; -use builder::*; -use datatypes::*; -use memory; -use util::bit_util; +use crate::array_data::{ArrayData, ArrayDataRef}; +use crate::buffer::{Buffer, MutableBuffer}; +use crate::builder::*; +use crate::datatypes::*; +use crate::memory; +use crate::util::bit_util; /// Trait for dealing with different types of array at runtime when the type of the /// array is not known in advance @@ -692,14 +692,13 @@ impl From> for StructArray { #[cfg(test)] mod tests { - use std::thread; - use super::*; - use array_data::ArrayData; - use buffer::Buffer; - use datatypes::{DataType, Field, ToByteSlice}; - use memory; + use crate::array_data::ArrayData; + use crate::buffer::Buffer; + use crate::datatypes::{DataType, Field, ToByteSlice}; + use crate::memory; use std::sync::Arc; + use std::thread; #[test] fn test_primitive_array_from_vec() { diff --git a/rust/src/array_data.rs b/rust/src/array_data.rs index 055c8d91e7f09..b288d4a804535 100644 --- a/rust/src/array_data.rs +++ b/rust/src/array_data.rs @@ -17,10 +17,10 @@ use std::sync::Arc; -use bitmap::Bitmap; -use buffer::Buffer; -use datatypes::DataType; -use util::bit_util; +use crate::bitmap::Bitmap; +use crate::buffer::Buffer; +use crate::datatypes::DataType; +use crate::util::bit_util; /// An generic representation of Arrow array data which encapsulates common attributes and /// operations for Arrow array. Specific operations for different arrays types (e.g., @@ -225,8 +225,8 @@ mod tests { use std::sync::Arc; use super::{ArrayData, DataType}; - use buffer::Buffer; - use util::bit_util; + use crate::buffer::Buffer; + use crate::util::bit_util; #[test] fn test_new() { diff --git a/rust/src/bitmap.rs b/rust/src/bitmap.rs index 6cec4d51bb625..742fac5587b3e 100644 --- a/rust/src/bitmap.rs +++ b/rust/src/bitmap.rs @@ -16,7 +16,7 @@ // under the License. use super::buffer::Buffer; -use util::bit_util; +use crate::util::bit_util; #[derive(PartialEq, Debug)] pub struct Bitmap { diff --git a/rust/src/buffer.rs b/rust/src/buffer.rs index 67d2896b339f8..4b7d2a0d3c97e 100644 --- a/rust/src/buffer.rs +++ b/rust/src/buffer.rs @@ -20,9 +20,9 @@ use std::io::{Error as IoError, ErrorKind, Result as IoResult, Write}; use std::mem; use std::sync::Arc; -use error::Result; -use memory; -use util::bit_util; +use crate::error::Result; +use crate::memory; +use crate::util::bit_util; /// Buffer is a contiguous memory region of fixed size and is aligned at a 64-byte /// boundary. Buffer is immutable. 
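Every one of these `use` rewrites follows from the `edition = "2018"` line added to Cargo.toml above: in the 2018 edition, crate-local paths must be anchored explicitly with `crate::` (or `self::`/`super::`). A minimal sketch of the rule, using a hypothetical module in place of the crate's own:

```rust
// Hypothetical stand-in for a crate-local module such as util::bit_util.
mod util {
    pub mod bit_util {
        /// Count the set bits in a byte slice.
        pub fn count_set_bits(data: &[u8]) -> i64 {
            data.iter().map(|b| b.count_ones() as i64).sum()
        }
    }
}

// Rust 2015 accepted the bare path `use util::bit_util;` here.
// Rust 2018 requires the crate root to be named explicitly:
use crate::util::bit_util;

fn main() {
    // 0b1011_0001 has four bits set.
    assert_eq!(bit_util::count_set_bits(&[0b1011_0001]), 4);
}
```

The payoff is that a path's meaning is visible at a glance: `crate::` is always local, anything else comes from an external crate, which is also why the `extern crate` declarations disappear from lib.rs further down.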
@@ -314,9 +314,9 @@ unsafe impl Send for MutableBuffer {}
 
 #[cfg(test)]
 mod tests {
+    use crate::util::bit_util;
     use std::ptr::null_mut;
     use std::thread;
-    use util::bit_util;
 
     use super::*;
 
diff --git a/rust/src/builder.rs b/rust/src/builder.rs
index df6b645312e23..2cbdce0c8570b 100644
--- a/rust/src/builder.rs
+++ b/rust/src/builder.rs
@@ -23,12 +23,12 @@ use std::io::Write;
 use std::marker::PhantomData;
 use std::mem;
 
-use array::*;
-use array_data::ArrayData;
-use buffer::{Buffer, MutableBuffer};
-use datatypes::*;
-use error::{ArrowError, Result};
-use util::bit_util;
+use crate::array::*;
+use crate::array_data::ArrayData;
+use crate::buffer::{Buffer, MutableBuffer};
+use crate::datatypes::*;
+use crate::error::{ArrowError, Result};
+use crate::util::bit_util;
 
 /// Buffer builder with zero-copy build method
 pub struct BufferBuilder<T: ArrowPrimitiveType> {
@@ -209,7 +209,7 @@ impl<T: ArrowPrimitiveType> BufferBuilderTrait<T> for BufferBuilder<T> {
 /// Trait for dealing with different array builders at runtime
 pub trait ArrayBuilder {
     /// The type of array that this builder creates
-    type ArrayType;
+    type ArrayType: Array;
 
     /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific
     /// implementation before calling it's `finish` method
@@ -335,90 +335,65 @@ impl<T: ArrayBuilder> ListArrayBuilder<T> {
     }
 }
 
-macro_rules! impl_list_array_builder {
-    ($builder_ty:ty) => {
-        impl ArrayBuilder for ListArrayBuilder<$builder_ty> {
-            type ArrayType = ListArray;
+impl<T: ArrayBuilder> ArrayBuilder for ListArrayBuilder<T>
+where
+    T: 'static,
+{
+    type ArrayType = ListArray;
 
-            /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific
-            /// implementation before calling it's `finish` method.
-            fn into_any(self) -> Box<Any> {
-                Box::new(self)
-            }
+    /// Returns the builder as an owned `Any` type so that it can be `downcast` to a specific
+    /// implementation before calling it's `finish` method.
+    fn into_any(self) -> Box<Any> {
+        Box::new(self)
+    }
 
-            /// Returns the number of array slots in the builder
-            fn len(&self) -> i64 {
-                self.len
-            }
+    /// Returns the number of array slots in the builder
+    fn len(&self) -> i64 {
+        self.len
+    }
 
-            /// Builds the `ListArray`
-            fn finish(self) -> ListArray {
-                let len = self.len();
-                let values_arr = self
-                    .values_builder
-                    .into_any()
-                    .downcast::<$builder_ty>()
-                    .unwrap()
-                    .finish();
-                let values_data = values_arr.data();
-
-                let null_bit_buffer = self.bitmap_builder.finish();
-                let data =
-                    ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone())))
-                        .len(len)
-                        .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
-                        .add_buffer(self.offsets_builder.finish())
-                        .add_child_data(values_data)
-                        .null_bit_buffer(null_bit_buffer)
-                        .build();
-
-                ListArray::from(data)
-            }
-        }
+    /// Builds the `ListArray`
+    fn finish(self) -> ListArray {
+        let len = self.len();
+        let values_arr = self
+            .values_builder
+            .into_any()
+            .downcast::<T>()
+            .unwrap()
+            .finish();
+        let values_data = values_arr.data();
 
-        impl ListArrayBuilder<$builder_ty> {
-            /// Returns the child array builder as a mutable reference.
-            ///
-            /// This mutable reference can be used to push values into the child array builder,
-            /// but you must call `append` to delimit each distinct list value.
-            pub fn values(&mut self) -> &mut $builder_ty {
-                &mut self.values_builder
-            }
+        let null_bit_buffer = self.bitmap_builder.finish();
+        let data = ArrayData::builder(DataType::List(Box::new(values_data.data_type().clone())))
+            .len(len)
+            .null_count(len - bit_util::count_set_bits(null_bit_buffer.data()))
+            .add_buffer(self.offsets_builder.finish())
+            .add_child_data(values_data)
+            .null_bit_buffer(null_bit_buffer)
+            .build();
 
-            /// Finish the current variable-length list array slot
-            pub fn append(&mut self, is_valid: bool) -> Result<()> {
-                self.offsets_builder
-                    .push(self.values_builder.len() as i32)?;
-                self.bitmap_builder.push(is_valid)?;
-                self.len += 1;
-                Ok(())
-            }
-        }
-    };
+        ListArray::from(data)
+    }
 }
 
-impl_list_array_builder!(BooleanBuilder);
-impl_list_array_builder!(UInt8Builder);
-impl_list_array_builder!(UInt16Builder);
-impl_list_array_builder!(UInt32Builder);
-impl_list_array_builder!(UInt64Builder);
-impl_list_array_builder!(Int8Builder);
-impl_list_array_builder!(Int16Builder);
-impl_list_array_builder!(Int32Builder);
-impl_list_array_builder!(Int64Builder);
-impl_list_array_builder!(Float32Builder);
-impl_list_array_builder!(Float64Builder);
-impl_list_array_builder!(ListArrayBuilder<BooleanBuilder>);
-impl_list_array_builder!(ListArrayBuilder<UInt8Builder>);
-impl_list_array_builder!(ListArrayBuilder<UInt16Builder>);
-impl_list_array_builder!(ListArrayBuilder<UInt32Builder>);
-impl_list_array_builder!(ListArrayBuilder<UInt64Builder>);
-impl_list_array_builder!(ListArrayBuilder<Int8Builder>);
-impl_list_array_builder!(ListArrayBuilder<Int16Builder>);
-impl_list_array_builder!(ListArrayBuilder<Int32Builder>);
-impl_list_array_builder!(ListArrayBuilder<Int64Builder>);
-impl_list_array_builder!(ListArrayBuilder<Float32Builder>);
-impl_list_array_builder!(ListArrayBuilder<Float64Builder>);
+impl<T: ArrayBuilder> ListArrayBuilder<T> {
+    /// Returns the child array builder as a mutable reference.
+    ///
+    /// This mutable reference can be used to push values into the child array builder,
+    /// but you must call `append` to delimit each distinct list value.
+ pub fn values(&mut self) -> &mut T { + &mut self.values_builder + } + + /// Finish the current variable-length list array slot + pub fn append(&mut self, is_valid: bool) -> Result<()> { + self.offsets_builder + .push(self.values_builder.len() as i32)?; + self.bitmap_builder.push(is_valid)?; + self.len += 1; + Ok(()) + } +} /// Array builder for `BinaryArray` pub struct BinaryArrayBuilder { @@ -481,7 +456,7 @@ impl BinaryArrayBuilder { #[cfg(test)] mod tests { - use array::Array; + use crate::array::Array; use super::*; @@ -850,7 +825,7 @@ mod tests { #[test] fn test_binary_array_builder() { - use array::BinaryArray; + use crate::array::BinaryArray; let mut builder = BinaryArrayBuilder::new(20); builder.push(b'h').unwrap(); @@ -885,7 +860,7 @@ mod tests { #[test] fn test_binary_array_builder_push_string() { - use array::BinaryArray; + use crate::array::BinaryArray; let mut builder = BinaryArrayBuilder::new(20); let var = "hello".to_owned(); diff --git a/rust/src/csv/reader.rs b/rust/src/csv/reader.rs index dcb35958c5d89..697ace653b691 100644 --- a/rust/src/csv/reader.rs +++ b/rust/src/csv/reader.rs @@ -44,11 +44,11 @@ use std::fs::File; use std::io::BufReader; use std::sync::Arc; -use array::{ArrayRef, BinaryArray}; -use builder::*; -use datatypes::*; -use error::{ArrowError, Result}; -use record_batch::RecordBatch; +use crate::array::{ArrayRef, BinaryArray}; +use crate::builder::*; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; +use crate::record_batch::RecordBatch; use csv_crate::{StringRecord, StringRecordsIntoIter}; @@ -161,7 +161,7 @@ impl Reader { &DataType::Float32 => build_primitive_array::(rows, i), &DataType::Float64 => build_primitive_array::(rows, i), &DataType::Utf8 => { - let mut values_builder: UInt8Builder = UInt8Builder::new(rows.len() as i64); + let values_builder: UInt8Builder = UInt8Builder::new(rows.len() as i64); let mut list_builder = ListArrayBuilder::new(values_builder); for row_index in 0..rows.len() { match rows[row_index].get(*i) { @@ -195,8 +195,8 @@ impl Reader { mod tests { use super::*; - use array::*; - use datatypes::Field; + use crate::array::*; + use crate::datatypes::Field; #[test] fn test_csv() { diff --git a/rust/src/datatypes.rs b/rust/src/datatypes.rs index fdb9351e61abc..f91c75d7bd0c3 100644 --- a/rust/src/datatypes.rs +++ b/rust/src/datatypes.rs @@ -26,7 +26,7 @@ use std::mem::size_of; use std::slice::from_raw_parts; use std::str::FromStr; -use error::{ArrowError, Result}; +use crate::error::{ArrowError, Result}; use serde_json::Value; /// The possible relative types that are supported. 
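The builder rewrite above trades a `macro_rules!` family of near-identical impls for a single generic impl, and it is what lets the CSV reader build `Utf8` columns through one `ListArrayBuilder<UInt8Builder>` code path. A self-contained sketch of the pattern with toy types (`Builder`, `IntBuilder`, and `ListBuilder` are stand-ins, not the arrow crate's own names):

```rust
// One trait abstracts over anything that can finish into an output value.
trait Builder {
    type Output;
    fn finish(self) -> Self::Output;
}

// A concrete leaf builder that accumulates i32 values.
struct IntBuilder(Vec<i32>);

impl Builder for IntBuilder {
    type Output = Vec<i32>;
    fn finish(self) -> Vec<i32> {
        self.0
    }
}

// A list builder wrapping any child builder. One blanket impl replaces
// the N macro expansions the old code stamped out per concrete child.
struct ListBuilder<T: Builder> {
    values: T,
}

impl<T: Builder> Builder for ListBuilder<T> {
    type Output = Vec<T::Output>;
    fn finish(self) -> Vec<T::Output> {
        vec![self.values.finish()]
    }
}

fn main() {
    let list = ListBuilder {
        values: IntBuilder(vec![1, 2, 3]),
    };
    assert_eq!(list.finish(), vec![vec![1, 2, 3]]);
}
```

Because the blanket impl is generic in `T`, nested builders such as `ListBuilder<ListBuilder<IntBuilder>>` work with no further code, whereas the macro approach had to enumerate every nesting by hand.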
@@ -281,7 +281,7 @@ impl Field { _ => { return Err(ArrowError::ParseError( "Field missing 'name' attribute".to_string(), - )) + )); } }; let nullable = match map.get("nullable") { @@ -289,7 +289,7 @@ impl Field { _ => { return Err(ArrowError::ParseError( "Field missing 'nullable' attribute".to_string(), - )) + )); } }; let data_type = match map.get("type") { @@ -297,7 +297,7 @@ impl Field { _ => { return Err(ArrowError::ParseError( "Field missing 'type' attribute".to_string(), - )) + )); } }; Ok(Field { diff --git a/rust/src/lib.rs b/rust/src/lib.rs index b2db090cf7c87..e1670ff055971 100644 --- a/rust/src/lib.rs +++ b/rust/src/lib.rs @@ -17,9 +17,7 @@ #![feature(specialization)] -extern crate bytes; extern crate csv as csv_crate; -extern crate libc; #[macro_use] extern crate serde_derive; @@ -27,10 +25,6 @@ extern crate serde_derive; #[macro_use] extern crate serde_json; -extern crate serde; - -extern crate rand; - pub mod array; pub mod array_data; pub mod bitmap; diff --git a/rust/src/memory.rs b/rust/src/memory.rs index 376499e9c217a..193eff12d6f6f 100644 --- a/rust/src/memory.rs +++ b/rust/src/memory.rs @@ -19,7 +19,7 @@ use libc; use std::cmp; use std::mem; -use super::error::{ArrowError, Result}; +use crate::error::{ArrowError, Result}; const ALIGNMENT: usize = 64; diff --git a/rust/src/record_batch.rs b/rust/src/record_batch.rs index cde1122aadc0a..4cb5c8e7db4df 100644 --- a/rust/src/record_batch.rs +++ b/rust/src/record_batch.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. -use super::array::*; -use super::datatypes::*; +use crate::array::*; +use crate::datatypes::*; use std::sync::Arc; /// A batch of column-oriented data @@ -67,8 +67,8 @@ unsafe impl Sync for RecordBatch {} #[cfg(test)] mod tests { use super::*; - use array_data::*; - use buffer::*; + use crate::array_data::*; + use crate::buffer::*; #[test] fn create_record_batch() { diff --git a/rust/src/tensor.rs b/rust/src/tensor.rs index e50a3136d2ba1..ec56aeb4cccd5 100644 --- a/rust/src/tensor.rs +++ b/rust/src/tensor.rs @@ -19,8 +19,8 @@ use std::marker::PhantomData; use std::mem; -use buffer::Buffer; -use datatypes::*; +use crate::buffer::Buffer; +use crate::datatypes::*; /// Computes the strides required assuming a row major memory layout fn compute_row_major_strides(shape: &Vec) -> Vec { @@ -216,8 +216,8 @@ impl<'a, T: ArrowPrimitiveType> Tensor<'a, T> { #[cfg(test)] mod tests { use super::*; - use buffer::Buffer; - use builder::*; + use crate::buffer::Buffer; + use crate::builder::*; #[test] fn test_compute_row_major_strides() { diff --git a/site/_data/contributors.yml b/site/_data/contributors.yml index 9289542fd8d68..c49230738222a 100644 --- a/site/_data/contributors.yml +++ b/site/_data/contributors.yml @@ -25,6 +25,10 @@ apacheId: uwe githubId: xhochy role: PMC +- name: Jacques Nadeau + apacheId: jacques + githubId: jacques-n + role: PMC - name: Julien Le Dem apacheId: julienledem githubId: julienledem diff --git a/site/_posts/2018-12-05-gandiva-donation.md b/site/_posts/2018-12-05-gandiva-donation.md new file mode 100644 index 0000000000000..ab12e4d80801d --- /dev/null +++ b/site/_posts/2018-12-05-gandiva-donation.md @@ -0,0 +1,93 @@ +--- +layout: post +title: "Gandiva: A LLVM-based Analytical Expression Compiler for Apache Arrow" +date: "2018-12-05 00:00:00 -0500" +author: jacques +categories: [application] +--- + + +Today we're happy to announce that the Gandiva Initiative for Apache Arrow, an +LLVM-based execution kernel, is now part of the Apache Arrow 
project. Gandiva
+was kindly donated by [Dremio](https://www.dremio.com/), where it was
+originally developed and open-sourced. Gandiva extends Arrow's capabilities to
+provide high-performance analytical execution and is composed of two main
+components:
+
+* A runtime expression compiler leveraging LLVM
+
+* A high-performance execution environment
+
+Gandiva works as follows: applications submit an expression tree to the
+compiler, built in a language-agnostic protobuf-based expression
+representation. Gandiva then compiles the expression tree to native code for
+the current runtime environment and hardware. Once compiled, the Gandiva
+execution kernel consumes and produces Arrow columnar batches. The generated
+code is highly optimized for parallel processing on modern CPUs. For example,
+on AVX-128 processors Gandiva can process 8 pairs of 2-byte values in a single
+vectorized operation, and on AVX-512 processors Gandiva can process 4x as many
+values in a single operation. Gandiva is built from the ground up to
+understand Arrow's in-memory representation and optimize processing against it.
+
+While Gandiva is just starting within the Arrow community, it already supports
+hundreds of [expressions][1], ranging from math functions to case statements.
+Gandiva was built as a standalone C++ library on top of the core Apache Arrow
+codebase and was donated with C++ and Java APIs for expression construction
+and execution, covering both projection and filtering operations. The Arrow
+community is already looking to expand Gandiva's capabilities by incorporating
+more operations and supporting many new language bindings. As an example,
+multiple community members are already actively building new language bindings
+that allow use of Gandiva from Python and Ruby.
+
+While young within the Arrow community, Gandiva is already shipped and used in
+production by many Dremio customers as part of Dremio's execution engine.
+Experiments have demonstrated a [70x performance improvement][2] on many SQL
+queries. We expect to see similar performance gains for many other projects
+that leverage Arrow.
+
+The Arrow community is working to ship the first formal Apache Arrow release
+that includes Gandiva, and we hope this will be available within the next
+couple of months. This should make it much easier for the broader analytics
+and data science communities to leverage runtime code generation for
+high-performance data processing in a variety of contexts and projects.
+
+We started the Arrow project a couple of years ago with the objective of
+creating an industry-standard columnar in-memory data representation for
+analytics. Within this short period of time, Apache Arrow has been adopted by
+dozens of open source and commercial software products, with key examples
+including Apache Spark, Pandas, Nvidia RAPIDS, Dremio, and InfluxDB. This
+success has driven Arrow to be downloaded more than 1 million times per month,
+and over 200 developers have already contributed to it. If you're interested
+in contributing to Gandiva or any other part of the Apache Arrow project, feel
+free to reach out on the mailing list and join us!
+ +For additional technical details on Gandiva, you can check out some of the +following resources: + +* [https://www.dremio.com/announcing-gandiva-initiative-for-apache-arrow/](https://www.dremio.com/announcing-gandiva-initiative-for-apache-arrow/) + +* [https://www.dremio.com/gandiva-performance-improvements-production-query/](https://www.dremio.com/gandiva-performance-improvements-production-query/) + +* [https://www.dremio.com/webinars/vectorized-query-processing-apache-arrow/](https://www.dremio.com/webinars/vectorized-query-processing-apache-arrow/) + +* [https://www.dremio.com/adding-a-user-define-function-to-gandiva/](https://www.dremio.com/adding-a-user-define-function-to-gandiva/) + +[1]: https://github.com/apache/arrow/blob/master/cpp/src/gandiva/function_registry.cc +[2]: https://www.dremio.com/gandiva-performance-improvements-production-query/ \ No newline at end of file
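The submit-an-expression-tree workflow the post describes is easy to picture with a toy tree type. The sketch below is purely illustrative: Gandiva's actual representation is a language-agnostic protobuf schema and its compiler is C++, so the `Expr` enum and `describe` function here are invented names, not Gandiva APIs.

```rust
// A toy analytical expression tree: field references, literals, and
// calls to named functions, mirroring the shape the post describes.
enum Expr {
    Field(String),
    LiteralI32(i32),
    Call { function: String, args: Vec<Expr> },
}

// Render the tree as text; Gandiva would instead compile it via LLVM
// into native code that consumes and produces Arrow columnar batches.
fn describe(e: &Expr) -> String {
    match e {
        Expr::Field(name) => format!("field({})", name),
        Expr::LiteralI32(v) => format!("lit({})", v),
        Expr::Call { function, args } => format!(
            "{}({})",
            function,
            args.iter().map(describe).collect::<Vec<_>>().join(", ")
        ),
    }
}

fn main() {
    // Roughly the projection "a + 2" over some column `a`.
    let expr = Expr::Call {
        function: "add".to_string(),
        args: vec![Expr::Field("a".to_string()), Expr::LiteralI32(2)],
    };
    println!("{}", describe(&expr)); // add(field(a), lit(2))
}
```

Each `Call` names one of the registered functions in the registry linked above, which is how Gandiva scales to hundreds of expressions without changes to the compiler core.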