From 194e4764a4662c6782d28fe5ac6eb45fccf45dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 02:52:16 +0200 Subject: [PATCH 1/8] allow truncate float option and its implementation --- cpp/src/arrow/compute/kernels/cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 8392c188dfd..49b12b9d561 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(true) {} + allow_float_truncate(false) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), From 2961094251db91350570ff6a8ff03a54784724cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 10:26:04 +0200 Subject: [PATCH 2/8] set allow_float_truncate true by default --- cpp/src/arrow/compute/kernels/cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 49b12b9d561..8392c188dfd 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(false) {} + allow_float_truncate(true) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), From e0838ceb242694b37775630d6fb9d47749206f89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 29 Aug 2018 19:52:26 +0200 Subject: [PATCH 3/8] wire CastOptions through the API --- cpp/src/arrow/python/numpy_to_arrow.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index 5e1c088264a..f50b626f7ad 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -26,6 +26,7 @@ #include "arrow/compute/kernels/cast.h" #include "arrow/util/visibility.h" +#include "arrow/compute/kernels/cast.h" namespace arrow { From 80e14784abf607eb1ef712c26674a7d216856340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:11:00 +0200 Subject: [PATCH 4/8] lint --- cpp/src/arrow/python/numpy_to_arrow.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index ece00c286ea..9d8718cc51a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -413,8 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, + cast_options_, pool_, data)); } return Status::OK(); From 2c8207400fcd6f4c800a0da87479acb34b0d4eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:47:46 +0200 Subject: [PATCH 5/8] check-format --- cpp/src/arrow/python/numpy_to_arrow.cc | 4 ++-- cpp/src/arrow/python/numpy_to_arrow.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc index 9d8718cc51a..ece00c286ea 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -413,8 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, + pool_, data)); } return Status::OK(); diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index f50b626f7ad..5e1c088264a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -26,7 +26,6 @@ #include "arrow/compute/kernels/cast.h" #include "arrow/util/visibility.h" -#include "arrow/compute/kernels/cast.h" namespace arrow { From 515a3935d2a71972f2f2b13cc52269e2798fa8ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 03:01:23 +0200 Subject: [PATCH 6/8] Table.from_pandas safe option --- python/pyarrow/array.pxi | 14 ++------ python/pyarrow/includes/libarrow.pxd | 5 +++ python/pyarrow/pandas_compat.py | 5 +-- python/pyarrow/table.pxi | 39 +++++++++------------ python/pyarrow/tests/test_convert_pandas.py | 20 ++++++++++- 5 files changed, 46 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f9a16a334c5..362ebc6ff9d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -53,10 +53,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type, cdef: shared_ptr[CChunkedArray] chunked_out shared_ptr[CDataType] c_type - CCastOptions cast_options - - cast_options.allow_int_overflow = not safe - cast_options.allow_time_truncate = not safe + CCastOptions cast_options = CCastOptions(safe) dtype = values.dtype @@ -406,14 +403,9 @@ cdef class Array: casted : Array """ cdef: - CCastOptions options + CCastOptions options = CCastOptions(safe) + DataType type = _ensure_type(target_type) shared_ptr[CArray] result - DataType type - - type = _ensure_type(target_type) - - options.allow_int_overflow = not safe - options.allow_time_truncate = not safe with nogil: check_status(Cast(_context(), self.ap[0], type.sp_type, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8bbbfcfd661..8a91bf52c92 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -903,8 +903,13 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CFunctionContext(CMemoryPool* pool) cdef cppclass CCastOptions" arrow::compute::CastOptions": + CCastOptions() + CCastOptions(c_bool safe) + CCastOptions Safe() + CCastOptions Unsafe() c_bool allow_int_overflow c_bool allow_time_truncate + c_bool allow_float_truncate enum DatumType" arrow::compute::Datum::type": DatumType_NONE" arrow::compute::Datum::NONE" diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5ba17020cd9..6a43fe2fd56 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -316,7 +316,8 @@ def _index_level_name(index, i, column_names): return '__index_level_{:d}__'.format(i) -def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None): +def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None, + safe=True): if columns is None: columns = 
df.columns column_names = [] @@ -366,7 +367,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None): def convert_column(col, ty): try: - return pa.array(col, from_pandas=True, type=ty) + return pa.array(col, type=ty, from_pandas=True, safe=safe) except (pa.ArrowInvalid, pa.ArrowNotImplementedError, pa.ArrowTypeError) as e: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 513da28bb44..4780eff6c51 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -139,10 +139,8 @@ cdef class ChunkedArray: return result - def to_pandas(self, - c_bool strings_to_categorical=False, - c_bool zero_copy_only=False, - c_bool integer_object_nulls=False): + def to_pandas(self, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False): """ Convert the arrow::ChunkedArray to an array object suitable for use in pandas @@ -411,7 +409,7 @@ cdef class Column: def from_array(*args): return column(*args) - def cast(self, object target_type, safe=True): + def cast(self, object target_type, bint safe=True): """ Cast column values to another data type @@ -427,16 +425,11 @@ cdef class Column: casted : Column """ cdef: - CCastOptions options + CCastOptions options = CCastOptions(safe) + DataType type = _ensure_type(target_type) shared_ptr[CArray] result - DataType type CDatum out - type = _ensure_type(target_type) - - options.allow_int_overflow = not safe - options.allow_time_truncate = not safe - with nogil: check_status(Cast(_context(), CDatum(self.column.data()), type.sp_type, options, &out)) @@ -489,10 +482,8 @@ cdef class Column: return [pyarrow_wrap_column(col) for col in flattened] - def to_pandas(self, - c_bool strings_to_categorical=False, - c_bool zero_copy_only=False, - c_bool integer_object_nulls=False): + def to_pandas(self, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False): """ Convert the arrow::Column to a pandas.Series @@ -863,7 +854,7 @@ cdef class RecordBatch: entries.append((name, column)) return OrderedDict(entries) - def to_pandas(self, use_threads=True): + def to_pandas(self, bint use_threads=True): """ Convert the arrow::RecordBatch to a pandas DataFrame @@ -1089,7 +1080,7 @@ cdef class Table: @classmethod def from_pandas(cls, df, Schema schema=None, bint preserve_index=True, - nthreads=None, columns=None): + nthreads=None, columns=None, bint safe=True): """ Convert pandas.DataFrame to an Arrow Table. @@ -1120,7 +1111,8 @@ cdef class Table: indicated number of threads columns : list, optional List of column to be converted. If None, use all columns. 
- + safe : boolean, default True + Check for overflows or other unsafe conversions Returns ------- @@ -1143,7 +1135,8 @@ cdef class Table: schema=schema, preserve_index=preserve_index, nthreads=nthreads, - columns=columns + columns=columns, + safe=safe ) return cls.from_arrays(arrays, names=names, metadata=metadata) @@ -1291,9 +1284,9 @@ cdef class Table: return result - def to_pandas(self, strings_to_categorical=False, - memory_pool=None, zero_copy_only=False, categories=None, - integer_object_nulls=False, use_threads=True): + def to_pandas(self, bint strings_to_categorical=False, + memory_pool=None, bint zero_copy_only=False, categories=None, + bint integer_object_nulls=False, bint use_threads=True): """ Convert the arrow::Table to a pandas DataFrame diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 4f65547757e..3fa7cf4c34e 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -530,7 +530,7 @@ def test_float_nulls_to_ints(self): # ARROW-2135 df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]}) schema = pa.schema([pa.field("a", pa.int16(), nullable=True)]) - table = pa.Table.from_pandas(df, schema=schema) + table = pa.Table.from_pandas(df, schema=schema, safe=False) assert table[0].to_pylist() == [1, 2, None] tm.assert_frame_equal(df, table.to_pandas()) @@ -2056,6 +2056,24 @@ def test_mixed_integer_columns(self): expected_df = pd.DataFrame(data=[row], columns=['foo', '123']) _check_pandas_roundtrip(df, expected=expected_df, preserve_index=True) + def test_safe_unsafe_casts(self): + # ARROW-2799 + df = pd.DataFrame({ + 'A': list('abc'), + 'B': np.linspace(0, 1, 3) + }) + + schema = pa.schema([ + pa.field('A', pa.string()), + pa.field('B', pa.int32()) + ]) + + with pytest.raises(ValueError): + pa.Table.from_pandas(df, schema=schema) + + table = pa.Table.from_pandas(df, schema=schema, safe=False) + assert table.column('B').type == pa.int32() + def _fully_loaded_dataframe_example(): from distutils.version import LooseVersion From 38dfea3e6c7855e87c03c3e1b76cc64ca3349250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 13:07:46 +0200 Subject: [PATCH 7/8] disallow float truncation by default --- cpp/src/arrow/compute/kernels/cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 8392c188dfd..49b12b9d561 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(true) {} + allow_float_truncate(false) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), From 7bf9efd361ce6c9a6095860291bfc0423be85672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 19:17:34 +0200 Subject: [PATCH 8/8] unsafe table creation during parquet dataset partitioning --- python/pyarrow/parquet.py | 33 ++++++++++++++-------------- python/pyarrow/tests/test_parquet.py | 4 ++-- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 6c2539ccce6..9fa97b4e6e8 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -24,12 +24,14 @@ import numpy as np import pyarrow as pa -import pyarrow._parquet as _parquet import pyarrow.lib as lib +import pyarrow._parquet as 
_parquet + from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa FileMetaData, RowGroupMetaData, ColumnChunkMetaData, ParquetSchema, ColumnSchema) +from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, _get_fs_from_path) from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads @@ -53,6 +55,7 @@ class ParquetFile(object): Will be used in reads for pandas schema metadata if not found in the main file's metadata, no other uses at the moment """ + def __init__(self, source, metadata=None, common_metadata=None): self.reader = ParquetReader() self.reader.open(source, metadata=metadata) @@ -1124,11 +1127,6 @@ def write_to_dataset(table, root_path, partition_cols=None, Parameter for instantiating Table; preserve pandas index or not. **kwargs : dict, kwargs for write_table function. """ - from pyarrow import ( - Table, - compat - ) - if filesystem is None: fs = _get_fs_from_path(root_path) else: @@ -1142,7 +1140,7 @@ def write_to_dataset(table, root_path, partition_cols=None, data_df = df.drop(partition_cols, axis='columns') data_cols = df.columns.drop(partition_cols) if len(data_cols) == 0: - raise ValueError("No data left to save outside partition columns") + raise ValueError('No data left to save outside partition columns') subschema = table.schema # ARROW-2891: Ensure the output_schema is preserved when writing a # partitioned dataset @@ -1152,21 +1150,22 @@ def write_to_dataset(table, root_path, partition_cols=None, for keys, subgroup in data_df.groupby(partition_keys): if not isinstance(keys, tuple): keys = (keys,) - subdir = "/".join( - ["{colname}={value}".format(colname=name, value=val) + subdir = '/'.join( + ['{colname}={value}'.format(colname=name, value=val) for name, val in zip(partition_cols, keys)]) - subtable = Table.from_pandas(subgroup, - preserve_index=preserve_index, - schema=subschema) - prefix = "/".join([root_path, subdir]) + subtable = pa.Table.from_pandas(subgroup, + preserve_index=preserve_index, + schema=subschema, + safe=False) + prefix = '/'.join([root_path, subdir]) _mkdir_if_not_exists(fs, prefix) - outfile = compat.guid() + ".parquet" - full_path = "/".join([prefix, outfile]) + outfile = guid() + '.parquet' + full_path = '/'.join([prefix, outfile]) with fs.open(full_path, 'wb') as f: write_table(subtable, f, **kwargs) else: - outfile = compat.guid() + ".parquet" - full_path = "/".join([root_path, outfile]) + outfile = guid() + '.parquet' + full_path = '/'.join([root_path, outfile]) with fs.open(full_path, 'wb') as f: write_table(table, f, **kwargs) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 556b1558a51..64fd82d603d 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -643,7 +643,7 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, distinct_count): df = pd.DataFrame({'data': data}) schema = pa.schema([pa.field('data', type)]) - table = pa.Table.from_pandas(df, schema=schema) + table = pa.Table.from_pandas(df, schema=schema, safe=False) fileh = make_sample_file(table) meta = fileh.metadata @@ -1812,7 +1812,7 @@ def _test_write_to_dataset_with_partitions(base_path, dtype='datetime64[D]')}) cols = output_df.columns.tolist() partition_by = ['group1', 'group2'] - output_table = pa.Table.from_pandas(output_df, schema=schema) + output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False) pq.write_to_dataset(output_table, base_path, partition_by, 
filesystem=filesystem)
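
Note on usage: taken together, the series threads a single safe flag from Table.from_pandas (and Array.cast / Column.cast) down to the C++ CastOptions, with safe=True as the default once patch 7 settles allow_float_truncate back to false; patch 8 then passes safe=False where write_to_dataset re-casts partition subgroups against the preserved schema, so the stricter defaults do not break existing partitioned writes. Below is a minimal sketch of the resulting behavior, not part of the patches themselves, assuming a pyarrow build with all eight patches applied (pandas and numpy required):

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'A': list('abc'), 'B': np.linspace(0, 1, 3)})
    schema = pa.schema([pa.field('A', pa.string()),
                        pa.field('B', pa.int32())])

    # Safe by default: casting float64 -> int32 would truncate 0.5,
    # so this raises (the test added in patch 6 expects a ValueError).
    try:
        pa.Table.from_pandas(df, schema=schema)
    except ValueError as exc:
        print('refused unsafe cast:', exc)

    # Opting out performs the truncating cast instead.
    table = pa.Table.from_pandas(df, schema=schema, safe=False)
    assert table.column('B').type == pa.int32()

    # The same flag is exposed on Array.cast (and Column.cast).
    arr = pa.array([0.0, 0.5, 1.0])
    print(arr.cast(pa.int32(), safe=False))  # fractional parts are dropped

One design point worth noting: the CCastOptions(safe) constructor now used in array.pxi and table.pxi replaces the previous hand-set flag pairs, deriving allow_int_overflow, allow_time_truncate, and (new in this series) allow_float_truncate from a single boolean, so Python callers can no longer end up with an inconsistent combination of cast options.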