2 changes: 1 addition & 1 deletion cpp/src/arrow/compute/kernels/cast.h
@@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions {
CastOptions()
: allow_int_overflow(false),
allow_time_truncate(false),
allow_float_truncate(true) {}
allow_float_truncate(false) {}

explicit CastOptions(bool safe)
: allow_int_overflow(!safe),
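The default `CastOptions()` now starts fully safe: `allow_float_truncate` joins the other flags in defaulting to `false`. A minimal sketch of the resulting Python-level behaviour, assuming the `safe` keyword plumbing added later in this diff (values and the exact error are illustrative):

```python
import numpy as np
import pyarrow as pa

values = np.array([0.5, 1.5, 2.5])

# Default (safe) conversion: the lossy float -> int32 cast is rejected.
# pa.array(values, type=pa.int32())   # raises a ValueError subclass
# Explicitly opting out restores the old truncating behaviour:
unsafe = pa.array(values, type=pa.int32(), safe=False)  # roughly [0, 1, 2]
```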
14 changes: 3 additions & 11 deletions python/pyarrow/array.pxi
@@ -53,10 +53,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type,
cdef:
shared_ptr[CChunkedArray] chunked_out
shared_ptr[CDataType] c_type
CCastOptions cast_options

cast_options.allow_int_overflow = not safe
cast_options.allow_time_truncate = not safe
CCastOptions cast_options = CCastOptions(safe)

dtype = values.dtype

@@ -406,14 +403,9 @@ cdef class Array:
casted : Array
"""
cdef:
CCastOptions options
CCastOptions options = CCastOptions(safe)
DataType type = _ensure_type(target_type)
shared_ptr[CArray] result
DataType type

type = _ensure_type(target_type)

options.allow_int_overflow = not safe
options.allow_time_truncate = not safe

with nogil:
check_status(Cast(_context(), self.ap[0], type.sp_type,
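Both conversion entry points now build their `CastOptions` from the single `safe` flag instead of toggling individual fields. A hedged usage sketch of `Array.cast` with the flag (the error type is assumed to be pyarrow's `ArrowInvalid`, a `ValueError` subclass; values are illustrative):

```python
import pyarrow as pa

arr = pa.array([1.2, 2.7, 3.0])

try:
    arr.cast(pa.int64())                 # safe=True by default: refuses to truncate
except (pa.ArrowInvalid, ValueError):
    pass

truncated = arr.cast(pa.int64(), safe=False)   # roughly [1, 2, 3]
```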
5 changes: 5 additions & 0 deletions python/pyarrow/includes/libarrow.pxd
@@ -903,8 +903,13 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil:
CFunctionContext(CMemoryPool* pool)

cdef cppclass CCastOptions" arrow::compute::CastOptions":
CCastOptions()
CCastOptions(c_bool safe)
CCastOptions Safe()
CCastOptions Unsafe()
c_bool allow_int_overflow
c_bool allow_time_truncate
c_bool allow_float_truncate

enum DatumType" arrow::compute::Datum::type":
DatumType_NONE" arrow::compute::Datum::NONE"
5 changes: 3 additions & 2 deletions python/pyarrow/pandas_compat.py
@@ -316,7 +316,8 @@ def _index_level_name(index, i, column_names):
return '__index_level_{:d}__'.format(i)


def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):
def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
safe=True):
if columns is None:
columns = df.columns
column_names = []
@@ -366,7 +367,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None):

def convert_column(col, ty):
try:
return pa.array(col, from_pandas=True, type=ty)
return pa.array(col, type=ty, from_pandas=True, safe=safe)
except (pa.ArrowInvalid,
pa.ArrowNotImplementedError,
pa.ArrowTypeError) as e:
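`convert_column` now forwards the caller's `safe` flag into `pa.array`. Shown in isolation, with illustrative data, this is roughly the per-column call it makes:

```python
import pandas as pd
import pyarrow as pa

col = pd.Series([1.0, 2.0, None])

# from_pandas=True treats NaN as null; safe=False permits the narrowing
# float64 -> int16 conversion that the safe default would reject.
arr = pa.array(col, type=pa.int16(), from_pandas=True, safe=False)
# arr is roughly <Int16Array: [1, 2, null]>
```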
33 changes: 16 additions & 17 deletions python/pyarrow/parquet.py
@@ -24,12 +24,14 @@
import numpy as np

import pyarrow as pa
import pyarrow._parquet as _parquet
import pyarrow.lib as lib
import pyarrow._parquet as _parquet

from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa
FileMetaData, RowGroupMetaData,
ColumnChunkMetaData,
ParquetSchema, ColumnSchema)
from pyarrow.compat import guid
from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem,
_get_fs_from_path)
from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads
@@ -53,6 +55,7 @@ class ParquetFile(object):
Will be used in reads for pandas schema metadata if not found in the
main file's metadata, no other uses at the moment
"""

def __init__(self, source, metadata=None, common_metadata=None):
self.reader = ParquetReader()
self.reader.open(source, metadata=metadata)
@@ -1124,11 +1127,6 @@ def write_to_dataset(table, root_path, partition_cols=None,
Parameter for instantiating Table; preserve pandas index or not.
**kwargs : dict, kwargs for write_table function.
"""
from pyarrow import (
Table,
compat
)

if filesystem is None:
fs = _get_fs_from_path(root_path)
else:
@@ -1142,7 +1140,7 @@
data_df = df.drop(partition_cols, axis='columns')
data_cols = df.columns.drop(partition_cols)
if len(data_cols) == 0:
raise ValueError("No data left to save outside partition columns")
raise ValueError('No data left to save outside partition columns')
subschema = table.schema
# ARROW-2891: Ensure the output_schema is preserved when writing a
# partitioned dataset
@@ -1152,21 +1150,22 @@
for keys, subgroup in data_df.groupby(partition_keys):
if not isinstance(keys, tuple):
keys = (keys,)
subdir = "/".join(
["{colname}={value}".format(colname=name, value=val)
subdir = '/'.join(
['{colname}={value}'.format(colname=name, value=val)
for name, val in zip(partition_cols, keys)])
subtable = Table.from_pandas(subgroup,
preserve_index=preserve_index,
schema=subschema)
prefix = "/".join([root_path, subdir])
subtable = pa.Table.from_pandas(subgroup,
preserve_index=preserve_index,
schema=subschema,
safe=False)
prefix = '/'.join([root_path, subdir])
_mkdir_if_not_exists(fs, prefix)
outfile = compat.guid() + ".parquet"
full_path = "/".join([prefix, outfile])
outfile = guid() + '.parquet'
full_path = '/'.join([prefix, outfile])
with fs.open(full_path, 'wb') as f:
write_table(subtable, f, **kwargs)
else:
outfile = compat.guid() + ".parquet"
full_path = "/".join([root_path, outfile])
outfile = guid() + '.parquet'
full_path = '/'.join([root_path, outfile])
with fs.open(full_path, 'wb') as f:
write_table(table, f, **kwargs)

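The partitioned path now calls `pa.Table.from_pandas(..., safe=False)` and `guid()` from `pyarrow.compat` via module-level imports instead of local ones; presumably `safe=False` is passed so that re-converting each pandas subgroup back to the original subschema does not trip the stricter default cast checks. A hedged end-to-end sketch of the call this hunk touches (paths are illustrative):

```python
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

df = pd.DataFrame({'year': [2017, 2017, 2018], 'value': [1, 2, 3]})
table = pa.Table.from_pandas(df)

# Each partition lands in a Hive-style <column>=<value> directory,
# e.g. <root_path>/year=2017/<uuid>.parquet
pq.write_to_dataset(table, '/tmp/dataset_root', partition_cols=['year'])
```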
39 changes: 16 additions & 23 deletions python/pyarrow/table.pxi
@@ -139,10 +139,8 @@ cdef class ChunkedArray:

return result

def to_pandas(self,
c_bool strings_to_categorical=False,
c_bool zero_copy_only=False,
c_bool integer_object_nulls=False):
def to_pandas(self, bint strings_to_categorical=False,
bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert the arrow::ChunkedArray to an array object suitable for use
in pandas
@@ -411,7 +409,7 @@ cdef class Column:
def from_array(*args):
return column(*args)

def cast(self, object target_type, safe=True):
def cast(self, object target_type, bint safe=True):
"""
Cast column values to another data type

@@ -427,16 +425,11 @@
casted : Column
"""
cdef:
CCastOptions options
CCastOptions options = CCastOptions(safe)
DataType type = _ensure_type(target_type)
shared_ptr[CArray] result
DataType type
CDatum out

type = _ensure_type(target_type)

options.allow_int_overflow = not safe
options.allow_time_truncate = not safe

with nogil:
check_status(Cast(_context(), CDatum(self.column.data()),
type.sp_type, options, &out))
@@ -489,10 +482,8 @@ cdef class Column:

return [pyarrow_wrap_column(col) for col in flattened]

def to_pandas(self,
c_bool strings_to_categorical=False,
c_bool zero_copy_only=False,
c_bool integer_object_nulls=False):
def to_pandas(self, bint strings_to_categorical=False,
bint zero_copy_only=False, bint integer_object_nulls=False):
"""
Convert the arrow::Column to a pandas.Series

@@ -863,7 +854,7 @@ cdef class RecordBatch:
entries.append((name, column))
return OrderedDict(entries)

def to_pandas(self, use_threads=True):
def to_pandas(self, bint use_threads=True):
"""
Convert the arrow::RecordBatch to a pandas DataFrame

@@ -1089,7 +1080,7 @@ cdef class Table:

@classmethod
def from_pandas(cls, df, Schema schema=None, bint preserve_index=True,
nthreads=None, columns=None):
nthreads=None, columns=None, bint safe=True):
"""
Convert pandas.DataFrame to an Arrow Table.

@@ -1120,7 +1111,8 @@
indicated number of threads
columns : list, optional
List of column to be converted. If None, use all columns.

safe : boolean, default True
Check for overflows or other unsafe conversions

Returns
-------
Expand All @@ -1143,7 +1135,8 @@ cdef class Table:
schema=schema,
preserve_index=preserve_index,
nthreads=nthreads,
columns=columns
columns=columns,
safe=safe
)
return cls.from_arrays(arrays, names=names, metadata=metadata)

@@ -1291,9 +1284,9 @@

return result

def to_pandas(self, strings_to_categorical=False,
memory_pool=None, zero_copy_only=False, categories=None,
integer_object_nulls=False, use_threads=True):
def to_pandas(self, bint strings_to_categorical=False,
memory_pool=None, bint zero_copy_only=False, categories=None,
bint integer_object_nulls=False, bint use_threads=True):
"""
Convert the arrow::Table to a pandas DataFrame

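`Table.from_pandas` exposes the new `safe` keyword and threads it through `dataframe_to_arrays`, and `Column.cast` gains the same flag. A sketch mirroring the new test below (column access by position is assumed):

```python
import numpy as np
import pandas as pd
import pyarrow as pa

df = pd.DataFrame({'A': list('abc'), 'B': np.linspace(0, 1, 3)})
schema = pa.schema([pa.field('A', pa.string()),
                    pa.field('B', pa.int32())])

# safe=True (the default) rejects the lossy float64 -> int32 conversion;
# safe=False reproduces the previously permissive behaviour.
table = pa.Table.from_pandas(df, schema=schema, safe=False)

# Column.cast accepts the same flag; widening back to float64 is safe.
table[1].cast(pa.float64(), safe=True)
```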
20 changes: 19 additions & 1 deletion python/pyarrow/tests/test_convert_pandas.py
@@ -530,7 +530,7 @@ def test_float_nulls_to_ints(self):
# ARROW-2135
df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]})
schema = pa.schema([pa.field("a", pa.int16(), nullable=True)])
table = pa.Table.from_pandas(df, schema=schema)
table = pa.Table.from_pandas(df, schema=schema, safe=False)
assert table[0].to_pylist() == [1, 2, None]
tm.assert_frame_equal(df, table.to_pandas())

@@ -2056,6 +2056,24 @@ def test_mixed_integer_columns(self):
expected_df = pd.DataFrame(data=[row], columns=['foo', '123'])
_check_pandas_roundtrip(df, expected=expected_df, preserve_index=True)

def test_safe_unsafe_casts(self):
# ARROW-2799
df = pd.DataFrame({
'A': list('abc'),
'B': np.linspace(0, 1, 3)
})

schema = pa.schema([
pa.field('A', pa.string()),
pa.field('B', pa.int32())
])

with pytest.raises(ValueError):
pa.Table.from_pandas(df, schema=schema)

table = pa.Table.from_pandas(df, schema=schema, safe=False)
assert table.column('B').type == pa.int32()


def _fully_loaded_dataframe_example():
from distutils.version import LooseVersion
4 changes: 2 additions & 2 deletions python/pyarrow/tests/test_parquet.py
@@ -643,7 +643,7 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value,
distinct_count):
df = pd.DataFrame({'data': data})
schema = pa.schema([pa.field('data', type)])
table = pa.Table.from_pandas(df, schema=schema)
table = pa.Table.from_pandas(df, schema=schema, safe=False)
fileh = make_sample_file(table)

meta = fileh.metadata
@@ -1812,7 +1812,7 @@ def _test_write_to_dataset_with_partitions(base_path,
dtype='datetime64[D]')})
cols = output_df.columns.tolist()
partition_by = ['group1', 'group2']
output_table = pa.Table.from_pandas(output_df, schema=schema)
output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False)
pq.write_to_dataset(output_table, base_path, partition_by,
filesystem=filesystem)
