From 194e4764a4662c6782d28fe5ac6eb45fccf45dde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 02:52:16 +0200 Subject: [PATCH 1/8] allow truncate float option and its implementation --- cpp/src/arrow/compute/kernels/cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 8392c188dfd..49b12b9d561 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(true) {} + allow_float_truncate(false) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), From 2961094251db91350570ff6a8ff03a54784724cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 10:26:04 +0200 Subject: [PATCH 2/8] set allow_float_truncate true by default --- cpp/src/arrow/compute/kernels/cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 49b12b9d561..8392c188dfd 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(false) {} + allow_float_truncate(true) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), From e0838ceb242694b37775630d6fb9d47749206f89 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Wed, 29 Aug 2018 19:52:26 +0200 Subject: [PATCH 3/8] wire CastOptions through the API --- cpp/src/arrow/python/numpy_to_arrow.h | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index 5e1c088264a..f50b626f7ad 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -26,6 +26,7 @@ #include "arrow/compute/kernels/cast.h" #include "arrow/util/visibility.h" +#include "arrow/compute/kernels/cast.h" namespace arrow { From 80e14784abf607eb1ef712c26674a7d216856340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:11:00 +0200 Subject: [PATCH 4/8] lint --- cpp/src/arrow/python/numpy_to_arrow.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc b/cpp/src/arrow/python/numpy_to_arrow.cc index ece00c286ea..9d8718cc51a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -413,8 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, - pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, + cast_options_, pool_, data)); } return Status::OK(); From 2c8207400fcd6f4c800a0da87479acb34b0d4eee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sat, 1 Sep 2018 16:47:46 +0200 Subject: [PATCH 5/8] check-format --- cpp/src/arrow/python/numpy_to_arrow.cc | 4 ++-- cpp/src/arrow/python/numpy_to_arrow.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/cpp/src/arrow/python/numpy_to_arrow.cc 
b/cpp/src/arrow/python/numpy_to_arrow.cc index 9d8718cc51a..ece00c286ea 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.cc +++ b/cpp/src/arrow/python/numpy_to_arrow.cc @@ -413,8 +413,8 @@ inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(NumPyDtypeToArrow(reinterpret_cast(dtype_), &input_type)); if (!input_type->Equals(*type_)) { - RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, - cast_options_, pool_, data)); + RETURN_NOT_OK(CastBuffer(input_type, *data, length_, nullptr, 0, type_, cast_options_, + pool_, data)); } return Status::OK(); diff --git a/cpp/src/arrow/python/numpy_to_arrow.h b/cpp/src/arrow/python/numpy_to_arrow.h index f50b626f7ad..5e1c088264a 100644 --- a/cpp/src/arrow/python/numpy_to_arrow.h +++ b/cpp/src/arrow/python/numpy_to_arrow.h @@ -26,7 +26,6 @@ #include "arrow/compute/kernels/cast.h" #include "arrow/util/visibility.h" -#include "arrow/compute/kernels/cast.h" namespace arrow { From 515a3935d2a71972f2f2b13cc52269e2798fa8ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 03:01:23 +0200 Subject: [PATCH 6/8] Table.from_pandas safe option --- python/pyarrow/array.pxi | 14 ++------ python/pyarrow/includes/libarrow.pxd | 5 +++ python/pyarrow/pandas_compat.py | 5 +-- python/pyarrow/table.pxi | 39 +++++++++------------ python/pyarrow/tests/test_convert_pandas.py | 20 ++++++++++- 5 files changed, 46 insertions(+), 37 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f9a16a334c5..362ebc6ff9d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -53,10 +53,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type, cdef: shared_ptr[CChunkedArray] chunked_out shared_ptr[CDataType] c_type - CCastOptions cast_options - - cast_options.allow_int_overflow = not safe - cast_options.allow_time_truncate = not safe + CCastOptions cast_options = CCastOptions(safe) dtype = values.dtype @@ -406,14 +403,9 @@ cdef class Array: casted : Array """ cdef: - CCastOptions options + CCastOptions options = CCastOptions(safe) + DataType type = _ensure_type(target_type) shared_ptr[CArray] result - DataType type - - type = _ensure_type(target_type) - - options.allow_int_overflow = not safe - options.allow_time_truncate = not safe with nogil: check_status(Cast(_context(), self.ap[0], type.sp_type, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8bbbfcfd661..8a91bf52c92 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -903,8 +903,13 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: CFunctionContext(CMemoryPool* pool) cdef cppclass CCastOptions" arrow::compute::CastOptions": + CCastOptions() + CCastOptions(c_bool safe) + CCastOptions Safe() + CCastOptions Unsafe() c_bool allow_int_overflow c_bool allow_time_truncate + c_bool allow_float_truncate enum DatumType" arrow::compute::Datum::type": DatumType_NONE" arrow::compute::Datum::NONE" diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index 5ba17020cd9..6a43fe2fd56 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -316,7 +316,8 @@ def _index_level_name(index, i, column_names): return '__index_level_{:d}__'.format(i) -def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None): +def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None, + safe=True): if columns is None: columns = 
df.columns column_names = [] @@ -366,7 +367,7 @@ def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None): def convert_column(col, ty): try: - return pa.array(col, from_pandas=True, type=ty) + return pa.array(col, type=ty, from_pandas=True, safe=safe) except (pa.ArrowInvalid, pa.ArrowNotImplementedError, pa.ArrowTypeError) as e: diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 513da28bb44..4780eff6c51 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -139,10 +139,8 @@ cdef class ChunkedArray: return result - def to_pandas(self, - c_bool strings_to_categorical=False, - c_bool zero_copy_only=False, - c_bool integer_object_nulls=False): + def to_pandas(self, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False): """ Convert the arrow::ChunkedArray to an array object suitable for use in pandas @@ -411,7 +409,7 @@ cdef class Column: def from_array(*args): return column(*args) - def cast(self, object target_type, safe=True): + def cast(self, object target_type, bint safe=True): """ Cast column values to another data type @@ -427,16 +425,11 @@ cdef class Column: casted : Column """ cdef: - CCastOptions options + CCastOptions options = CCastOptions(safe) + DataType type = _ensure_type(target_type) shared_ptr[CArray] result - DataType type CDatum out - type = _ensure_type(target_type) - - options.allow_int_overflow = not safe - options.allow_time_truncate = not safe - with nogil: check_status(Cast(_context(), CDatum(self.column.data()), type.sp_type, options, &out)) @@ -489,10 +482,8 @@ cdef class Column: return [pyarrow_wrap_column(col) for col in flattened] - def to_pandas(self, - c_bool strings_to_categorical=False, - c_bool zero_copy_only=False, - c_bool integer_object_nulls=False): + def to_pandas(self, bint strings_to_categorical=False, + bint zero_copy_only=False, bint integer_object_nulls=False): """ Convert the arrow::Column to a pandas.Series @@ -863,7 +854,7 @@ cdef class RecordBatch: entries.append((name, column)) return OrderedDict(entries) - def to_pandas(self, use_threads=True): + def to_pandas(self, bint use_threads=True): """ Convert the arrow::RecordBatch to a pandas DataFrame @@ -1089,7 +1080,7 @@ cdef class Table: @classmethod def from_pandas(cls, df, Schema schema=None, bint preserve_index=True, - nthreads=None, columns=None): + nthreads=None, columns=None, bint safe=True): """ Convert pandas.DataFrame to an Arrow Table. @@ -1120,7 +1111,8 @@ cdef class Table: indicated number of threads columns : list, optional List of column to be converted. If None, use all columns. 
- + safe : boolean, default True + Check for overflows or other unsafe conversions Returns ------- @@ -1143,7 +1135,8 @@ cdef class Table: schema=schema, preserve_index=preserve_index, nthreads=nthreads, - columns=columns + columns=columns, + safe=safe ) return cls.from_arrays(arrays, names=names, metadata=metadata) @@ -1291,9 +1284,9 @@ cdef class Table: return result - def to_pandas(self, strings_to_categorical=False, - memory_pool=None, zero_copy_only=False, categories=None, - integer_object_nulls=False, use_threads=True): + def to_pandas(self, bint strings_to_categorical=False, + memory_pool=None, bint zero_copy_only=False, categories=None, + bint integer_object_nulls=False, bint use_threads=True): """ Convert the arrow::Table to a pandas DataFrame diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index 4f65547757e..3fa7cf4c34e 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -530,7 +530,7 @@ def test_float_nulls_to_ints(self): # ARROW-2135 df = pd.DataFrame({"a": [1.0, 2.0, pd.np.NaN]}) schema = pa.schema([pa.field("a", pa.int16(), nullable=True)]) - table = pa.Table.from_pandas(df, schema=schema) + table = pa.Table.from_pandas(df, schema=schema, safe=False) assert table[0].to_pylist() == [1, 2, None] tm.assert_frame_equal(df, table.to_pandas()) @@ -2056,6 +2056,24 @@ def test_mixed_integer_columns(self): expected_df = pd.DataFrame(data=[row], columns=['foo', '123']) _check_pandas_roundtrip(df, expected=expected_df, preserve_index=True) + def test_safe_unsafe_casts(self): + # ARROW-2799 + df = pd.DataFrame({ + 'A': list('abc'), + 'B': np.linspace(0, 1, 3) + }) + + schema = pa.schema([ + pa.field('A', pa.string()), + pa.field('B', pa.int32()) + ]) + + with pytest.raises(ValueError): + pa.Table.from_pandas(df, schema=schema) + + table = pa.Table.from_pandas(df, schema=schema, safe=False) + assert table.column('B').type == pa.int32() + def _fully_loaded_dataframe_example(): from distutils.version import LooseVersion From 38dfea3e6c7855e87c03c3e1b76cc64ca3349250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 13:07:46 +0200 Subject: [PATCH 7/8] disallow float truncation by default --- cpp/src/arrow/compute/kernels/cast.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/compute/kernels/cast.h b/cpp/src/arrow/compute/kernels/cast.h index 8392c188dfd..49b12b9d561 100644 --- a/cpp/src/arrow/compute/kernels/cast.h +++ b/cpp/src/arrow/compute/kernels/cast.h @@ -38,7 +38,7 @@ struct ARROW_EXPORT CastOptions { CastOptions() : allow_int_overflow(false), allow_time_truncate(false), - allow_float_truncate(true) {} + allow_float_truncate(false) {} explicit CastOptions(bool safe) : allow_int_overflow(!safe), From 7bf9efd361ce6c9a6095860291bfc0423be85672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kriszti=C3=A1n=20Sz=C5=B1cs?= Date: Sun, 2 Sep 2018 19:17:34 +0200 Subject: [PATCH 8/8] unsafe table creation during parquet dataset partitioning --- python/pyarrow/parquet.py | 33 ++++++++++++++-------------- python/pyarrow/tests/test_parquet.py | 4 ++-- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py index 6c2539ccce6..9fa97b4e6e8 100644 --- a/python/pyarrow/parquet.py +++ b/python/pyarrow/parquet.py @@ -24,12 +24,14 @@ import numpy as np import pyarrow as pa -import pyarrow._parquet as _parquet import pyarrow.lib as lib +import pyarrow._parquet as 
_parquet + from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa FileMetaData, RowGroupMetaData, ColumnChunkMetaData, ParquetSchema, ColumnSchema) +from pyarrow.compat import guid from pyarrow.filesystem import (LocalFileSystem, _ensure_filesystem, _get_fs_from_path) from pyarrow.util import _is_path_like, _stringify_path, _deprecate_nthreads @@ -53,6 +55,7 @@ class ParquetFile(object): Will be used in reads for pandas schema metadata if not found in the main file's metadata, no other uses at the moment """ + def __init__(self, source, metadata=None, common_metadata=None): self.reader = ParquetReader() self.reader.open(source, metadata=metadata) @@ -1124,11 +1127,6 @@ def write_to_dataset(table, root_path, partition_cols=None, Parameter for instantiating Table; preserve pandas index or not. **kwargs : dict, kwargs for write_table function. """ - from pyarrow import ( - Table, - compat - ) - if filesystem is None: fs = _get_fs_from_path(root_path) else: @@ -1142,7 +1140,7 @@ def write_to_dataset(table, root_path, partition_cols=None, data_df = df.drop(partition_cols, axis='columns') data_cols = df.columns.drop(partition_cols) if len(data_cols) == 0: - raise ValueError("No data left to save outside partition columns") + raise ValueError('No data left to save outside partition columns') subschema = table.schema # ARROW-2891: Ensure the output_schema is preserved when writing a # partitioned dataset @@ -1152,21 +1150,22 @@ def write_to_dataset(table, root_path, partition_cols=None, for keys, subgroup in data_df.groupby(partition_keys): if not isinstance(keys, tuple): keys = (keys,) - subdir = "/".join( - ["{colname}={value}".format(colname=name, value=val) + subdir = '/'.join( + ['{colname}={value}'.format(colname=name, value=val) for name, val in zip(partition_cols, keys)]) - subtable = Table.from_pandas(subgroup, - preserve_index=preserve_index, - schema=subschema) - prefix = "/".join([root_path, subdir]) + subtable = pa.Table.from_pandas(subgroup, + preserve_index=preserve_index, + schema=subschema, + safe=False) + prefix = '/'.join([root_path, subdir]) _mkdir_if_not_exists(fs, prefix) - outfile = compat.guid() + ".parquet" - full_path = "/".join([prefix, outfile]) + outfile = guid() + '.parquet' + full_path = '/'.join([prefix, outfile]) with fs.open(full_path, 'wb') as f: write_table(subtable, f, **kwargs) else: - outfile = compat.guid() + ".parquet" - full_path = "/".join([root_path, outfile]) + outfile = guid() + '.parquet' + full_path = '/'.join([root_path, outfile]) with fs.open(full_path, 'wb') as f: write_table(table, f, **kwargs) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 556b1558a51..64fd82d603d 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -643,7 +643,7 @@ def test_parquet_column_statistics_api(data, type, physical_type, min_value, distinct_count): df = pd.DataFrame({'data': data}) schema = pa.schema([pa.field('data', type)]) - table = pa.Table.from_pandas(df, schema=schema) + table = pa.Table.from_pandas(df, schema=schema, safe=False) fileh = make_sample_file(table) meta = fileh.metadata @@ -1812,7 +1812,7 @@ def _test_write_to_dataset_with_partitions(base_path, dtype='datetime64[D]')}) cols = output_df.columns.tolist() partition_by = ['group1', 'group2'] - output_table = pa.Table.from_pandas(output_df, schema=schema) + output_table = pa.Table.from_pandas(output_df, schema=schema, safe=False) pq.write_to_dataset(output_table, base_path, partition_by, 
filesystem=filesystem)
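
Note on usage: taken together, the series threads a single safe flag from Table.from_pandas (and Array.cast / Column.cast) down to the C++ CastOptions, with safe=True as the default once patch 7 settles allow_float_truncate back to false; patch 8 then passes safe=False where write_to_dataset re-casts partition subgroups against the preserved schema, so the stricter defaults do not break existing partitioned writes. Below is a minimal sketch of the resulting behavior, not part of the patches themselves, assuming a pyarrow build with all eight patches applied (pandas and numpy required):

    import numpy as np
    import pandas as pd
    import pyarrow as pa

    df = pd.DataFrame({'A': list('abc'), 'B': np.linspace(0, 1, 3)})
    schema = pa.schema([pa.field('A', pa.string()),
                        pa.field('B', pa.int32())])

    # Safe by default: casting float64 -> int32 would truncate 0.5,
    # so this raises (the test added in patch 6 expects a ValueError).
    try:
        pa.Table.from_pandas(df, schema=schema)
    except ValueError as exc:
        print('refused unsafe cast:', exc)

    # Opting out performs the truncating cast instead.
    table = pa.Table.from_pandas(df, schema=schema, safe=False)
    assert table.column('B').type == pa.int32()

    # The same flag is exposed on Array.cast (and Column.cast).
    arr = pa.array([0.0, 0.5, 1.0])
    print(arr.cast(pa.int32(), safe=False))  # fractional parts are dropped

One design point worth noting: the CCastOptions(safe) constructor now used in array.pxi and table.pxi replaces the previous hand-set flag pairs, deriving allow_int_overflow, allow_time_truncate, and (new in this series) allow_float_truncate from a single boolean, so Python callers can no longer end up with an inconsistent combination of cast options.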