diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index ebe825f65c4..2173232d4ef 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -444,6 +444,7 @@ set(CYTHON_EXTENSIONS error scalar schema + table ) foreach(module ${CYTHON_EXTENSIONS}) diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index c343f5ba5f1..40a09c2feae 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -41,4 +41,6 @@ list_, struct, field, DataType, Field, Schema, schema) -from pyarrow.array import RowBatch, Table, from_pandas_dataframe +from pyarrow.array import RowBatch, from_pandas_dataframe + +from pyarrow.table import Column, Table diff --git a/python/pyarrow/array.pxd b/python/pyarrow/array.pxd index de3c7741962..8cd15cd4502 100644 --- a/python/pyarrow/array.pxd +++ b/python/pyarrow/array.pxd @@ -36,6 +36,8 @@ cdef class Array: cdef init(self, const shared_ptr[CArray]& sp_array) cdef getitem(self, int i) +cdef object box_arrow_array(const shared_ptr[CArray]& sp_array) + cdef class BooleanArray(Array): pass diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 255efc268fe..456bf6d1da8 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -33,6 +33,8 @@ from pyarrow.scalar import NA from pyarrow.schema cimport Schema import pyarrow.schema as schema +from pyarrow.table cimport Table + def total_allocated_bytes(): cdef MemoryPool* pool = pyarrow.GetMemoryPool() return pool.bytes_allocated() @@ -287,76 +289,3 @@ cdef class RowBatch: return self.arrays[i] -cdef class Table: - ''' - Do not call this class's constructor directly. 
- ''' - cdef: - shared_ptr[CTable] sp_table - CTable* table - - def __cinit__(self): - pass - - cdef init(self, const shared_ptr[CTable]& table): - self.sp_table = table - self.table = table.get() - - @staticmethod - def from_pandas(df, name=None): - pass - - @staticmethod - def from_arrays(names, arrays, name=None): - cdef: - Array arr - Table result - c_string c_name - vector[shared_ptr[CField]] fields - vector[shared_ptr[CColumn]] columns - shared_ptr[CSchema] schema - shared_ptr[CTable] table - - cdef int K = len(arrays) - - fields.resize(K) - columns.resize(K) - for i in range(K): - arr = arrays[i] - c_name = tobytes(names[i]) - - fields[i].reset(new CField(c_name, arr.type.sp_type, True)) - columns[i].reset(new CColumn(fields[i], arr.sp_array)) - - if name is None: - c_name = '' - else: - c_name = tobytes(name) - - schema.reset(new CSchema(fields)) - table.reset(new CTable(c_name, schema, columns)) - - result = Table() - result.init(table) - - return result - - def to_pandas(self): - """ - Convert the arrow::Table to a pandas DataFrame - """ - cdef: - PyObject* arr - shared_ptr[CColumn] col - - import pandas as pd - - names = [] - data = [] - for i in range(self.table.num_columns()): - col = self.table.column(i) - check_status(pyarrow.ArrowToPandas(col, &arr)) - names.append(frombytes(col.get().name())) - data.append( arr) - - return pd.DataFrame(dict(zip(names, data)), columns=names) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 42f1f25073d..b2ef45a347b 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -149,7 +149,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string GetString(int i) cdef cppclass CChunkedArray" arrow::ChunkedArray": - pass + int64_t length() + int64_t null_count() + int num_chunks() + const shared_ptr[CArray]& chunk(int i) cdef cppclass CColumn" arrow::Column": CColumn(const shared_ptr[CField]& field, diff --git 
a/python/pyarrow/schema.pxd b/python/pyarrow/schema.pxd index 61458b765c7..f2cb776eb2e 100644 --- a/python/pyarrow/schema.pxd +++ b/python/pyarrow/schema.pxd @@ -41,5 +41,7 @@ cdef class Schema: CSchema* schema cdef init(self, const vector[shared_ptr[CField]]& fields) + cdef init_schema(self, const shared_ptr[CSchema]& schema) cdef DataType box_data_type(const shared_ptr[CDataType]& type) +cdef Schema box_schema(const shared_ptr[CSchema]& schema) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index b3bf02aad76..22ddf0cf17e 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -106,6 +106,10 @@ cdef class Schema: self.schema = new CSchema(fields) self.sp_schema.reset(self.schema) + cdef init_schema(self, const shared_ptr[CSchema]& schema): + self.schema = schema.get() + self.sp_schema = schema + @classmethod def from_fields(cls, fields): cdef: @@ -223,3 +227,8 @@ cdef DataType box_data_type(const shared_ptr[CDataType]& type): cdef DataType out = DataType() out.init(type) return out + +cdef Schema box_schema(const shared_ptr[CSchema]& type): + cdef Schema out = Schema() + out.init_schema(type) + return out diff --git a/python/pyarrow/table.pxd b/python/pyarrow/table.pxd new file mode 100644 index 00000000000..0a5c122c95c --- /dev/null +++ b/python/pyarrow/table.pxd @@ -0,0 +1,46 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow.includes.common cimport shared_ptr
+from pyarrow.includes.libarrow cimport CChunkedArray, CColumn, CTable
+
+
+cdef class ChunkedArray:
+    cdef:
+        shared_ptr[CChunkedArray] sp_chunked_array
+        CChunkedArray* chunked_array
+
+    cdef init(self, const shared_ptr[CChunkedArray]& chunked_array)
+    cdef _check_nullptr(self)
+
+
+cdef class Column:
+    cdef:
+        shared_ptr[CColumn] sp_column
+        CColumn* column
+
+    cdef init(self, const shared_ptr[CColumn]& column)
+    cdef _check_nullptr(self)
+
+
+cdef class Table:
+    cdef:
+        shared_ptr[CTable] sp_table
+        CTable* table
+
+    cdef init(self, const shared_ptr[CTable]& table)
+    cdef _check_nullptr(self)
diff --git a/python/pyarrow/table.pyx b/python/pyarrow/table.pyx
new file mode 100644
index 00000000000..4c4816f0c7e
--- /dev/null
+++ b/python/pyarrow/table.pyx
@@ -0,0 +1,311 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cython: profile=False
+# distutils: language = c++
+# cython: embedsignature = True
+
+from pyarrow.includes.libarrow cimport *
+cimport pyarrow.includes.pyarrow as pyarrow
+
+import pyarrow.config
+
+from pyarrow.array cimport Array, box_arrow_array
+from pyarrow.compat import frombytes, tobytes
+from pyarrow.error cimport check_status
+from pyarrow.schema cimport box_data_type, box_schema
+
+cdef class ChunkedArray:
+    '''
+    Python wrapper for an arrow::ChunkedArray.
+
+    Do not call this class's constructor directly.
+    '''
+
+    def __cinit__(self):
+        self.chunked_array = NULL
+
+    cdef init(self, const shared_ptr[CChunkedArray]& chunked_array):
+        # Keep the shared_ptr alongside the raw pointer taken from it.
+        self.sp_chunked_array = chunked_array
+        self.chunked_array = chunked_array.get()
+
+    cdef _check_nullptr(self):
+        # Raise rather than dereference NULL when init() was never called.
+        if self.chunked_array == NULL:
+            raise ReferenceError("ChunkedArray object references a NULL "
+                                 "pointer. Not initialized.")
+
+    def length(self):
+        """Return arrow::ChunkedArray::length() of the wrapped object."""
+        self._check_nullptr()
+        return self.chunked_array.length()
+
+    def __len__(self):
+        return self.length()
+
+    property null_count:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.chunked_array.null_count()
+
+    property num_chunks:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.chunked_array.num_chunks()
+
+    def chunk(self, i):
+        """
+        Return chunk number i boxed as a pyarrow Array.
+
+        Raises IndexError when i is outside [0, num_chunks).
+        """
+        self._check_nullptr()
+        # The C++ accessor receives the raw index, so validate it here.
+        if i < 0 or i >= self.chunked_array.num_chunks():
+            raise IndexError("chunk index out of range")
+        return box_arrow_array(self.chunked_array.chunk(i))
+
+    def iterchunks(self):
+        """Yield every chunk in order."""
+        for i in range(self.num_chunks):
+            yield self.chunk(i)
+
+
+cdef class Column:
+    '''
+    Python wrapper for an arrow::Column.
+
+    Do not call this class's constructor directly.
+    '''
+
+    def __cinit__(self):
+        self.column = NULL
+
+    cdef init(self, const shared_ptr[CColumn]& column):
+        # Keep the shared_ptr alongside the raw pointer taken from it.
+        self.sp_column = column
+        self.column = column.get()
+
+    def to_pandas(self):
+        """
+        Convert the arrow::Column to a pandas Series
+        """
+        cdef:
+            PyObject* arr
+
+        import pandas as pd
+
+        self._check_nullptr()
+        check_status(pyarrow.ArrowToPandas(self.sp_column, &arr))
+        # Cast the raw PyObject* to a Python object for the pandas call.
+        return pd.Series(<object>arr, name=self.name)
+
+    cdef _check_nullptr(self):
+        # Raise rather than dereference NULL when init() was never called.
+        if self.column == NULL:
+            raise ReferenceError("Column object references a NULL pointer. "
+                                 "Not initialized.")
+
+    def __len__(self):
+        self._check_nullptr()
+        return self.column.length()
+
+    def length(self):
+        """Return arrow::Column::length() of the wrapped object."""
+        self._check_nullptr()
+        return self.column.length()
+
+    property shape:
+
+        def __get__(self):
+            self._check_nullptr()
+            return (self.length(),)
+
+    property null_count:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.column.null_count()
+
+    property name:
+
+        def __get__(self):
+            self._check_nullptr()
+            return frombytes(self.column.name())
+
+    property type:
+
+        def __get__(self):
+            self._check_nullptr()
+            return box_data_type(self.column.type())
+
+    property data:
+
+        def __get__(self):
+            cdef ChunkedArray chunked_array = ChunkedArray()
+            self._check_nullptr()
+            chunked_array.init(self.column.data())
+            return chunked_array
+
+
+cdef class Table:
+    '''
+    Python wrapper for an arrow::Table.
+
+    Do not call this class's constructor directly.
+    '''
+
+    def __cinit__(self):
+        self.table = NULL
+
+    cdef init(self, const shared_ptr[CTable]& table):
+        # Keep the shared_ptr alongside the raw pointer taken from it.
+        self.sp_table = table
+        self.table = table.get()
+
+    cdef _check_nullptr(self):
+        # Raise rather than dereference NULL when init() was never called.
+        if self.table == NULL:
+            raise ReferenceError("Table object references a NULL pointer. "
+                                 "Not initialized.")
+
+    @staticmethod
+    def from_pandas(df, name=None):
+        # TODO: not implemented yet; this stub returns None.
+        pass
+
+    @staticmethod
+    def from_arrays(names, arrays, name=None):
+        """
+        Build a Table from parallel sequences of column names and Arrays.
+
+        names[i] names the column holding arrays[i]; name is the table
+        name and defaults to the empty string.
+        """
+        cdef:
+            Array arr
+            Table result
+            c_string c_name
+            vector[shared_ptr[CField]] fields
+            vector[shared_ptr[CColumn]] columns
+            shared_ptr[CSchema] schema
+            shared_ptr[CTable] table
+
+        cdef int K = len(arrays)
+
+        fields.resize(K)
+        columns.resize(K)
+        for i in range(K):
+            arr = arrays[i]
+            c_name = tobytes(names[i])
+
+            fields[i].reset(new CField(c_name, arr.type.sp_type, True))
+            columns[i].reset(new CColumn(fields[i], arr.sp_array))
+
+        if name is None:
+            c_name = ''
+        else:
+            c_name = tobytes(name)
+
+        schema.reset(new CSchema(fields))
+        table.reset(new CTable(c_name, schema, columns))
+
+        result = Table()
+        result.init(table)
+
+        return result
+
+    def to_pandas(self):
+        """
+        Convert the arrow::Table to a pandas DataFrame
+        """
+        cdef:
+            PyObject* arr
+            shared_ptr[CColumn] col
+
+        import pandas as pd
+
+        self._check_nullptr()
+
+        names = []
+        data = []
+        for i in range(self.table.num_columns()):
+            col = self.table.column(i)
+            check_status(pyarrow.ArrowToPandas(col, &arr))
+            names.append(frombytes(col.get().name()))
+            # Cast the raw PyObject* to a Python object before storing it.
+            data.append(<object> arr)
+
+        return pd.DataFrame(dict(zip(names, data)), columns=names)
+
+    property name:
+
+        def __get__(self):
+            self._check_nullptr()
+            return frombytes(self.table.name())
+
+    property schema:
+
+        def __get__(self):
+            self._check_nullptr()
+            return box_schema(self.table.schema())
+
+    def column(self, index):
+        """
+        Return column number index wrapped in a Column.
+
+        Raises IndexError when index is outside [0, num_columns).
+        """
+        cdef Column column = Column()
+        self._check_nullptr()
+        # The C++ accessor receives the raw index, so validate it here.
+        if index < 0 or index >= self.table.num_columns():
+            raise IndexError("column index out of range")
+        column.init(self.table.column(index))
+        return column
+
+    def __getitem__(self, i):
+        return self.column(i)
+
+    def itercolumns(self):
+        """Yield every column in order."""
+        for i in range(self.num_columns):
+            yield self.column(i)
+
+    property num_columns:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.table.num_columns()
+
+    property num_rows:
+
+        def __get__(self):
+            self._check_nullptr()
+            return self.table.num_rows()
+
+    def __len__(self):
+        return self.num_rows
+
+    property shape:
+
+        def __get__(self):
+            return (self.num_rows, self.num_columns)
+
diff --git a/python/pyarrow/tests/test_column.py b/python/pyarrow/tests/test_column.py
new file mode 100644
index 00000000000..b62f58236e0
--- /dev/null
+++ b/python/pyarrow/tests/test_column.py
@@ -0,0 +1,49 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from pyarrow.compat import unittest
+import pyarrow as arrow
+
+A = arrow
+
+import pandas as pd
+
+
+class TestColumn(unittest.TestCase):
+
+    def test_basics(self):
+        data = [
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a',), data, 'table_name')  # 1-tuple
+        column = table.column(0)
+        assert column.name == 'a'
+        assert column.length() == 5
+        assert len(column) == 5
+        assert column.shape == (5,)
+
+    def test_pandas(self):
+        data = [
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a',), data, 'table_name')  # 1-tuple
+        column = table.column(0)
+        series = column.to_pandas()
+        assert series.name == 'a'
+        assert series.shape == (5,)
+        assert series.iloc[0] == -10
+
diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py
index 2e24445bd0c..83fcbb8faff 100644
--- a/python/pyarrow/tests/test_table.py
+++ b/python/pyarrow/tests/test_table.py
@@ -20,6 +20,8 @@
 
 A = arrow
 
+import pandas as pd
+
 
 class TestRowBatch(unittest.TestCase):
 
@@ -38,3 +40,40 @@ def test_basics(self):
         assert len(batch) == num_rows
         assert batch.num_rows == num_rows
         assert batch.num_columns == len(data)
+
+
+class TestTable(unittest.TestCase):
+
+    def test_basics(self):
+        data = [
+            A.from_pylist(range(5)),
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
+        assert table.name == 'table_name'
+        assert len(table) == 5
+        assert table.num_rows == 5
+        assert table.num_columns == 2
+        assert table.shape == (5, 2)
+
+        for col in table.itercolumns():
+            for chunk in col.data.iterchunks():
+                assert chunk is not None
+
+    def test_pandas(self):
+        data = [
+            A.from_pylist(range(5)),
+            A.from_pylist([-10, -5, 0, 5, 10])
+        ]
+        table = A.Table.from_arrays(('a', 'b'), data, 'table_name')
+
+        # TODO: Use this part once from_pandas is implemented
+        # data = {'a': range(5), 'b': [-10, -5, 0, 5, 10]}
+        # df = pd.DataFrame(data)
+        # A.Table.from_pandas(df)
+
+        df = table.to_pandas()
+        assert set(df.columns) == set(('a', 'b'))
+        assert df.shape == (5, 2)
+        assert df.loc[0, 'b'] == -10  # label-based; .ix is gone in pandas 1.0
+
diff --git a/python/setup.py b/python/setup.py
index 5cc871aba9f..ebd80de46b4 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -214,7 +214,7 @@ def get_ext_built(self, name):
         return name + suffix
 
     def get_cmake_cython_names(self):
-        return ['array', 'config', 'error', 'scalar', 'schema']
+        return ['array', 'config', 'error', 'scalar', 'schema', 'table']
 
     def get_names(self):
         return self._found_names