Skip to content

Commit b3d7650

Browse files
authored
Merge branch 'master' into excel-number-format
2 parents 687e6f4 + 716efd3 commit b3d7650

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

60 files changed

+4922
-3132
lines changed

.coveragerc

+2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
[run]
33
branch = False
44
omit = */tests/*
5+
plugins = Cython.Coverage
56

67
[report]
78
# Regexes for lines to exclude from consideration
@@ -22,6 +23,7 @@ exclude_lines =
2223
if __name__ == .__main__.:
2324

2425
ignore_errors = False
26+
show_missing = True
2527

2628
[html]
2729
directory = coverage_html_report

asv_bench/benchmarks/groupby.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ def time_frame_nth(self, dtype):
142142
def time_series_nth_any(self, dtype):
143143
self.df['values'].groupby(self.df['key']).nth(0, dropna='any')
144144

145-
def time_groupby_nth_all(self, dtype):
145+
def time_series_nth_all(self, dtype):
146146
self.df['values'].groupby(self.df['key']).nth(0, dropna='all')
147147

148148
def time_series_nth(self, dtype):

asv_bench/benchmarks/reshape.py

+18
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1+
import string
12
from itertools import product
23

34
import numpy as np
45
from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long
6+
import pandas as pd
57

68
from .pandas_vb_common import setup # noqa
79

@@ -132,3 +134,19 @@ def setup(self):
132134

133135
def time_pivot_table(self):
134136
self.df.pivot_table(index='key1', columns=['key2', 'key3'])
137+
138+
139+
class GetDummies(object):
140+
goal_time = 0.2
141+
142+
def setup(self):
143+
categories = list(string.ascii_letters[:12])
144+
s = pd.Series(np.random.choice(categories, size=1_000_000),
145+
dtype=pd.api.types.CategoricalDtype(categories))
146+
self.s = s
147+
148+
def time_get_dummies_1d(self):
149+
pd.get_dummies(self.s, sparse=False)
150+
151+
def time_get_dummies_1d_sparse(self):
152+
pd.get_dummies(self.s, sparse=True)

doc/source/whatsnew/v0.24.0.txt

+60-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ v0.24.0 (Month XX, 2018)
1313
New features
1414
~~~~~~~~~~~~
1515

16+
1617
- ``ExcelWriter`` now accepts ``mode`` as a keyword argument, enabling append to existing workbooks when using the ``openpyxl`` engine (:issue:`3441`)
1718

1819
.. _whatsnew_0240.enhancements.extension_array_operators:
@@ -31,6 +32,62 @@ See the :ref:`ExtensionArray Operator Support
3132
<extending.extension.operator>` documentation section for details on both
3233
ways of adding operator support.
3334

35+
.. _whatsnew_0240.enhancements.intna:
36+
37+
Optional Integer NA Support
38+
^^^^^^^^^^^^^^^^^^^^^^^^^^^
39+
40+
Pandas has gained the ability to hold integer dtypes with missing values. This long-requested feature is enabled through the use of :ref:`extension types <extending.extension-types>`.
41+
Here is an example of the usage.
42+
43+
We can construct a ``Series`` with the specified dtype. The dtype string ``Int64`` is a pandas ``ExtensionDtype``. Specifying a list or array using the traditional missing value
44+
marker of ``np.nan`` will be inferred as integer dtype. The display of the ``Series`` will also use ``NaN`` to indicate missing values in string outputs. (:issue:`20700`, :issue:`20747`)
45+
46+
.. ipython:: python
47+
48+
s = pd.Series([1, 2, np.nan], dtype='Int64')
49+
s
50+
51+
52+
Operations on these dtypes will propagate ``NaN`` as other pandas operations do.
53+
54+
.. ipython:: python
55+
56+
# arithmetic
57+
s + 1
58+
59+
# comparison
60+
s == 1
61+
62+
# indexing
63+
s.iloc[1:3]
64+
65+
# operate with other dtypes
66+
s + s.iloc[1:3].astype('Int8')
67+
68+
# coerce when needed
69+
s + 0.01
70+
71+
These dtypes can operate as part of a ``DataFrame``.
72+
73+
.. ipython:: python
74+
75+
df = pd.DataFrame({'A': s, 'B': [1, 1, 3], 'C': list('aab')})
76+
df
77+
df.dtypes
78+
79+
80+
These dtypes can be merged, reshaped, and cast.
81+
82+
.. ipython:: python
83+
84+
pd.concat([df[['A']], df[['B', 'C']]], axis=1).dtypes
85+
df['A'].astype(float)
86+
87+
.. warning::
88+
89+
The Integer NA support currently uses the capitalized dtype version, e.g. ``Int8`` as compared to the traditional ``int8``. This may be changed at a future date.
90+
3491
.. _whatsnew_0240.enhancements.read_html:
3592

3693
``read_html`` Enhancements
@@ -258,6 +315,7 @@ Previous Behavior:
258315
ExtensionType Changes
259316
^^^^^^^^^^^^^^^^^^^^^
260317

318+
- ``ExtensionArray`` has gained the abstract method ``.dropna()`` (:issue:`21185`)
261319
- ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore
262320
the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`)
263321
- The ``ExtensionArray`` constructor ``_from_sequence`` now takes the keyword arg ``copy=False`` (:issue:`21185`)
@@ -312,6 +370,7 @@ Other API Changes
312370
- Invalid construction of ``IntervalDtype`` will now always raise a ``TypeError`` rather than a ``ValueError`` if the subdtype is invalid (:issue:`21185`)
313371
- Trying to reindex a ``DataFrame`` with a non unique ``MultiIndex`` now raises a ``ValueError`` instead of an ``Exception`` (:issue:`21770`)
314372
- :meth:`PeriodIndex.tz_convert` and :meth:`PeriodIndex.tz_localize` have been removed (:issue:`21781`)
373+
- :class:`Index` subtraction will attempt to operate element-wise instead of raising ``TypeError`` (:issue:`19369`)
315374
- :class:`pandas.io.formats.style.Styler` supports a ``number-format`` property when using :meth:`~pandas.io.formats.style.Styler.to_excel`
316375

317376
.. _whatsnew_0240.deprecations:
@@ -351,7 +410,7 @@ Performance Improvements
351410
- Improved performance of :meth:`HDFStore.groups` (and dependent functions like
352411
:meth:`~HDFStore.keys`; i.e. ``x in store`` checks are much faster)
353412
(:issue:`21372`)
354-
-
413+
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
355414

356415
.. _whatsnew_0240.docs:
357416

pandas/_libs/internals.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ def get_blkno_indexers(int64_t[:] blknos, bint group=True):
390390
start = 0
391391
cur_blkno = blknos[start]
392392

393-
if group == False:
393+
if group is False:
394394
for i in range(1, n):
395395
if blknos[i] != cur_blkno:
396396
yield cur_blkno, slice(start, i)

pandas/_libs/tslibs/period.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -1655,8 +1655,8 @@ cdef class _Period(object):
16551655
return value
16561656

16571657
def __setstate__(self, state):
1658-
self.freq=state[1]
1659-
self.ordinal=state[2]
1658+
self.freq = state[1]
1659+
self.ordinal = state[2]
16601660

16611661
def __reduce__(self):
16621662
object_state = None, self.freq, self.ordinal

pandas/_libs/src/util.pxd renamed to pandas/_libs/tslibs/util.pxd

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from numpy cimport ndarray, NPY_C_CONTIGUOUS, NPY_F_CONTIGUOUS
1+
from numpy cimport ndarray
22
cimport numpy as cnp
33
cnp.import_array()
44

@@ -64,7 +64,7 @@ cdef inline bint is_datetime64_object(object obj) nogil:
6464

6565
# --------------------------------------------------------------------
6666

67-
cdef extern from "numpy_helper.h":
67+
cdef extern from "../src/numpy_helper.h":
6868
void set_array_not_contiguous(ndarray ao)
6969

7070
int assign_value_1d(ndarray, Py_ssize_t, object) except -1
@@ -87,7 +87,7 @@ ctypedef fused numeric:
8787
cnp.float32_t
8888
cnp.float64_t
8989

90-
cdef extern from "headers/stdint.h":
90+
cdef extern from "../src/headers/stdint.h":
9191
enum: UINT8_MAX
9292
enum: UINT16_MAX
9393
enum: UINT32_MAX

pandas/_libs/util.pxd

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from tslibs.util cimport *

pandas/_libs/window.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ from numpy cimport ndarray, double_t, int64_t, float64_t
1313
cnp.import_array()
1414

1515

16-
cdef extern from "../src/headers/cmath" namespace "std":
16+
cdef extern from "src/headers/cmath" namespace "std":
1717
int signbit(double) nogil
1818
double sqrt(double x) nogil
1919

pandas/core/arrays/__init__.py

+3
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
from .base import (ExtensionArray, # noqa
2+
ExtensionOpsMixin,
23
ExtensionScalarOpsMixin)
34
from .categorical import Categorical # noqa
45
from .datetimes import DatetimeArrayMixin # noqa
56
from .interval import IntervalArray # noqa
67
from .period import PeriodArrayMixin # noqa
78
from .timedeltas import TimedeltaArrayMixin # noqa
9+
from .integer import ( # noqa
10+
IntegerArray, to_integer_array)

pandas/core/arrays/base.py

+10-6
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,8 @@
1212
from pandas.errors import AbstractMethodError
1313
from pandas.compat.numpy import function as nv
1414
from pandas.compat import set_function_name, PY3
15-
from pandas.core.dtypes.common import is_list_like
1615
from pandas.core import ops
16+
from pandas.core.dtypes.common import is_list_like
1717

1818
_not_implemented_message = "{} does not implement {}."
1919

@@ -88,16 +88,19 @@ class ExtensionArray(object):
8888
# Constructors
8989
# ------------------------------------------------------------------------
9090
@classmethod
91-
def _from_sequence(cls, scalars, copy=False):
91+
def _from_sequence(cls, scalars, dtype=None, copy=False):
9292
"""Construct a new ExtensionArray from a sequence of scalars.
9393
9494
Parameters
9595
----------
9696
scalars : Sequence
9797
Each element will be an instance of the scalar type for this
9898
array, ``cls.dtype.type``.
99+
dtype : dtype, optional
100+
Construct for this particular dtype. This should be a Dtype
101+
compatible with the ExtensionArray.
99102
copy : boolean, default False
100-
if True, copy the underlying data
103+
If True, copy the underlying data.
101104
Returns
102105
-------
103106
ExtensionArray
@@ -378,7 +381,7 @@ def fillna(self, value=None, method=None, limit=None):
378381
func = pad_1d if method == 'pad' else backfill_1d
379382
new_values = func(self.astype(object), limit=limit,
380383
mask=mask)
381-
new_values = self._from_sequence(new_values)
384+
new_values = self._from_sequence(new_values, dtype=self.dtype)
382385
else:
383386
# fill with value
384387
new_values = self.copy()
@@ -407,7 +410,7 @@ def unique(self):
407410
from pandas import unique
408411

409412
uniques = unique(self.astype(object))
410-
return self._from_sequence(uniques)
413+
return self._from_sequence(uniques, dtype=self.dtype)
411414

412415
def _values_for_factorize(self):
413416
# type: () -> Tuple[ndarray, Any]
@@ -559,7 +562,7 @@ def take(self, indices, allow_fill=False, fill_value=None):
559562
560563
result = take(data, indices, fill_value=fill_value,
561564
allow_fill=allow_fill)
562-
return self._from_sequence(result)
565+
return self._from_sequence(result, dtype=self.dtype)
563566
"""
564567
# Implementer note: The `fill_value` parameter should be a user-facing
565568
# value, an instance of self.dtype.type. When passed `fill_value=None`,
@@ -634,6 +637,7 @@ class ExtensionOpsMixin(object):
634637
"""
635638
A base class for linking the operators to their dunder names
636639
"""
640+
637641
@classmethod
638642
def _add_arithmetic_ops(cls):
639643
cls.__add__ = cls._create_arithmetic_method(operator.add)

pandas/core/arrays/categorical.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -488,8 +488,8 @@ def _constructor(self):
488488
return Categorical
489489

490490
@classmethod
491-
def _from_sequence(cls, scalars):
492-
return Categorical(scalars)
491+
def _from_sequence(cls, scalars, dtype=None, copy=False):
492+
return Categorical(scalars, dtype=dtype)
493493

494494
def copy(self):
495495
""" Copy constructor. """

0 commit comments

Comments
 (0)