Fixed factorize for MACArray #13

Merged
merged 3 commits on Mar 27, 2018
203 changes: 203 additions & 0 deletions conda-recipes/pandas/0001-pandas.patch
@@ -0,0 +1,203 @@
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 601acac20..7c89cab6b 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -44,7 +44,7 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
DataError, SpecificationError)
from pandas.core.index import (Index, MultiIndex,
CategoricalIndex, _ensure_index)
-from pandas.core.arrays import Categorical
+from pandas.core.arrays import ExtensionArray, Categorical
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame, _shared_docs
from pandas.core.internals import BlockManager, make_block
@@ -2968,7 +2968,7 @@ class Grouping(object):

# no level passed
elif not isinstance(self.grouper,
- (Series, Index, Categorical, np.ndarray)):
+ (Series, Index, ExtensionArray, np.ndarray)):
if getattr(self.grouper, 'ndim', 1) != 1:
t = self.name or str(type(self.grouper))
raise ValueError("Grouper for '%s' not 1-dimensional" % t)
diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py
index 27c106efd..f8078d279 100644
--- a/pandas/tests/extension/base/__init__.py
+++ b/pandas/tests/extension/base/__init__.py
@@ -44,6 +44,7 @@ from .casting import BaseCastingTests # noqa
from .constructors import BaseConstructorsTests # noqa
from .dtype import BaseDtypeTests # noqa
from .getitem import BaseGetitemTests # noqa
+from .groupby import BaseGroupbyTests # noqa
from .interface import BaseInterfaceTests # noqa
from .methods import BaseMethodsTests # noqa
from .missing import BaseMissingTests # noqa
diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py
new file mode 100644
index 000000000..a29ef2a50
--- /dev/null
+++ b/pandas/tests/extension/base/groupby.py
@@ -0,0 +1,69 @@
+import pytest
+
+import pandas.util.testing as tm
+import pandas as pd
+from .base import BaseExtensionTests
+
+
+class BaseGroupbyTests(BaseExtensionTests):
+ """Groupby-specific tests."""
+
+ def test_grouping_grouper(self, data_for_grouping):
+ df = pd.DataFrame({
+ "A": ["B", "B", None, None, "A", "A", "B", "C"],
+ "B": data_for_grouping
+ })
+ gr1 = df.groupby("A").grouper.groupings[0]
+ gr2 = df.groupby("B").grouper.groupings[0]
+
+ tm.assert_numpy_array_equal(gr1.grouper, df.A.values)
+ tm.assert_extension_array_equal(gr2.grouper, data_for_grouping)
+
+ @pytest.mark.parametrize('as_index', [True, False])
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping})
+ result = df.groupby("B", as_index=as_index).A.mean()
+ _, index = pd.factorize(data_for_grouping, sort=True)
+ # TODO(ExtensionIndex): remove astype
+ index = pd.Index(index.astype(object), name="B")
+ expected = pd.Series([3, 1, 4], index=index, name="A")
+ if as_index:
+ self.assert_series_equal(result, expected)
+ else:
+ expected = expected.reset_index()
+ self.assert_frame_equal(result, expected)
+
+ def test_groupby_extension_no_sort(self, data_for_grouping):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping})
+ result = df.groupby("B", sort=False).A.mean()
+ _, index = pd.factorize(data_for_grouping, sort=False)
+ # TODO(ExtensionIndex): remove astype
+ index = pd.Index(index.astype(object), name="B")
+ expected = pd.Series([1, 3, 4], index=index, name="A")
+ self.assert_series_equal(result, expected)
+
+ def test_groupby_extension_transform(self, data_for_grouping):
+ valid = data_for_grouping[~data_for_grouping.isna()]
+ df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4],
+ "B": valid})
+
+ result = df.groupby("B").A.transform(len)
+ expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
+
+ self.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize('op', [
+ lambda x: 1,
+ lambda x: [1] * len(x),
+ lambda x: pd.Series([1] * len(x)),
+ lambda x: x,
+ ], ids=['scalar', 'list', 'series', 'object'])
+ def test_groupby_extension_apply(self, data_for_grouping, op):
+ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4],
+ "B": data_for_grouping})
+ df.groupby("B").apply(op)
+ df.groupby("B").A.apply(op)
+ df.groupby("A").apply(op)
+ df.groupby("A").B.apply(op)
diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py
index 22c1a67a0..d50917056 100644
--- a/pandas/tests/extension/decimal/test_decimal.py
+++ b/pandas/tests/extension/decimal/test_decimal.py
@@ -127,6 +127,10 @@ class TestCasting(BaseDecimal, base.BaseCastingTests):
pass


+class TestGroupby(BaseDecimal, base.BaseGroupbyTests):
+ pass
+
+
def test_series_constructor_coerce_data_to_extension_dtype_raises():
xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the "
"extension array directly.")
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 51a68a370..d9ae49d87 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -113,8 +113,8 @@ class JSONArray(ExtensionArray):
return cls(data)

def _values_for_factorize(self):
- frozen = tuple(tuple(x.items()) for x in self)
- return np.array(frozen, dtype=object), ()
+ frozen = self._values_for_argsort()
+ return frozen, ()

def _values_for_argsort(self):
# Disable NumPy's shape inference by including an empty tuple...
diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py
index 63d97d5e7..5e9639c48 100644
--- a/pandas/tests/extension/json/test_json.py
+++ b/pandas/tests/extension/json/test_json.py
@@ -89,11 +89,12 @@ class TestMissing(base.BaseMissingTests):
"""We treat dictionaries as a mapping in fillna, not a scalar."""


-class TestMethods(base.BaseMethodsTests):
- unhashable = pytest.mark.skip(reason="Unhashable")
- unstable = pytest.mark.skipif(not PY36, # 3.6 or higher
- reason="Dictionary order unstable")
+unhashable = pytest.mark.skip(reason="Unhashable")
+unstable = pytest.mark.skipif(not PY36, # 3.6 or higher
+ reason="Dictionary order unstable")
+

+class TestMethods(base.BaseMethodsTests):
@unhashable
def test_value_counts(self, all_data, dropna):
pass
@@ -118,6 +119,7 @@ class TestMethods(base.BaseMethodsTests):
super(TestMethods, self).test_sort_values(
data_for_sorting, ascending)

+ @unstable
@pytest.mark.parametrize('ascending', [True, False])
def test_sort_values_missing(self, data_missing_for_sorting, ascending):
super(TestMethods, self).test_sort_values_missing(
@@ -126,3 +128,34 @@ class TestMethods(base.BaseMethodsTests):

class TestCasting(base.BaseCastingTests):
pass
+
+
+class TestGroupby(base.BaseGroupbyTests):
+
+ @unhashable
+ def test_groupby_extension_transform(self):
+ """
+ This currently fails in Series.name.setter, since the
+ name must be hashable, but the value is a dictionary.
+ I think this is what we want, i.e. `.name` should be the original
+ values, and not the values for factorization.
+ """
+
+ @unhashable
+ def test_groupby_extension_apply(self):
+ """
+ This fails in Index._do_unique_check with
+
+ > hash(val)
+ E TypeError: unhashable type: 'UserDict' with
+
+ I suspect that once we support Index[ExtensionArray],
+ we'll be able to dispatch unique.
+ """
+
+ @unstable
+ @pytest.mark.parametrize('as_index', [True, False])
+ def test_groupby_extension_agg(self, as_index, data_for_grouping):
+ super(TestGroupby, self).test_groupby_extension_agg(
+ as_index, data_for_grouping
+ )
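The patch above widens pandas' `Grouping` check from `Categorical` to any `ExtensionArray` and backports a `BaseGroupbyTests` suite, so an extension-typed column can be used directly as a groupby key. A minimal sketch of what that enables, assuming a pandas build with this patch applied and that `MACArray` is importable from the package root:

```python
import pandas as pd

from cyberpandas import MACArray  # assumption: exported at the package root

df = pd.DataFrame({
    "A": [1, 2, 3, 4],
    "B": MACArray([10, 1, 10, 1]),  # extension-typed grouping key
})

# With the patched Grouping, the extension array is accepted as a grouper and
# grouping dispatches through MACArray's factorize machinery.
print(df.groupby("B").A.mean())
```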
3 changes: 3 additions & 0 deletions conda-recipes/pandas/meta.yaml
@@ -7,6 +7,9 @@ build:

source:
git_url: https://github.com/pandas-dev/pandas
git_rev: 766a480
patches:
- 0001-pandas.patch

requirements:
build:
69 changes: 4 additions & 65 deletions cyberpandas/base.py
@@ -2,11 +2,8 @@

import numpy as np

import pandas as pd
from pandas.core.arrays import ExtensionArray

from ._utils import refactorize


class NumPyBackedExtensionArrayMixin(ExtensionArray):
@property
@@ -18,6 +15,10 @@ def dtype(self):
def _constructor_from_sequence(cls, scalars):
return cls(scalars)

@classmethod
def _from_factorized(cls, values, original):
return cls(values)

@property
def shape(self):
return (len(self.data),)
@@ -68,65 +69,3 @@ def unique(self):
_, indices = np.unique(self.data, return_index=True)
data = self.data.take(np.sort(indices))
return self._from_ndarray(data)

def factorize(self, na_sentinel=-1):
"""Factorize an IPArray into integer labels and unique values.

Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize`
will dispatch to this method.

Parameters
----------
na_sentinel : int, default -1
The value in `labels` to use for indicating missing values in
`self`.

Returns
-------
labels : ndarray
An integer-type ndarray the same length as `self`. Each newly-
observed value in `self` will be assigned the next integer.
Missing values in self are assigned `na_sentinel`.
uniques : IPArray
The unique values in `self` in order of appearance, not including
the missing value ``IPv4Address('0.0.0.0')``.

See Also
--------
pandas.factorize, pandas.Series.factorize

Examples
--------
>>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1])
>>> arr
IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1',
'0.0.0.2', '::1:0:0:0:1'])

>>> labels, uniques = arr.factorize()
>>> labels
array([ 0, 0, -1, 1, 0, 2])

Notice that `uniques` does not include the missing value.
>>> uniques
IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1'])
"""
# OK, so here's the plan.
# Start with factorizing `self.data`, which has two unfortunate issues
# 1. Requires casting to object.
# 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas.
# For now, we can't help with 1. Maybe someday.
# For 2, we can "fix" things with a little post-factorization cleanup.
l, u = pd.factorize(self.data)
mask = self.isna()
any_na = mask.any()

if any_na:
first_na = mask.argmax()
refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op

# u is an ndarray of tuples. Go to our record type, then an IPArray
u2 = type(self)((u.astype(self.dtype._record_type)))
# May have a missing value.
if any_na:
u2 = u2[~u2.isna()]
return l, u2
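The hand-rolled `factorize` above (and its `refactorize` post-processing) can go away because pandas' default `ExtensionArray.factorize` only needs the two hooks this PR wires up: `_values_for_factorize` and the `_from_factorized` classmethod added to the mixin. A rough sketch of that flow — not pandas' actual implementation, which also keeps the reported NA value out of `uniques` inside its hashtable code:

```python
import numpy as np
import pandas as pd


def factorize_via_hooks(ext_array, na_sentinel=-1):
    """Approximate the default ExtensionArray.factorize using the two hooks."""
    values, na_value = ext_array._values_for_factorize()
    # pandas hands `values` and `na_value` to its hashtable-based factorizer;
    # approximate that here with pd.factorize, marking missing positions ourselves.
    codes, uniques = pd.factorize(np.asarray(values, dtype=object))
    codes[np.asarray(ext_array.isna())] = na_sentinel
    # Rebuild an extension array of the original type from the unique values.
    return codes, type(ext_array)._from_factorized(uniques, ext_array)
```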
3 changes: 3 additions & 0 deletions cyberpandas/ip_array.py
@@ -256,6 +256,9 @@ def equals(self, other):
# TODO: missing
return (self.data == other.data).all()

def _values_for_factorize(self):
return self.astype(object), ipaddress.IPv4Address(0)

def isna(self):
ips = self.data
return (ips['lo'] == 0) & (ips['hi'] == 0)
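For `IPArray`, `_values_for_factorize` hands pandas an object-dtype ndarray plus `IPv4Address(0)` as the value to treat as missing, so the all-zeros address is coded with the sentinel and never shows up in the uniques. A small example, assuming integer inputs as in the docstring removed from `base.py` above:

```python
import pandas as pd

from cyberpandas import IPArray  # assumption: exported at the package root

arr = IPArray([2, 2, 0, 1])          # 0 -> IPv4Address('0.0.0.0'), i.e. missing
labels, uniques = pd.factorize(arr)

# labels comes back as array([ 0,  0, -1,  1]): the missing slot gets -1,
# and uniques is an IPArray holding only 0.0.0.2 and 0.0.0.1.
```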
20 changes: 17 additions & 3 deletions cyberpandas/mac_array.py
@@ -1,3 +1,5 @@
from collections import Iterable

import numpy as np
import six

@@ -60,9 +62,7 @@ def _box_scalar(scalar):
return scalar

def __setitem__(self, key, value):
from .parser import to_ipaddress

value = to_ipaddress(value).data
value = to_macaddress(value)
self.data[key] = value

def __iter__(self):
@@ -88,6 +88,10 @@ def equals(self, other):
raise TypeError
return (self.data == other.data).all()

def _values_for_factorize(self):
# Should hit pandas' UInt64Hashtable
return self, 0

def isna(self):
return (self.data == 0)

@@ -126,3 +130,13 @@ def _parse(mac):
# https://stackoverflow.com/a/36883363/1889400
mac_int = int(mac.replace(":", "").replace("-", ""), 16)
return mac_int


def to_macaddress(addresses):
if (isinstance(addresses, six.string_types) or
not isinstance(addresses, Iterable)):
addresses = [addresses]

addresses = [_parse(mac) if isinstance(mac, six.string_types) else mac
for mac in addresses]
return np.array(addresses, dtype='u8')
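`MACArray.__setitem__` previously round-tripped through the IP parser by mistake; it now goes through the new module-level `to_macaddress` helper, which accepts a single value or an iterable and returns a `uint64` ndarray. A hedged usage sketch of that helper:

```python
from cyberpandas.mac_array import to_macaddress  # module path as laid out in this repo

to_macaddress("00:00:00:00:00:ff")
# -> array([255], dtype=uint64): a lone string is wrapped into a 1-element array

to_macaddress([1, "00-00-00-00-00-02"])
# -> array([1, 2], dtype=uint64): ints pass through untouched, strings are parsed
```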
2 changes: 1 addition & 1 deletion cyberpandas/parser.py
@@ -57,7 +57,7 @@ def _to_int_pairs(values):
if isinstance(values, (str, bytes, int)):
values = ipaddress.ip_address(values)._ip
return unpack(pack(values))
elif isinstance(values, np.ndarray):
elif isinstance(values, np.ndarray) and values.dtype != object:
if values.ndim != 2:
raise ValueError("'values' should be a 2-D when passing a "
"NumPy array.")
4 changes: 2 additions & 2 deletions tests/mac/test_interface.py
@@ -31,12 +31,12 @@ def all_data(request, data, data_missing):

@pytest.fixture
def data_for_sorting():
return MACArray([10, 2 ** 64 + 1, 1])
return MACArray([10, 2 ** 64 - 1, 1])


@pytest.fixture
def data_missing_for_sorting():
return MACArray([2 ** 64 + 1, 0, 1])
return MACArray([2 ** 64 - 1, 0, 1])


@pytest.fixture
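The sorting fixtures switch from `2 ** 64 + 1` to `2 ** 64 - 1`, presumably because MAC addresses are stored as unsigned 64-bit integers and `2 ** 64 + 1` does not fit in that dtype (it would overflow, or wrap to 1 and duplicate an existing fixture value). A quick check of the bound:

```python
import numpy as np

assert np.iinfo(np.uint64).max == 2 ** 64 - 1  # largest representable MAC value
```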