diff --git a/conda-recipes/pandas/0001-pandas.patch b/conda-recipes/pandas/0001-pandas.patch new file mode 100644 index 0000000..fc2708d --- /dev/null +++ b/conda-recipes/pandas/0001-pandas.patch @@ -0,0 +1,203 @@ +diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py +index 601acac20..7c89cab6b 100644 +--- a/pandas/core/groupby.py ++++ b/pandas/core/groupby.py +@@ -44,7 +44,7 @@ from pandas.core.base import (PandasObject, SelectionMixin, GroupByError, + DataError, SpecificationError) + from pandas.core.index import (Index, MultiIndex, + CategoricalIndex, _ensure_index) +-from pandas.core.arrays import Categorical ++from pandas.core.arrays import ExtensionArray, Categorical + from pandas.core.frame import DataFrame + from pandas.core.generic import NDFrame, _shared_docs + from pandas.core.internals import BlockManager, make_block +@@ -2968,7 +2968,7 @@ class Grouping(object): + + # no level passed + elif not isinstance(self.grouper, +- (Series, Index, Categorical, np.ndarray)): ++ (Series, Index, ExtensionArray, np.ndarray)): + if getattr(self.grouper, 'ndim', 1) != 1: + t = self.name or str(type(self.grouper)) + raise ValueError("Grouper for '%s' not 1-dimensional" % t) +diff --git a/pandas/tests/extension/base/__init__.py b/pandas/tests/extension/base/__init__.py +index 27c106efd..f8078d279 100644 +--- a/pandas/tests/extension/base/__init__.py ++++ b/pandas/tests/extension/base/__init__.py +@@ -44,6 +44,7 @@ from .casting import BaseCastingTests # noqa + from .constructors import BaseConstructorsTests # noqa + from .dtype import BaseDtypeTests # noqa + from .getitem import BaseGetitemTests # noqa ++from .groupby import BaseGroupbyTests # noqa + from .interface import BaseInterfaceTests # noqa + from .methods import BaseMethodsTests # noqa + from .missing import BaseMissingTests # noqa +diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py +new file mode 100644 +index 000000000..a29ef2a50 +--- /dev/null ++++ b/pandas/tests/extension/base/groupby.py +@@ -0,0 +1,69 @@ ++import pytest ++ ++import pandas.util.testing as tm ++import pandas as pd ++from .base import BaseExtensionTests ++ ++ ++class BaseGroupbyTests(BaseExtensionTests): ++ """Groupby-specific tests.""" ++ ++ def test_grouping_grouper(self, data_for_grouping): ++ df = pd.DataFrame({ ++ "A": ["B", "B", None, None, "A", "A", "B", "C"], ++ "B": data_for_grouping ++ }) ++ gr1 = df.groupby("A").grouper.groupings[0] ++ gr2 = df.groupby("B").grouper.groupings[0] ++ ++ tm.assert_numpy_array_equal(gr1.grouper, df.A.values) ++ tm.assert_extension_array_equal(gr2.grouper, data_for_grouping) ++ ++ @pytest.mark.parametrize('as_index', [True, False]) ++ def test_groupby_extension_agg(self, as_index, data_for_grouping): ++ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], ++ "B": data_for_grouping}) ++ result = df.groupby("B", as_index=as_index).A.mean() ++ _, index = pd.factorize(data_for_grouping, sort=True) ++ # TODO(ExtensionIndex): remove astype ++ index = pd.Index(index.astype(object), name="B") ++ expected = pd.Series([3, 1, 4], index=index, name="A") ++ if as_index: ++ self.assert_series_equal(result, expected) ++ else: ++ expected = expected.reset_index() ++ self.assert_frame_equal(result, expected) ++ ++ def test_groupby_extension_no_sort(self, data_for_grouping): ++ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], ++ "B": data_for_grouping}) ++ result = df.groupby("B", sort=False).A.mean() ++ _, index = pd.factorize(data_for_grouping, sort=False) ++ # TODO(ExtensionIndex): remove astype ++ index = pd.Index(index.astype(object), name="B") ++ expected = pd.Series([1, 3, 4], index=index, name="A") ++ self.assert_series_equal(result, expected) ++ ++ def test_groupby_extension_transform(self, data_for_grouping): ++ valid = data_for_grouping[~data_for_grouping.isna()] ++ df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], ++ "B": valid}) ++ ++ result = df.groupby("B").A.transform(len) ++ expected = pd.Series([3, 3, 2, 2, 3, 1], name="A") ++ ++ self.assert_series_equal(result, expected) ++ ++ @pytest.mark.parametrize('op', [ ++ lambda x: 1, ++ lambda x: [1] * len(x), ++ lambda x: pd.Series([1] * len(x)), ++ lambda x: x, ++ ], ids=['scalar', 'list', 'series', 'object']) ++ def test_groupby_extension_apply(self, data_for_grouping, op): ++ df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], ++ "B": data_for_grouping}) ++ df.groupby("B").apply(op) ++ df.groupby("B").A.apply(op) ++ df.groupby("A").apply(op) ++ df.groupby("A").B.apply(op) +diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py +index 22c1a67a0..d50917056 100644 +--- a/pandas/tests/extension/decimal/test_decimal.py ++++ b/pandas/tests/extension/decimal/test_decimal.py +@@ -127,6 +127,10 @@ class TestCasting(BaseDecimal, base.BaseCastingTests): + pass + + ++class TestGroupby(BaseDecimal, base.BaseGroupbyTests): ++ pass ++ ++ + def test_series_constructor_coerce_data_to_extension_dtype_raises(): + xpr = ("Cannot cast data to extension dtype 'decimal'. Pass the " + "extension array directly.") +diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py +index 51a68a370..d9ae49d87 100644 +--- a/pandas/tests/extension/json/array.py ++++ b/pandas/tests/extension/json/array.py +@@ -113,8 +113,8 @@ class JSONArray(ExtensionArray): + return cls(data) + + def _values_for_factorize(self): +- frozen = tuple(tuple(x.items()) for x in self) +- return np.array(frozen, dtype=object), () ++ frozen = self._values_for_argsort() ++ return frozen, () + + def _values_for_argsort(self): + # Disable NumPy's shape inference by including an empty tuple... +diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py +index 63d97d5e7..5e9639c48 100644 +--- a/pandas/tests/extension/json/test_json.py ++++ b/pandas/tests/extension/json/test_json.py +@@ -89,11 +89,12 @@ class TestMissing(base.BaseMissingTests): + """We treat dictionaries as a mapping in fillna, not a scalar.""" + + +-class TestMethods(base.BaseMethodsTests): +- unhashable = pytest.mark.skip(reason="Unhashable") +- unstable = pytest.mark.skipif(not PY36, # 3.6 or higher +- reason="Dictionary order unstable") ++unhashable = pytest.mark.skip(reason="Unhashable") ++unstable = pytest.mark.skipif(not PY36, # 3.6 or higher ++ reason="Dictionary order unstable") ++ + ++class TestMethods(base.BaseMethodsTests): + @unhashable + def test_value_counts(self, all_data, dropna): + pass +@@ -118,6 +119,7 @@ class TestMethods(base.BaseMethodsTests): + super(TestMethods, self).test_sort_values( + data_for_sorting, ascending) + ++ @unstable + @pytest.mark.parametrize('ascending', [True, False]) + def test_sort_values_missing(self, data_missing_for_sorting, ascending): + super(TestMethods, self).test_sort_values_missing( +@@ -126,3 +128,34 @@ class TestMethods(base.BaseMethodsTests): + + class TestCasting(base.BaseCastingTests): + pass ++ ++ ++class TestGroupby(base.BaseGroupbyTests): ++ ++ @unhashable ++ def test_groupby_extension_transform(self): ++ """ ++ This currently fails in Series.name.setter, since the ++ name must be hashable, but the value is a dictionary. ++ I think this is what we want, i.e. `.name` should be the original ++ values, and not the values for factorization. ++ """ ++ ++ @unhashable ++ def test_groupby_extension_apply(self): ++ """ ++ This fails in Index._do_unique_check with ++ ++ > hash(val) ++ E TypeError: unhashable type: 'UserDict' with ++ ++ I suspect that once we support Index[ExtensionArray], ++ we'll be able to dispatch unique. ++ """ ++ ++ @unstable ++ @pytest.mark.parametrize('as_index', [True, False]) ++ def test_groupby_extension_agg(self, as_index, data_for_grouping): ++ super(TestGroupby, self).test_groupby_extension_agg( ++ as_index, data_for_grouping ++ ) diff --git a/conda-recipes/pandas/meta.yaml b/conda-recipes/pandas/meta.yaml index 8f11866..91a4ef6 100644 --- a/conda-recipes/pandas/meta.yaml +++ b/conda-recipes/pandas/meta.yaml @@ -7,6 +7,9 @@ build: source: git_url: https://github.com/pandas-dev/pandas + git_rev: 766a480 + patches: + - 0001-pandas.patch requirements: build: diff --git a/cyberpandas/base.py b/cyberpandas/base.py index 429cb96..96ba09b 100644 --- a/cyberpandas/base.py +++ b/cyberpandas/base.py @@ -2,11 +2,8 @@ import numpy as np -import pandas as pd from pandas.core.arrays import ExtensionArray -from ._utils import refactorize - class NumPyBackedExtensionArrayMixin(ExtensionArray): @property @@ -18,6 +15,10 @@ def dtype(self): def _constructor_from_sequence(cls, scalars): return cls(scalars) + @classmethod + def _from_factorized(cls, values, original): + return cls(values) + @property def shape(self): return (len(self.data),) @@ -68,65 +69,3 @@ def unique(self): _, indices = np.unique(self.data, return_index=True) data = self.data.take(np.sort(indices)) return self._from_ndarray(data) - - def factorize(self, na_sentinel=-1): - """Factorize an IPArray into integer labels and unique values. - - Calling :meth:`pandas.Series.factorize` or :meth:`pandas.factorize` - will dispatch to this method. - - Parameters - ---------- - na_sentinel : int, default -1 - The value in `labels` to use for indicating missing values in - `self`. - - Returns - ------- - labels : ndarray - An integer-type ndarray the same length as `self`. Each newly- - observed value in `self` will be assigned the next integer. - Missing values in self are assigned `na_sentinel`. - uniques : IPArray - The unique values in `self` in order of appereance, not including - the missing value ``IPv4Address('0.0.0.0')``. - - See Also - -------- - pandas.factorize, pandas.Series.factorize - - Examples - -------- - >>> arr = IPArray([2, 2, 0, 1, 2, 2**64 + 1]) - >>> arr - IPArray(['0.0.0.2', '0.0.0.2', '0.0.0.0', '0.0.0.1', - '0.0.0.2', '::1:0:0:0:1']) - - >>> labels, uniques = arr.factorize() - >>> labels - array([ 0, 0, -1, 1, 0, 2]) - - Notice that `uniques` does not include the missing value. - >>> uniques - IPArray(['0.0.0.2', '0.0.0.1', '::1:0:0:0:1']) - """ - # OK, so here's the plan. - # Start with factorizing `self.data`, which has two unfortunate issues - # 1. Requires casting to object. - # 2. Gets the NA logic wrong, since (0, 0) isn't NA to pandas. - # For now, we can't help with 1. Maybe someday. - # For 2, we can "fix" things with a little post-factorization cleanup. - l, u = pd.factorize(self.data) - mask = self.isna() - any_na = mask.any() - - if any_na: - first_na = mask.argmax() - refactorize(l, first_na, na_sentinel=na_sentinel) # inplace op - - # u is an ndarray of tuples. Go to our record type, then an IPArray - u2 = type(self)((u.astype(self.dtype._record_type))) - # May have a missing value. - if any_na: - u2 = u2[~u2.isna()] - return l, u2 diff --git a/cyberpandas/ip_array.py b/cyberpandas/ip_array.py index 682c5dc..f78f2d9 100644 --- a/cyberpandas/ip_array.py +++ b/cyberpandas/ip_array.py @@ -256,6 +256,9 @@ def equals(self, other): # TODO: missing return (self.data == other.data).all() + def _values_for_factorize(self): + return self.astype(object), ipaddress.IPv4Address(0) + def isna(self): ips = self.data return (ips['lo'] == 0) & (ips['hi'] == 0) diff --git a/cyberpandas/mac_array.py b/cyberpandas/mac_array.py index d753839..e5d1acf 100644 --- a/cyberpandas/mac_array.py +++ b/cyberpandas/mac_array.py @@ -1,3 +1,5 @@ +from collections import Iterable + import numpy as np import six @@ -60,9 +62,7 @@ def _box_scalar(scalar): return scalar def __setitem__(self, key, value): - from .parser import to_ipaddress - - value = to_ipaddress(value).data + value = to_macaddress(value) self.data[key] = value def __iter__(self): @@ -88,6 +88,10 @@ def equals(self, other): raise TypeError return (self.data == other.data).all() + def _values_for_factorize(self): + # Should hit pandas' UInt64Hashtable + return self, 0 + def isna(self): return (self.data == 0) @@ -126,3 +130,13 @@ def _parse(mac): # https://stackoverflow.com/a/36883363/1889400 mac_int = int(mac.replace(":", "").replace("-", ""), 16) return mac_int + + +def to_macaddress(addresses): + if (isinstance(addresses, six.string_types) or + not isinstance(addresses, Iterable)): + addresses = [addresses] + + addresses = [_parse(mac) if isinstance(mac, six.string_types) else mac + for mac in addresses] + return np.array(addresses, dtype='u8') diff --git a/cyberpandas/parser.py b/cyberpandas/parser.py index f2d9cfe..9b0970b 100644 --- a/cyberpandas/parser.py +++ b/cyberpandas/parser.py @@ -57,7 +57,7 @@ def _to_int_pairs(values): if isinstance(values, (str, bytes, int)): values = ipaddress.ip_address(values)._ip return unpack(pack(values)) - elif isinstance(values, np.ndarray): + elif isinstance(values, np.ndarray) and values.dtype != object: if values.ndim != 2: raise ValueError("'values' should be a 2-D when passing a " "NumPy array.") diff --git a/tests/mac/test_interface.py b/tests/mac/test_interface.py index 8211a32..405da63 100644 --- a/tests/mac/test_interface.py +++ b/tests/mac/test_interface.py @@ -31,12 +31,12 @@ def all_data(request, data, data_missing): @pytest.fixture def data_for_sorting(): - return MACArray([10, 2 ** 64 + 1, 1]) + return MACArray([10, 2 ** 64 - 1, 1]) @pytest.fixture def data_missing_for_sorting(): - return MACArray([2 ** 64 + 1, 0, 1]) + return MACArray([2 ** 64 - 1, 0, 1]) @pytest.fixture