Merge #196

196: Pandas21 compat r=andrewgsavage a=MichaelTiemannOSC - [ ] Closes # (insert issue number) - [x] Executed `pre-commit run --all-files` with no errors - [x] The change is fully covered by automated unit tests - [ ] Documented in docs/ as appropriate - [x] Added an entry to the CHANGES file CI/CD doesn't quite work yet because we don't have a Pandas 2.1 rc to point to. But comments are welcome! Co-authored-by: Michael Tiemann <72577720+MichaelTiemannOSC@users.noreply.github.com>
hgrecco · Aug 14, 2023 · c43c18b · c43c18b
2 parents 31a3055 + f5947ea
commit c43c18b
Show file tree

Hide file tree

Showing 5 changed files with 287 additions and 60 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -8,7 +8,7 @@ jobs:
  matrix:
  python-version: [3.9, "3.10", "3.11"]
  numpy: ["numpy>=1.20.3,<2.0.0"]
- pandas: ["pandas==2.0.2", ]
+ pandas: ["pandas==2.0.2", "pandas==2.1.0rc0" ]
  pint: ["pint>=0.21.1", "pint==0.22"]
 
  runs-on: ubuntu-latest

diff --git a/CHANGES b/CHANGES
@@ -5,6 +5,8 @@ pint-pandas Changelog
 ----------------
 
 <<<<<<< HEAD
+- Support for Pandas version 2.1.0. #196
+- Support for dtype-preserving `PintArray.map` for both Pandas 2.0.2 and Pandas 2.1. #196
 - Support for <NA> values in columns with integer magnitudes
 - Support for magnitudes of any type, such as complex128 or tuples #146
 - Support for pandas 2.0, allowing `.cumsum, .cummax, .cummin` methods for `Series` and `DataFrame`. #186

diff --git a/pint_pandas/pint_array.py b/pint_pandas/pint_array.py
@@ -7,7 +7,7 @@
 import numpy as np
 import pandas as pd
 import pint
-from pandas import DataFrame, Series
+from pandas import DataFrame, Series, Index
 from pandas.api.extensions import (
  ExtensionArray,
  ExtensionDtype,
@@ -71,7 +71,7 @@ def __new__(cls, units=None):
  if not isinstance(units, _Unit):
  units = cls._parse_dtype_strict(units)
  # ureg.unit returns a quantity with a magnitude of 1
- # eg 1 mm. Initialising a quantity and taking it's unit
+ # eg 1 mm. Initialising a quantity and taking its unit
  # TODO: Separate units from quantities in pint
  # to simplify this bit
  units = cls.ureg.Quantity(1, units).units
@@ -262,7 +262,6 @@ def __init__(self, values, dtype=None, copy=False):
  copy = False
  elif not isinstance(values, pd.core.arrays.numeric.NumericArray):
  values = pd.array(values, copy=copy)
- copy = False
  if copy:
  values = values.copy()
  self._data = values
@@ -323,10 +322,14 @@ def __setitem__(self, key, value):
 
  if isinstance(value, _Quantity):
  value = value.to(self.units).magnitude
- elif is_list_like(value) and len(value) > 0 and isinstance(value[0], _Quantity):
- value = [item.to(self.units).magnitude for item in value]
+ elif is_list_like(value) and len(value) > 0:
+ if isinstance(value[0], _Quantity):
+ value = [item.to(self.units).magnitude for item in value]
+ if len(value) == 1:
+ value = value[0]
 
  key = check_array_indexer(self, key)
+ # Filter out invalid values for our array type(s)
  try:
  self._data[key] = value
  except IndexError as e:
@@ -483,7 +486,10 @@ def take(self, indices, allow_fill=False, fill_value=None):
  # magnitude is in fact an array scalar, which will get rejected by pandas.
  fill_value = fill_value[()]
 
- result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
+ with warnings.catch_warnings():
+ warnings.simplefilter("ignore")
+ # Turn off warning that PandasArray is deprecated for ``take``
+ result = take(data, indices, fill_value=fill_value, allow_fill=allow_fill)
 
  return PintArray(result, dtype=self.dtype)
 
@@ -525,18 +531,12 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
  raise ValueError(
  "Cannot infer dtype. No dtype specified and empty array"
  )
- if dtype is None and not isinstance(master_scalar, _Quantity):
- raise ValueError("No dtype specified and not a sequence of quantities")
- if dtype is None and isinstance(master_scalar, _Quantity):
+ if dtype is None:
+ if not isinstance(master_scalar, _Quantity):
+  raise ValueError("No dtype specified and not a sequence of quantities")
  dtype = PintType(master_scalar.units)
 
- def quantify_nan(item):
- if type(item) is float:
- return item * dtype.units
- return item
-
  if isinstance(master_scalar, _Quantity):
- scalars = [quantify_nan(item) for item in scalars]
  scalars = [
  (item.to(dtype.units).magnitude if hasattr(item, "to") else item)
  for item in scalars
@@ -551,10 +551,21 @@ def _from_sequence_of_strings(cls, scalars, dtype=None, copy=False):
 
  @classmethod
  def _from_factorized(cls, values, original):
+ from pandas.api.types import infer_dtype
+
+ if infer_dtype(values) != "object":
+ values = pd.array(values, copy=False)
  return cls(values, dtype=original.dtype)
 
  def _values_for_factorize(self):
- return self._data._values_for_factorize()
+ # factorize can now handle differentiating various types of null values.
+ # These can only occur when the array has object dtype.
+ # However, for backwards compatibility we only use the null for the
+ # provided dtype. This may be revisited in the future, see GH#48476.
+ arr = self._data
+ if arr.dtype.kind == "O":
+ return np.array(arr, copy=False), self.dtype.na_value
+ return arr._values_for_factorize()
 
  def value_counts(self, dropna=True):
  """
@@ -580,18 +591,19 @@ def value_counts(self, dropna=True):
 
  # compute counts on the data with no nans
  data = self._data
- nafilt = np.isnan(data)
+ nafilt = pd.isna(data)
+ na_value = pd.NA # NA value for index, not data, so not quantified
  data = data[~nafilt]
+ index = list(set(data))
 
  data_list = data.tolist()
- index = list(set(data))
  array = [data_list.count(item) for item in index]
 
  if not dropna:
- index.append(np.nan)
+ index.append(na_value)
  array.append(nafilt.sum())
 
- return Series(array, index=index)
+ return Series(np.asarray(array), index=index)
 
  def unique(self):
  """Compute the PintArray of unique values.
@@ -602,7 +614,8 @@ def unique(self):
  """
  from pandas import unique
 
- return self._from_sequence(unique(self._data), dtype=self.dtype)
+ data = self._data
+ return self._from_sequence(unique(data), dtype=self.dtype)
 
  def __contains__(self, item) -> bool:
  if not isinstance(item, _Quantity):
@@ -704,7 +717,7 @@ def convert_values(param):
  else:
  return param
 
- if isinstance(other, (Series, DataFrame)):
+ if isinstance(other, (Series, DataFrame, Index)):
  return NotImplemented
  lvalues = self.quantity
  validate_length(lvalues, other)
@@ -753,7 +766,9 @@ def __array__(self, dtype=None, copy=False):
 
  def _to_array_of_quantity(self, copy=False):
  qtys = [
- self._Q(item, self._dtype.units) if not pd.isna(item) else item
+ self._Q(item, self._dtype.units)
+ if not pd.isna(item)
+ else self.dtype.na_value
  for item in self._data
  ]
  with warnings.catch_warnings(record=True):
@@ -811,7 +826,42 @@ def searchsorted(self, value, side="left", sorter=None):
  value = [item.to(self.units).magnitude for item in value]
  return arr.searchsorted(value, side=side, sorter=sorter)
 
- def _reduce(self, name, **kwds):
+ def map(self, mapper, na_action=None):
+ """
+ Map values using an input mapping or function.
+
+ Parameters
+ ----------
+ mapper : function, dict, or Series
+ Mapping correspondence.
+ na_action : {None, 'ignore'}, default None
+ If 'ignore', propagate NA values, without passing them to the
+ mapping correspondence. If 'ignore' is not supported, a
+ ``NotImplementedError`` should be raised.
+
+ Returns
+ -------
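+ PintArray or array-like
+ The result of applying ``mapper`` to each element of the array. If at
+ least one mapped value has a ``units`` attribute, a ``PintArray`` is
+ built using the units of the first such value; otherwise the mapped
+ values are returned as-is.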
+
+ """
+ if pandas_version_info < (2, 1):
+ ser = pd.Series(self._to_array_of_quantity())
+ arr = ser.map(mapper, na_action).values
+ else:
+ from pandas.core.algorithms import map_array
+
+ arr = map_array(self, mapper, na_action)
+
+ master_scalar = None
+ try:
+ master_scalar = next(i for i in arr if hasattr(i, "units"))
+ except StopIteration:
+ # A mapper that yields no quantities (e.g. a JSON formatter that renders
+ # quantities as str) does not produce a PintArray, and that's OK: the
+ # caller simply gets the array of mapped values.
+ return arr
+ return PintArray._from_sequence(arr, PintType(master_scalar.units))
+
+ def _reduce(self, name, *, skipna: bool = True, keepdims: bool = False, **kwds):
  """
  Return a scalar result of performing the reduction operation.
 
@@ -855,14 +905,20 @@ def _reduce(self, name, **kwds):
 
  if isinstance(self._data, ExtensionArray):
  try:
- result = self._data._reduce(name, **kwds)
+ result = self._data._reduce(
+ name, skipna=skipna, keepdims=keepdims, **kwds
+ )
  except NotImplementedError:
  result = functions[name](self.numpy_data, **kwds)
 
  if name in {"all", "any", "kurt", "skew"}:
  return result
  if name == "var":
+ if keepdims:
+ return PintArray(result, f"pint[({self.units})**2]")
  return self._Q(result, self.units**2)
+ if keepdims:
+ return PintArray(result, self.dtype)
  return self._Q(result, self.units)
 
  def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
@@ -879,7 +935,6 @@ def _accumulate(self, name: str, *, skipna: bool = True, **kwds):
  result = self._data._accumulate(name, **kwds)
  except NotImplementedError:
  result = functions[name](self.numpy_data, **kwds)
- print(result)
 
  return self._from_sequence(result, self.units)
 

diff --git a/pint_pandas/testsuite/test_issues.py b/pint_pandas/testsuite/test_issues.py
@@ -3,12 +3,14 @@
 
 import numpy as np
 import pandas as pd
+import pandas._testing as tm
 import pytest
 import pint
 from pandas.tests.extension.base.base import BaseExtensionTests
 from pint.testsuite import helpers
 
 from pint_pandas import PintArray, PintType
+from pint_pandas.pint_array import pandas_version_info
 
 ureg = PintType.ureg
 
@@ -41,7 +43,7 @@ def test_force_ndarray_like(self):
  expected = pd.DataFrame(
  {0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
  )
- self.assert_equal(result, expected)
+ tm.assert_equal(result, expected)
 
  finally:
  # restore registry
@@ -64,7 +66,7 @@ def test_offset_concat(self):
  expected = pd.DataFrame(
  {0: PintArray(q_a_), 1: PintArray(q_b)}, dtype="pint[degC]"
  )
- self.assert_equal(result, expected)
+ tm.assert_equal(result, expected)
 
  # issue #141
  print(PintArray(q_a))
@@ -80,7 +82,7 @@ def test_assignment_add_empty(self):
  result = pd.Series(data)
  result[[]] += data[0]
  expected = pd.Series(data)
- self.assert_series_equal(result, expected)
+ tm.assert_series_equal(result, expected)
 
 
 class TestIssue80:
@@ -167,3 +169,19 @@ def test_issue_127():
  a = PintType.construct_from_string("pint[dimensionless]")
  b = PintType.construct_from_string("pint[]")
  assert a == b
+
+
+class TestIssue174(BaseExtensionTests):
+ def test_sum(self):
+ if pandas_version_info < (2, 1):
+ pytest.skip("Pandas reduce functions strip units prior to version 2.1.0")
+ a = pd.DataFrame([[0, 1, 2], [3, 4, 5]]).astype("pint[m]")
+ row_sum = a.sum(axis=0)
+ expected_1 = pd.Series([3, 5, 7], dtype="pint[m]")
+
+ tm.assert_series_equal(row_sum, expected_1)
+
+ col_sum = a.sum(axis=1)
+ expected_2 = pd.Series([3, 12], dtype="pint[m]")
+
+ tm.assert_series_equal(col_sum, expected_2)