Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
a = pd.DataFrame({'k': pd.Series([1,1,2,pd.NA]).astype(pd.Int64Dtype())})
a.groupby(by='k', dropna=False, group_keys=False).apply(lambda x: x)
Issue Description
After updating to pandas 1.5, a number of groupby operations I perform on pandas
nullable integer dtypes (e.g. Int64) are broken. In the example above, I receive an
IndexError with the stack trace below. It works as expected if I roll back to pandas 1.4.4.
IndexError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_17940/849178950.py in <module>
1 import pandas as pd
2 a = pd.DataFrame({'k': pd.Series([1,1,2,pd.NA]).astype(pd.Int64Dtype())})
----> 3 a.groupby(by='k', dropna=False, group_keys=False).apply(lambda x: x)
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\groupby.py in apply(self, func, *args, **kwargs)
1556 with option_context("mode.chained_assignment", None):
1557 try:
-> 1558 result = self._python_apply_general(f, self._selected_obj)
1559 except TypeError:
1560 # gh-20949
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\groupby.py in _python_apply_general(self, f, data, not_indexed_same, is_transform, is_agg)
1608 data after applying f
1609 """
-> 1610 values, mutated = self.grouper.apply(f, data, self.axis)
1611 if not_indexed_same is None:
1612 not_indexed_same = mutated or self.mutated
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\ops.py in apply(self, f, data, axis)
825 ) -> tuple[list, bool]:
826 mutated = self.mutated
--> 827 splitter = self._get_splitter(data, axis=axis)
828 group_keys = self.group_keys_seq
829 result_values = []
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\ops.py in _get_splitter(self, data, axis)
797 Generator yielding subsetted objects
798 """
--> 799 ids, _, ngroups = self.group_info
800 return get_splitter(data, ids, ngroups, axis=axis)
801
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\ops.py in group_info(self)
945 @cache_readonly
946 def group_info(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
--> 947 comp_ids, obs_group_ids = self._get_compressed_codes()
948
949 ngroups = len(obs_group_ids)
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\ops.py in _get_compressed_codes(self)
976
977 ping = self.groupings[0]
--> 978 return ping.codes, np.arange(len(ping.group_index), dtype=np.intp)
979
980 @final
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\grouper.py in codes(self)
620 return self._codes
621
--> 622 return self._codes_and_uniques[0]
623
624 @cache_readonly
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\_libs\properties.pyx in pandas._libs.properties.CachedProperty.__get__()
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\groupby\grouper.py in _codes_and_uniques(self)
691 # error: Incompatible types in assignment (expression has type "Union[
692 # ndarray[Any, Any], Index]", variable has type "Categorical")
--> 693 codes, uniques = algorithms.factorize( # type: ignore[assignment]
694 self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
695 )
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\algorithms.py in factorize(values, sort, na_sentinel, use_na_sentinel, size_hint)
790 # Avoid using catch_warnings when possible
791 # GH#46910 - TimelikeOps has deprecated signature
--> 792 codes, uniques = values.factorize( # type: ignore[call-arg]
793 use_na_sentinel=na_sentinel is not None
794 )
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pandas\core\arrays\masked.py in factorize(self, na_sentinel, use_na_sentinel)
916 codes[codes == -1] = na_code
917 # dummy value for uniques; not used since uniques_mask will be True
--> 918 uniques = np.insert(uniques, na_code, 0)
919 uniques_mask[na_code] = True
920 uniques_ea = type(self)(uniques, uniques_mask)
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\numpy\core\overrides.py in insert(*args, **kwargs)
~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\numpy\lib\function_base.py in insert(arr, obj, values, axis)
5278 index = indices.item()
5279 if index < -N or index > N:
-> 5280 raise IndexError(f"index {obj} is out of bounds for axis {axis} "
5281 f"with size {N}")
5282 if (index < 0):
IndexError: index 3 is out of bounds for axis 0 with size 2
Expected Behavior
I expect the following output, with dtype Int64:
k
0 1
1 1
2 2
3 <NA>
The following code is not equivalent, because the output column has dtype object:
import pandas as pd
a = pd.DataFrame({'k': pd.Series([1,1,2,pd.NA])})
a.groupby(by='k', dropna=False, group_keys=False).apply(lambda x: x)
I also think it's odd that, when printed, the output of the snippet above shows the NA
value as NaN, but if I check its type explicitly it is pandas._libs.missing.NAType.
Installed Versions
pandas : 1.5.0
numpy : 1.22.3
pytz : 2021.3
dateutil : 2.8.2
setuptools : 58.1.0
pip : 22.2.2
Cython : None
pytest : 7.1.2
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.0.3
IPython : 7.30.1
pandas_datareader: None
bs4 : 4.11.1
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
matplotlib : None
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : None
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : None
snappy : None
sqlalchemy : 1.4.28
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
zstandard : None
tzdata : 2021.5