Skip to content

Commit cf56ff1

Browse files
committed
BUG: concat of series of dtype category converting to object dtype (GH8641)
1 parent 91e590f commit cf56ff1

File tree

12 files changed

+407
-134
lines changed

12 files changed

+407
-134
lines changed

doc/source/whatsnew/v0.15.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ users upgrade to this version.
2020
API changes
2121
~~~~~~~~~~~
2222

23+
- Bug in ``concat`` of Series with ``category`` dtype, which was coercing the result to ``object`` dtype. (:issue:`8641`)
2324

2425
.. _whatsnew_0152.enhancements:
2526

pandas/core/categorical.py

+86-32
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,12 @@
1515
import pandas.core.common as com
1616
from pandas.util.decorators import cache_readonly
1717

18-
from pandas.core.common import isnull
18+
from pandas.core.common import (CategoricalDtype, ABCSeries, isnull, notnull,
19+
is_categorical_dtype, is_integer_dtype, is_object_dtype,
20+
_possibly_infer_to_datetimelike, get_dtype_kinds,
21+
is_list_like, _is_sequence,
22+
_ensure_platform_int, _ensure_object, _ensure_int64,
23+
_coerce_indexer_dtype, _values_from_object, take_1d)
1924
from pandas.util.terminal import get_terminal_size
2025
from pandas.core.config import get_option
2126
from pandas.core import format as fmt
@@ -69,11 +74,11 @@ def f(self, other):
6974

7075
def _is_categorical(array):
7176
""" return if we are a categorical possibility """
72-
return isinstance(array, Categorical) or isinstance(array.dtype, com.CategoricalDtype)
77+
return isinstance(array, Categorical) or isinstance(array.dtype, CategoricalDtype)
7378

7479
def _maybe_to_categorical(array):
7580
""" coerce to a categorical if a series is given """
76-
if isinstance(array, com.ABCSeries):
81+
if isinstance(array, ABCSeries):
7782
return array.values
7883
return array
7984

@@ -175,7 +180,7 @@ class Categorical(PandasObject):
175180
>>> a.min()
176181
'c'
177182
"""
178-
dtype = com.CategoricalDtype()
183+
dtype = CategoricalDtype()
179184
"""The dtype (always "category")"""
180185

181186
ordered = None
@@ -203,7 +208,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
203208

204209
if fastpath:
205210
# fast path
206-
self._codes = com._coerce_indexer_dtype(values, categories)
211+
self._codes = _coerce_indexer_dtype(values, categories)
207212
self.name = name
208213
self.categories = categories
209214
self.ordered = ordered
@@ -223,11 +228,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
223228
"use only 'categories'")
224229

225230
# sanitize input
226-
if com.is_categorical_dtype(values):
231+
if is_categorical_dtype(values):
227232

228233
# we are either a Series or a Categorical
229234
cat = values
230-
if isinstance(values, com.ABCSeries):
235+
if isinstance(values, ABCSeries):
231236
cat = values.values
232237
if categories is None:
233238
categories = cat.categories
@@ -244,7 +249,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
244249
# which is fine, but since factorize does this correctly no need here
245250
# this is an issue because _sanitize_array also coerces np.nan to a string
246251
# under certain versions of numpy as well
247-
values = com._possibly_infer_to_datetimelike(values, convert_dates=True)
252+
values = _possibly_infer_to_datetimelike(values, convert_dates=True)
248253
if not isinstance(values, np.ndarray):
249254
values = _convert_to_list_like(values)
250255
from pandas.core.series import _sanitize_array
@@ -286,11 +291,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
286291
codes = _get_codes_for_values(values, categories)
287292

288293
# TODO: check for old style usage. These warnings should be removed after 0.18 / in 2016
289-
if com.is_integer_dtype(values) and not com.is_integer_dtype(categories):
294+
if is_integer_dtype(values) and not is_integer_dtype(categories):
290295
warn("Values and categories have different dtypes. Did you mean to use\n"
291296
"'Categorical.from_codes(codes, categories)'?", RuntimeWarning)
292297

293-
if com.is_integer_dtype(values) and (codes == -1).all():
298+
if is_integer_dtype(values) and (codes == -1).all():
294299
warn("None of the categories were found in values. Did you mean to use\n"
295300
"'Categorical.from_codes(codes, categories)'?", RuntimeWarning)
296301

@@ -302,7 +307,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
302307
self.ordered = False if ordered is None else ordered
303308
self.categories = categories
304309
self.name = name
305-
self._codes = com._coerce_indexer_dtype(codes, categories)
310+
self._codes = _coerce_indexer_dtype(codes, categories)
306311

307312
def copy(self):
308313
""" Copy constructor. """
@@ -409,7 +414,7 @@ def _validate_categories(cls, categories):
409414
# on categories with NaNs, int values would be converted to float.
410415
# Use "object" dtype to prevent this.
411416
if isnull(categories).any():
412-
without_na = np.array([x for x in categories if com.notnull(x)])
417+
without_na = np.array([x for x in categories if notnull(x)])
413418
with_na = np.array(categories)
414419
if with_na.dtype != without_na.dtype:
415420
dtype = "object"
@@ -617,7 +622,7 @@ def add_categories(self, new_categories, inplace=False):
617622
remove_unused_categories
618623
set_categories
619624
"""
620-
if not com.is_list_like(new_categories):
625+
if not is_list_like(new_categories):
621626
new_categories = [new_categories]
622627
already_included = set(new_categories) & set(self._categories)
623628
if len(already_included) != 0:
@@ -627,7 +632,7 @@ def add_categories(self, new_categories, inplace=False):
627632
new_categories = self._validate_categories(new_categories)
628633
cat = self if inplace else self.copy()
629634
cat._categories = new_categories
630-
cat._codes = com._coerce_indexer_dtype(cat._codes, new_categories)
635+
cat._codes = _coerce_indexer_dtype(cat._codes, new_categories)
631636
if not inplace:
632637
return cat
633638

@@ -662,7 +667,7 @@ def remove_categories(self, removals, inplace=False):
662667
remove_unused_categories
663668
set_categories
664669
"""
665-
if not com.is_list_like(removals):
670+
if not is_list_like(removals):
666671
removals = [removals]
667672
removals = set(list(removals))
668673
not_included = removals - set(self._categories)
@@ -696,7 +701,7 @@ def remove_unused_categories(self, inplace=False):
696701
"""
697702
cat = self if inplace else self.copy()
698703
_used = sorted(np.unique(cat._codes))
699-
new_categories = cat.categories.take(com._ensure_platform_int(_used))
704+
new_categories = cat.categories.take(_ensure_platform_int(_used))
700705
new_categories = _ensure_index(new_categories)
701706
cat._codes = _get_codes_for_values(cat.__array__(), new_categories)
702707
cat._categories = new_categories
@@ -734,7 +739,7 @@ def __array__(self, dtype=None):
734739
A numpy array of either the specified dtype or, if dtype==None (default), the same
735740
dtype as categorical.categories.dtype
736741
"""
737-
ret = com.take_1d(self.categories.values, self._codes)
742+
ret = take_1d(self.categories.values, self._codes)
738743
if dtype and dtype != self.categories.dtype:
739744
return np.asarray(ret, dtype)
740745
return ret
@@ -822,8 +827,8 @@ def get_values(self):
822827

823828
# if we are a period index, return a string repr
824829
if isinstance(self.categories, PeriodIndex):
825-
return com.take_1d(np.array(self.categories.to_native_types(), dtype=object),
826-
self._codes)
830+
return take_1d(np.array(self.categories.to_native_types(), dtype=object),
831+
self._codes)
827832

828833
return np.array(self)
829834

@@ -1010,7 +1015,7 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
10101015

10111016
else:
10121017

1013-
if not com.isnull(fill_value) and fill_value not in self.categories:
1018+
if not isnull(fill_value) and fill_value not in self.categories:
10141019
raise ValueError("fill value must be in categories")
10151020

10161021
mask = values==-1
@@ -1031,7 +1036,7 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None):
10311036
# but is passed thru internally
10321037
assert isnull(fill_value)
10331038

1034-
codes = com.take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
1039+
codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
10351040
result = Categorical(codes, categories=self.categories, ordered=self.ordered,
10361041
name=self.name, fastpath=True)
10371042
return result
@@ -1178,7 +1183,7 @@ def __setitem__(self, key, value):
11781183
raise ValueError("Cannot set a Categorical with another, without identical "
11791184
"categories")
11801185

1181-
rvalue = value if com.is_list_like(value) else [value]
1186+
rvalue = value if is_list_like(value) else [value]
11821187
to_add = Index(rvalue).difference(self.categories)
11831188
# no assignments of values not in categories, but it's always ok to set something to np.nan
11841189
if len(to_add) and not isnull(to_add).all():
@@ -1221,7 +1226,7 @@ def __setitem__(self, key, value):
12211226
# float categories do currently return -1 for np.nan, even if np.nan is included in the
12221227
# index -> "repair" this here
12231228
if isnull(rvalue).any() and isnull(self.categories).any():
1224-
nan_pos = np.where(com.isnull(self.categories))[0]
1229+
nan_pos = np.where(isnull(self.categories))[0]
12251230
lindexer[lindexer == -1] = nan_pos
12261231

12271232
key = self._maybe_coerce_indexer(key)
@@ -1304,7 +1309,7 @@ def mode(self):
13041309

13051310
import pandas.hashtable as htable
13061311
good = self._codes != -1
1307-
result = Categorical(sorted(htable.mode_int64(com._ensure_int64(self._codes[good]))),
1312+
result = Categorical(sorted(htable.mode_int64(_ensure_int64(self._codes[good]))),
13081313
categories=self.categories,ordered=self.ordered, name=self.name,
13091314
fastpath=True)
13101315
return result
@@ -1373,9 +1378,9 @@ def describe(self):
13731378
categories = np.arange(0,len(self.categories)+1 ,dtype=object)
13741379
categories[:-1] = self.categories
13751380
categories[-1] = np.nan
1376-
result.index = categories.take(com._ensure_platform_int(result.index))
1381+
result.index = categories.take(_ensure_platform_int(result.index))
13771382
else:
1378-
result.index = self.categories.take(com._ensure_platform_int(result.index))
1383+
result.index = self.categories.take(_ensure_platform_int(result.index))
13791384
result = result.reindex(self.categories)
13801385
result.index.name = 'categories'
13811386

@@ -1447,23 +1452,72 @@ def _get_codes_for_values(values, categories):
14471452

14481453
from pandas.core.algorithms import _get_data_algo, _hashtables
14491454
if values.dtype != categories.dtype:
1450-
values = com._ensure_object(values)
1451-
categories = com._ensure_object(categories)
1455+
values = _ensure_object(values)
1456+
categories = _ensure_object(categories)
14521457
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
14531458
t = hash_klass(len(categories))
1454-
t.map_locations(com._values_from_object(categories))
1455-
return com._coerce_indexer_dtype(t.lookup(values), categories)
1459+
t.map_locations(_values_from_object(categories))
1460+
return _coerce_indexer_dtype(t.lookup(values), categories)
14561461

14571462
def _convert_to_list_like(list_like):
14581463
if hasattr(list_like, "dtype"):
14591464
return list_like
14601465
if isinstance(list_like, list):
14611466
return list_like
1462-
if (com._is_sequence(list_like) or isinstance(list_like, tuple)
1463-
or isinstance(list_like, types.GeneratorType)):
1467+
if (_is_sequence(list_like) or isinstance(list_like, tuple)
1468+
or isinstance(list_like, types.GeneratorType)):
14641469
return list(list_like)
14651470
elif np.isscalar(list_like):
14661471
return [list_like]
14671472
else:
14681473
# is this reached?
14691474
return [list_like]
1475+
1476+
def _concat_compat(to_concat, axis=0):
    """
    provide concatenation of an object/categorical array of arrays each of which is a single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes; a Categorical when only
    object and category dtypes are present, otherwise the result of the
    regular (object dtype) concat

    Raises
    ------
    ValueError
        if the categoricals to combine do not all have equal categories
    """

    def convert_categorical(x):
        # coerce to object dtype
        if is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    typs = get_dtype_kinds(to_concat)
    if len(typs - set(['object', 'category'])):

        # we only can deal with object & category types here; anything else
        # is converted to object type and goes through a regular concat.
        # The import is aliased so it does not rebind this function's own name.
        # np.asarray avoids a copy when it can, but still allows one when the
        # conversion needs it.
        from pandas.core.common import _concat_compat as _generic_concat
        return _generic_concat([np.asarray(x).astype('object')
                                for x in to_concat], axis=axis)

    # we could have object blocks and categoricals here
    # if all categoricals share the same categories then combine everything
    # else it is a non-compat categorical
    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

    # validate the categories: every categorical must match the first one
    categories = None
    for x in categoricals:
        if categories is None:
            categories = x.categories
        if not categories.equals(x.categories):
            raise ValueError("incompatible categories in categorical concat")

    # concat them
    return Categorical(np.concatenate([convert_categorical(x) for x in to_concat],
                                      axis=axis),
                       categories=categories)

0 commit comments

Comments
 (0)