15
15
import pandas .core .common as com
16
16
from pandas .util .decorators import cache_readonly
17
17
18
- from pandas .core .common import isnull
18
+ from pandas .core .common import (CategoricalDtype , ABCSeries , isnull , notnull ,
19
+ is_categorical_dtype , is_integer_dtype , is_object_dtype ,
20
+ _possibly_infer_to_datetimelike , get_dtype_kinds ,
21
+ is_list_like , _is_sequence ,
22
+ _ensure_platform_int , _ensure_object , _ensure_int64 ,
23
+ _coerce_indexer_dtype , _values_from_object , take_1d )
19
24
from pandas .util .terminal import get_terminal_size
20
25
from pandas .core .config import get_option
21
26
from pandas .core import format as fmt
@@ -69,11 +74,11 @@ def f(self, other):
69
74
70
75
def _is_categorical(array):
    """
    Report whether `array` is categorical.

    Returns True when `array` is a Categorical instance, or when its
    ``dtype`` attribute is a CategoricalDtype instance.
    """
    # a Categorical qualifies directly; the dtype is not consulted for it
    if isinstance(array, Categorical):
        return True
    return isinstance(array.dtype, CategoricalDtype)
73
78
74
79
def _maybe_to_categorical(array):
    """
    Unwrap a Series to its underlying values.

    When `array` is an ABCSeries its ``.values`` is returned; any other
    input is handed back unchanged.
    """
    return array.values if isinstance(array, ABCSeries) else array
79
84
@@ -175,7 +180,7 @@ class Categorical(PandasObject):
175
180
>>> a.min()
176
181
'c'
177
182
"""
178
- dtype = com . CategoricalDtype ()
183
+ dtype = CategoricalDtype ()
179
184
"""The dtype (always "category")"""
180
185
181
186
ordered = None
@@ -203,7 +208,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
203
208
204
209
if fastpath :
205
210
# fast path
206
- self ._codes = com . _coerce_indexer_dtype (values , categories )
211
+ self ._codes = _coerce_indexer_dtype (values , categories )
207
212
self .name = name
208
213
self .categories = categories
209
214
self .ordered = ordered
@@ -223,11 +228,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
223
228
"use only 'categories'" )
224
229
225
230
# sanitize input
226
- if com . is_categorical_dtype (values ):
231
+ if is_categorical_dtype (values ):
227
232
228
233
# we are either a Series or a Categorical
229
234
cat = values
230
- if isinstance (values , com . ABCSeries ):
235
+ if isinstance (values , ABCSeries ):
231
236
cat = values .values
232
237
if categories is None :
233
238
categories = cat .categories
@@ -244,7 +249,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
244
249
# which is fine, but since factorize does this correctly no need here
245
250
# this is an issue because _sanitize_array also coerces np.nan to a string
246
251
# under certain versions of numpy as well
247
- values = com . _possibly_infer_to_datetimelike (values , convert_dates = True )
252
+ values = _possibly_infer_to_datetimelike (values , convert_dates = True )
248
253
if not isinstance (values , np .ndarray ):
249
254
values = _convert_to_list_like (values )
250
255
from pandas .core .series import _sanitize_array
@@ -286,11 +291,11 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
286
291
codes = _get_codes_for_values (values , categories )
287
292
288
293
# TODO: check for old style usage. These warnings should be removed after 0.18 / in 2016
289
- if com . is_integer_dtype (values ) and not com . is_integer_dtype (categories ):
294
+ if is_integer_dtype (values ) and not is_integer_dtype (categories ):
290
295
warn ("Values and categories have different dtypes. Did you mean to use\n "
291
296
"'Categorical.from_codes(codes, categories)'?" , RuntimeWarning )
292
297
293
- if com . is_integer_dtype (values ) and (codes == - 1 ).all ():
298
+ if is_integer_dtype (values ) and (codes == - 1 ).all ():
294
299
warn ("None of the categories were found in values. Did you mean to use\n "
295
300
"'Categorical.from_codes(codes, categories)'?" , RuntimeWarning )
296
301
@@ -302,7 +307,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa
302
307
self .ordered = False if ordered is None else ordered
303
308
self .categories = categories
304
309
self .name = name
305
- self ._codes = com . _coerce_indexer_dtype (codes , categories )
310
+ self ._codes = _coerce_indexer_dtype (codes , categories )
306
311
307
312
def copy (self ):
308
313
""" Copy constructor. """
@@ -409,7 +414,7 @@ def _validate_categories(cls, categories):
409
414
# on categories with NaNs, int values would be converted to float.
410
415
# Use "object" dtype to prevent this.
411
416
if isnull (categories ).any ():
412
- without_na = np .array ([x for x in categories if com . notnull (x )])
417
+ without_na = np .array ([x for x in categories if notnull (x )])
413
418
with_na = np .array (categories )
414
419
if with_na .dtype != without_na .dtype :
415
420
dtype = "object"
@@ -617,7 +622,7 @@ def add_categories(self, new_categories, inplace=False):
617
622
remove_unused_categories
618
623
set_categories
619
624
"""
620
- if not com . is_list_like (new_categories ):
625
+ if not is_list_like (new_categories ):
621
626
new_categories = [new_categories ]
622
627
already_included = set (new_categories ) & set (self ._categories )
623
628
if len (already_included ) != 0 :
@@ -627,7 +632,7 @@ def add_categories(self, new_categories, inplace=False):
627
632
new_categories = self ._validate_categories (new_categories )
628
633
cat = self if inplace else self .copy ()
629
634
cat ._categories = new_categories
630
- cat ._codes = com . _coerce_indexer_dtype (cat ._codes , new_categories )
635
+ cat ._codes = _coerce_indexer_dtype (cat ._codes , new_categories )
631
636
if not inplace :
632
637
return cat
633
638
@@ -662,7 +667,7 @@ def remove_categories(self, removals, inplace=False):
662
667
remove_unused_categories
663
668
set_categories
664
669
"""
665
- if not com . is_list_like (removals ):
670
+ if not is_list_like (removals ):
666
671
removals = [removals ]
667
672
removals = set (list (removals ))
668
673
not_included = removals - set (self ._categories )
@@ -696,7 +701,7 @@ def remove_unused_categories(self, inplace=False):
696
701
"""
697
702
cat = self if inplace else self .copy ()
698
703
_used = sorted (np .unique (cat ._codes ))
699
- new_categories = cat .categories .take (com . _ensure_platform_int (_used ))
704
+ new_categories = cat .categories .take (_ensure_platform_int (_used ))
700
705
new_categories = _ensure_index (new_categories )
701
706
cat ._codes = _get_codes_for_values (cat .__array__ (), new_categories )
702
707
cat ._categories = new_categories
@@ -734,7 +739,7 @@ def __array__(self, dtype=None):
734
739
A numpy array of either the specified dtype or, if dtype==None (default), the same
735
740
dtype as categorical.categories.dtype
736
741
"""
737
- ret = com . take_1d (self .categories .values , self ._codes )
742
+ ret = take_1d (self .categories .values , self ._codes )
738
743
if dtype and dtype != self .categories .dtype :
739
744
return np .asarray (ret , dtype )
740
745
return ret
@@ -822,8 +827,8 @@ def get_values(self):
822
827
823
828
# if we are a period index, return a string repr
824
829
if isinstance (self .categories , PeriodIndex ):
825
- return com . take_1d (np .array (self .categories .to_native_types (), dtype = object ),
826
- self ._codes )
830
+ return take_1d (np .array (self .categories .to_native_types (), dtype = object ),
831
+ self ._codes )
827
832
828
833
return np .array (self )
829
834
@@ -1010,7 +1015,7 @@ def fillna(self, fill_value=None, method=None, limit=None, **kwargs):
1010
1015
1011
1016
else :
1012
1017
1013
- if not com . isnull (fill_value ) and fill_value not in self .categories :
1018
+ if not isnull (fill_value ) and fill_value not in self .categories :
1014
1019
raise ValueError ("fill value must be in categories" )
1015
1020
1016
1021
mask = values == - 1
@@ -1031,7 +1036,7 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None):
1031
1036
# but is passed thru internally
1032
1037
assert isnull (fill_value )
1033
1038
1034
- codes = com . take_1d (self ._codes , indexer , allow_fill = True , fill_value = - 1 )
1039
+ codes = take_1d (self ._codes , indexer , allow_fill = True , fill_value = - 1 )
1035
1040
result = Categorical (codes , categories = self .categories , ordered = self .ordered ,
1036
1041
name = self .name , fastpath = True )
1037
1042
return result
@@ -1178,7 +1183,7 @@ def __setitem__(self, key, value):
1178
1183
raise ValueError ("Cannot set a Categorical with another, without identical "
1179
1184
"categories" )
1180
1185
1181
- rvalue = value if com . is_list_like (value ) else [value ]
1186
+ rvalue = value if is_list_like (value ) else [value ]
1182
1187
to_add = Index (rvalue ).difference (self .categories )
1183
1188
# no assignments of values not in categories, but it's always ok to set something to np.nan
1184
1189
if len (to_add ) and not isnull (to_add ).all ():
@@ -1221,7 +1226,7 @@ def __setitem__(self, key, value):
1221
1226
# float categories do currently return -1 for np.nan, even if np.nan is included in the
1222
1227
# index -> "repair" this here
1223
1228
if isnull (rvalue ).any () and isnull (self .categories ).any ():
1224
- nan_pos = np .where (com . isnull (self .categories ))[0 ]
1229
+ nan_pos = np .where (isnull (self .categories ))[0 ]
1225
1230
lindexer [lindexer == - 1 ] = nan_pos
1226
1231
1227
1232
key = self ._maybe_coerce_indexer (key )
@@ -1304,7 +1309,7 @@ def mode(self):
1304
1309
1305
1310
import pandas .hashtable as htable
1306
1311
good = self ._codes != - 1
1307
- result = Categorical (sorted (htable .mode_int64 (com . _ensure_int64 (self ._codes [good ]))),
1312
+ result = Categorical (sorted (htable .mode_int64 (_ensure_int64 (self ._codes [good ]))),
1308
1313
categories = self .categories ,ordered = self .ordered , name = self .name ,
1309
1314
fastpath = True )
1310
1315
return result
@@ -1373,9 +1378,9 @@ def describe(self):
1373
1378
categories = np .arange (0 ,len (self .categories )+ 1 ,dtype = object )
1374
1379
categories [:- 1 ] = self .categories
1375
1380
categories [- 1 ] = np .nan
1376
- result .index = categories .take (com . _ensure_platform_int (result .index ))
1381
+ result .index = categories .take (_ensure_platform_int (result .index ))
1377
1382
else :
1378
- result .index = self .categories .take (com . _ensure_platform_int (result .index ))
1383
+ result .index = self .categories .take (_ensure_platform_int (result .index ))
1379
1384
result = result .reindex (self .categories )
1380
1385
result .index .name = 'categories'
1381
1386
@@ -1447,23 +1452,72 @@ def _get_codes_for_values(values, categories):
1447
1452
1448
1453
from pandas .core .algorithms import _get_data_algo , _hashtables
1449
1454
if values .dtype != categories .dtype :
1450
- values = com . _ensure_object (values )
1451
- categories = com . _ensure_object (categories )
1455
+ values = _ensure_object (values )
1456
+ categories = _ensure_object (categories )
1452
1457
(hash_klass , vec_klass ), vals = _get_data_algo (values , _hashtables )
1453
1458
t = hash_klass (len (categories ))
1454
- t .map_locations (com . _values_from_object (categories ))
1455
- return com . _coerce_indexer_dtype (t .lookup (values ), categories )
1459
+ t .map_locations (_values_from_object (categories ))
1460
+ return _coerce_indexer_dtype (t .lookup (values ), categories )
1456
1461
1457
1462
def _convert_to_list_like (list_like ):
1458
1463
if hasattr (list_like , "dtype" ):
1459
1464
return list_like
1460
1465
if isinstance (list_like , list ):
1461
1466
return list_like
1462
- if (com . _is_sequence (list_like ) or isinstance (list_like , tuple )
1463
- or isinstance (list_like , types .GeneratorType )):
1467
+ if (_is_sequence (list_like ) or isinstance (list_like , tuple )
1468
+ or isinstance (list_like , types .GeneratorType )):
1464
1469
return list (list_like )
1465
1470
elif np .isscalar (list_like ):
1466
1471
return [list_like ]
1467
1472
else :
1468
1473
# is this reached?
1469
1474
return [list_like ]
1475
+
1476
def _concat_compat(to_concat, axis=0):
    """
    provide concatenation of an object/categorical array of arrays each of
    which is a single dtype

    Parameters
    ----------
    to_concat : array of arrays
    axis : axis to provide concatenation

    Returns
    -------
    a single array, preserving the combined dtypes.

    When the dtype kinds reported by ``get_dtype_kinds`` are limited to
    'object' and 'category', the result is a Categorical built from the
    concatenated values and the shared categories. Otherwise every input is
    cast to object dtype and passed to the generic
    ``pandas.core.common._concat_compat``.

    Raises
    ------
    ValueError
        If the categorical inputs do not all have equal categories.
    """

    def convert_categorical(x):
        # coerce to object dtype
        if is_categorical_dtype(x.dtype):
            return x.get_values()
        return x.ravel()

    typs = get_dtype_kinds(to_concat)
    if len(typs - set(['object', 'category'])):
        # some kind other than object/category is present:
        # convert to object type and perform a regular concat.
        # The generic helper is imported under an alias so that it does not
        # rebind this function's own name inside its body.
        from pandas.core.common import _concat_compat as _generic_concat_compat
        return _generic_concat_compat(
            [np.array(x, copy=False).astype('object') for x in to_concat],
            axis=axis)

    # we could have object blocks and categoricals here
    # if all categoricals share the same categories then combine everything
    # else it is a non-compat categorical
    categoricals = [x for x in to_concat if is_categorical_dtype(x.dtype)]

    # validate the categories: the first categorical seen sets the reference
    categories = None
    for x in categoricals:
        if categories is None:
            categories = x.categories
        if not categories.equals(x.categories):
            raise ValueError("incompatible categories in categorical concat")

    # concat them; `categories` stays None when no categorical was present
    values = np.concatenate([convert_categorical(x) for x in to_concat],
                            axis=axis)
    return Categorical(values, categories=categories)
0 commit comments