@@ -556,7 +556,8 @@ class _GroupBy(PandasObject, SelectionMixin):
556556
557557 def __init__ (self , obj , keys = None , axis = 0 , level = None ,
558558 grouper = None , exclusions = None , selection = None , as_index = True ,
559- sort = True , group_keys = True , squeeze = False , ** kwargs ):
559+ sort = True , group_keys = True , squeeze = False ,
560+ observed = None , ** kwargs ):
560561
561562 self ._selection = selection
562563
@@ -576,13 +577,15 @@ def __init__(self, obj, keys=None, axis=0, level=None,
576577 self .sort = sort
577578 self .group_keys = group_keys
578579 self .squeeze = squeeze
580+ self .observed = observed
579581 self .mutated = kwargs .pop ('mutated' , False )
580582
581583 if grouper is None :
582584 grouper , exclusions , obj = _get_grouper (obj , keys ,
583585 axis = axis ,
584586 level = level ,
585587 sort = sort ,
588+ observed = observed ,
586589 mutated = self .mutated )
587590
588591 self .obj = obj
@@ -2331,18 +2334,21 @@ def ngroups(self):
23312334 def recons_labels (self ):
23322335 comp_ids , obs_ids , _ = self .group_info
23332336 labels = (ping .labels for ping in self .groupings )
2334- return decons_obs_group_ids (comp_ids ,
2335- obs_ids , self .shape , labels , xnull = True )
2337+ return decons_obs_group_ids (
2338+ comp_ids , obs_ids , self .shape , labels , xnull = True )
23362339
23372340 @cache_readonly
23382341 def result_index (self ):
23392342 if not self .compressed and len (self .groupings ) == 1 :
23402343 return self .groupings [0 ].group_index .rename (self .names [0 ])
23412344
2342- return MultiIndex (levels = [ping .group_index for ping in self .groupings ],
2343- labels = self .recons_labels ,
2344- verify_integrity = False ,
2345- names = self .names )
2345+ labels = self .recons_labels
2346+ levels = [ping .group_index for ping in self .groupings ]
2347+ result = MultiIndex (levels = levels ,
2348+ labels = labels ,
2349+ verify_integrity = False ,
2350+ names = self .names )
2351+ return result
23462352
23472353 def get_group_levels (self ):
23482354 if not self .compressed and len (self .groupings ) == 1 :
@@ -2883,6 +2889,7 @@ class Grouping(object):
28832889 obj :
28842890 name :
28852891 level :
2892+ observed : If we are a Categorical, use the observed values
28862893 in_axis : if the Grouping is a column in self.obj and hence among
28872894 Groupby.exclusions list
28882895
@@ -2898,14 +2905,15 @@ class Grouping(object):
28982905 """
28992906
29002907 def __init__ (self , index , grouper = None , obj = None , name = None , level = None ,
2901- sort = True , in_axis = False ):
2908+ sort = True , observed = None , in_axis = False ):
29022909
29032910 self .name = name
29042911 self .level = level
29052912 self .grouper = _convert_grouper (index , grouper )
29062913 self .index = index
29072914 self .sort = sort
29082915 self .obj = obj
2916+ self .observed = observed
29092917 self .in_axis = in_axis
29102918
29112919 # right place for this?
@@ -2954,16 +2962,34 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None,
29542962 elif is_categorical_dtype (self .grouper ):
29552963
29562964 self .grouper = self .grouper ._codes_for_groupby (self .sort )
2965+ codes = self .grouper .codes
2966+ categories = self .grouper .categories
29572967
29582968 # we make a CategoricalIndex out of the cat grouper
29592969 # preserving the categories / ordered attributes
2960- self ._labels = self .grouper .codes
2970+ self ._labels = codes
2971+
2972+ # Use the observed values of the grouper if indicated
2973+ observed = self .observed
2974+ if observed is None :
2975+ msg = ("pass observed=True to ensure that a "
2976+ "categorical grouper only returns the "
2977+ "observed groupers, or\n "
2978+ "observed=False to return NA for non-observed"
2979+ "values\n " )
2980+ warnings .warn (msg , FutureWarning , stacklevel = 5 )
2981+ observed = False
2982+
2983+ if observed :
2984+ codes = algorithms .unique1d (codes )
2985+ else :
2986+ codes = np .arange (len (categories ))
29612987
2962- c = self .grouper .categories
29632988 self ._group_index = CategoricalIndex (
2964- Categorical .from_codes (np .arange (len (c )),
2965- categories = c ,
2966- ordered = self .grouper .ordered ))
2989+ Categorical .from_codes (
2990+ codes = codes ,
2991+ categories = categories ,
2992+ ordered = self .grouper .ordered ))
29672993
29682994 # we are done
29692995 if isinstance (self .grouper , Grouping ):
@@ -3048,7 +3074,7 @@ def groups(self):
30483074
30493075
30503076def _get_grouper (obj , key = None , axis = 0 , level = None , sort = True ,
3051- mutated = False , validate = True ):
3077+ observed = None , mutated = False , validate = True ):
30523078 """
30533079 create and return a BaseGrouper, which is an internal
30543080 mapping of how to create the grouper indexers.
@@ -3065,6 +3091,9 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
30653091 are and then creates a Grouping for each one, combined into
30663092 a BaseGrouper.
30673093
3094+ If observed & we have a categorical grouper, only show the observed
3095+ values
3096+
30683097 If validate, then check for key/level overlaps
30693098
30703099 """
@@ -3243,6 +3272,7 @@ def is_in_obj(gpr):
32433272 name = name ,
32443273 level = level ,
32453274 sort = sort ,
3275+ observed = observed ,
32463276 in_axis = in_axis ) \
32473277 if not isinstance (gpr , Grouping ) else gpr
32483278
@@ -4154,7 +4184,7 @@ def first_not_none(values):
41544184 not_indexed_same = not_indexed_same )
41554185 elif self .grouper .groupings is not None :
41564186 if len (self .grouper .groupings ) > 1 :
4157- key_index = MultiIndex . from_tuples ( keys , names = key_names )
4187+ key_index = self . grouper . result_index
41584188
41594189 else :
41604190 ping = self .grouper .groupings [0 ]
@@ -4244,8 +4274,9 @@ def first_not_none(values):
42444274
42454275 # normally use vstack as its faster than concat
42464276 # and if we have mi-columns
4247- if isinstance (v .index ,
4248- MultiIndex ) or key_index is None :
4277+ if (isinstance (v .index , MultiIndex ) or
4278+ key_index is None or
4279+ isinstance (key_index , MultiIndex )):
42494280 stacked_values = np .vstack (map (np .asarray , values ))
42504281 result = DataFrame (stacked_values , index = key_index ,
42514282 columns = index )
@@ -4696,6 +4727,14 @@ def _reindex_output(self, result):
46964727
46974728 This can re-expand the output space
46984729 """
4730+
4731+ # TODO(jreback): remove completely
4732+ # when observed parameter is defaulted to True
4733+ # gh-20583
4734+
4735+ if self .observed :
4736+ return result
4737+
46994738 groupings = self .grouper .groupings
47004739 if groupings is None :
47014740 return result
0 commit comments