diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index 462ead70c9f93..abc9e58d7c435 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -13,33 +13,38 @@
 from pandas.core.missing import interpolate_2d
 from pandas.util.decorators import cache_readonly, deprecate_kwarg
-from pandas.core.common import (ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex,
-                                isnull, notnull, is_dtype_equal,
-                                is_categorical_dtype, is_integer_dtype, is_object_dtype,
-                                _possibly_infer_to_datetimelike, get_dtype_kinds,
-                                is_list_like, is_sequence, is_null_slice, is_bool,
-                                _ensure_platform_int, _ensure_object, _ensure_int64,
-                                _coerce_indexer_dtype, take_1d)
+from pandas.core.common import (
+    ABCSeries, ABCIndexClass, ABCPeriodIndex, ABCCategoricalIndex, isnull,
+    notnull, is_dtype_equal, is_categorical_dtype, is_integer_dtype,
+    is_object_dtype, _possibly_infer_to_datetimelike, get_dtype_kinds,
+    is_list_like, is_sequence, is_null_slice, is_bool, _ensure_platform_int,
+    _ensure_object, _ensure_int64, _coerce_indexer_dtype, take_1d)
 from pandas.core.dtypes import CategoricalDtype
 from pandas.util.terminal import get_terminal_size
 from pandas.core.config import get_option
 
+
 def _cat_compare_op(op):
     def f(self, other):
-        # On python2, you can usually compare any type to any type, and Categoricals can be
-        # seen as a custom type, but having different results depending whether categories are
-        # the same or not is kind of insane, so be a bit stricter here and use the python3 idea
-        # of comparing only things of equal type.
+        # On python2, you can usually compare any type to any type, and
+        # Categoricals can be seen as a custom type, but having different
+        # results depending on whether categories are the same or not is kind
+        # of insane, so be a bit stricter here and use the python3 idea of
+        # comparing only things of equal type.
         if not self.ordered:
-            if op in ['__lt__', '__gt__','__le__','__ge__']:
-                raise TypeError("Unordered Categoricals can only compare equality or not")
+            if op in ['__lt__', '__gt__', '__le__', '__ge__']:
+                raise TypeError("Unordered Categoricals can only compare "
+                                "equality or not")
         if isinstance(other, Categorical):
-            # Two Categoricals can only be be compared if the categories are the same
-            if (len(self.categories) != len(other.categories)) or \
-                not ((self.categories == other.categories).all()):
-                raise TypeError("Categoricals can only be compared if 'categories' are the same")
+            # Two Categoricals can only be compared if the categories are
+            # the same
+            if ((len(self.categories) != len(other.categories)) or
+                    not ((self.categories == other.categories).all())):
+                raise TypeError("Categoricals can only be compared if "
+                                "'categories' are the same")
             if not (self.ordered == other.ordered):
-                raise TypeError("Categoricals can only be compared if 'ordered' is the same")
+                raise TypeError("Categoricals can only be compared if "
+                                "'ordered' is the same")
             na_mask = (self._codes == -1) | (other._codes == -1)
             f = getattr(self._codes, op)
             ret = f(other._codes)
@@ -66,37 +71,40 @@ def f(self, other):
                 elif op == '__ne__':
                     return np.repeat(True, len(self))
                 else:
-                    msg = "Cannot compare a Categorical for op {op} with a scalar, " \
-                          "which is not a category."
+ msg = ("Cannot compare a Categorical for op {op} with a " + "scalar, which is not a category.") raise TypeError(msg.format(op=op)) else: # allow categorical vs object dtype array comparisons for equality # these are only positional comparisons - if op in ['__eq__','__ne__']: - return getattr(np.array(self),op)(np.array(other)) + if op in ['__eq__', '__ne__']: + return getattr(np.array(self), op)(np.array(other)) - msg = "Cannot compare a Categorical for op {op} with type {typ}. If you want to \n" \ - "compare values, use 'np.asarray(cat) other'." - raise TypeError(msg.format(op=op,typ=type(other))) + msg = ("Cannot compare a Categorical for op {op} with type {typ}." + "\nIf you want to compare values, use 'np.asarray(cat) " + " other'.") + raise TypeError(msg.format(op=op, typ=type(other))) f.__name__ = op return f + def maybe_to_categorical(array): """ coerce to a categorical if a series is given """ if isinstance(array, (ABCSeries, ABCCategoricalIndex)): return array._values return array + _codes_doc = """The category codes of this categorical. Level codes are an array if integer which are the positions of the real values in the categories array. -There is not setter, use the other categorical methods and the normal item setter to change -values in the categorical. +There is not setter, use the other categorical methods and the normal item +setter to change values in the categorical. """ _categories_doc = """The categories of this categorical. @@ -104,16 +112,17 @@ def maybe_to_categorical(array): Setting assigns new values to each category (effectively a rename of each individual category). -The assigned value has to be a list-like object. All items must be unique and the number of items -in the new categories must be the same as the number of items in the old categories. +The assigned value has to be a list-like object. All items must be unique and +the number of items in the new categories must be the same as the number of +items in the old categories. Assigning to `categories` is a inplace operation! Raises ------ ValueError - If the new categories do not validate as categories or if the number of new categories is - unequal the number of old categories + If the new categories do not validate as categories or if the number of new + categories is unequal the number of old categories See also -------- @@ -124,8 +133,9 @@ def maybe_to_categorical(array): remove_unused_categories set_categories """ -class Categorical(PandasObject): + +class Categorical(PandasObject): """ Represents a categorical variable in classic R / S-plus fashion @@ -135,27 +145,29 @@ class Categorical(PandasObject): (additions, divisions, ...) are not possible. All values of the `Categorical` are either in `categories` or `np.nan`. - Assigning values outside of `categories` will raise a `ValueError`. Order is - defined by the order of the `categories`, not lexical order of the values. + Assigning values outside of `categories` will raise a `ValueError`. Order + is defined by the order of the `categories`, not lexical order of the + values. Parameters ---------- values : list-like - The values of the categorical. If categories are given, values not in categories will - be replaced with NaN. + The values of the categorical. If categories are given, values not in + categories will be replaced with NaN. categories : Index-like (unique), optional - The unique categories for this categorical. If not given, the categories are assumed - to be the unique values of values. + The unique categories for this categorical. 
If not given, the + categories are assumed to be the unique values of values. ordered : boolean, (default False) - Whether or not this categorical is treated as a ordered categorical. If not given, - the resulting categorical will not be ordered. + Whether or not this categorical is treated as a ordered categorical. + If not given, the resulting categorical will not be ordered. Attributes ---------- categories : Index The categories of this categorical codes : ndarray - The codes (integer positions, which point to the categories) of this categorical, read only. + The codes (integer positions, which point to the categories) of this + categorical, read only. ordered : boolean Whether or not this Categorical is ordered. @@ -164,8 +176,8 @@ class Categorical(PandasObject): ValueError If the categories do not validate. TypeError - If an explicit ``ordered=True`` is given but no `categories` and the `values` are - not sortable. + If an explicit ``ordered=True`` is given but no `categories` and the + `values` are not sortable. Examples @@ -179,13 +191,13 @@ class Categorical(PandasObject): [a, b, c, a, b, c] Categories (3, object): [a < b < c] - >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'], ordered=True) + >>> a = Categorical(['a','b','c','a','b','c'], ['c', 'b', 'a'], + ordered=True) >>> a.min() 'c' """ dtype = CategoricalDtype() """The dtype (always "category")""" - """Whether or not this Categorical is ordered. Only ordered `Categoricals` can be sorted (according to the order @@ -199,34 +211,38 @@ class Categorical(PandasObject): Categorical.max """ - # For comparisons, so that numpy uses our implementation if the compare ops, which raise + # For comparisons, so that numpy uses our implementation if the compare + # ops, which raise __array_priority__ = 1000 _typ = 'categorical' - def __init__(self, values, categories=None, ordered=False, name=None, fastpath=False, - levels=None): + def __init__(self, values, categories=None, ordered=False, name=None, + fastpath=False, levels=None): if fastpath: # fast path self._codes = _coerce_indexer_dtype(values, categories) - self._categories = self._validate_categories(categories, fastpath=isinstance(categories, ABCIndexClass)) + self._categories = self._validate_categories( + categories, fastpath=isinstance(categories, ABCIndexClass)) self._ordered = ordered return - if not name is None: - msg = "the 'name' keyword is removed, use 'name' with consumers of the " \ - "categorical instead (e.g. 'Series(cat, name=\"something\")'" + if name is not None: + msg = ("the 'name' keyword is removed, use 'name' with consumers " + "of the categorical instead (e.g. 
'Series(cat, " + "name=\"something\")'") warn(msg, UserWarning, stacklevel=2) # TODO: Remove after deprecation period in 2017/ after 0.18 - if not levels is None: - warn("Creating a 'Categorical' with 'levels' is deprecated, use 'categories' instead", - FutureWarning, stacklevel=2) + if levels is not None: + warn("Creating a 'Categorical' with 'levels' is deprecated, use " + "'categories' instead", FutureWarning, stacklevel=2) if categories is None: categories = levels else: - raise ValueError("Cannot pass in both 'categories' and (deprecated) 'levels', " - "use only 'categories'", stacklevel=2) + raise ValueError("Cannot pass in both 'categories' and " + "(deprecated) 'levels', use only " + "'categories'", stacklevel=2) # sanitize input if is_categorical_dtype(values): @@ -246,56 +262,66 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F else: - # on numpy < 1.6 datetimelike get inferred to all i8 by _sanitize_array - # which is fine, but since factorize does this correctly no need here - # this is an issue because _sanitize_array also coerces np.nan to a string - # under certain versions of numpy as well - values = _possibly_infer_to_datetimelike(values, convert_dates=True) + # on numpy < 1.6 datetimelike get inferred to all i8 by + # _sanitize_array which is fine, but since factorize does this + # correctly no need here this is an issue because _sanitize_array + # also coerces np.nan to a string under certain versions of numpy + # as well + values = _possibly_infer_to_datetimelike(values, + convert_dates=True) if not isinstance(values, np.ndarray): values = _convert_to_list_like(values) from pandas.core.series import _sanitize_array - # On list with NaNs, int values will be converted to float. Use "object" dtype - # to prevent this. In the end objects will be casted to int/... in the category - # assignment step. + # On list with NaNs, int values will be converted to float. Use + # "object" dtype to prevent this. In the end objects will be + # casted to int/... in the category assignment step. 
                dtype = 'object' if isnull(values).any() else None
                values = _sanitize_array(values, None, dtype=dtype)
-
        if categories is None:
            try:
                codes, categories = factorize(values, sort=True)
            except TypeError:
                codes, categories = factorize(values, sort=False)
                if ordered:
-                    # raise, as we don't have a sortable data structure and so the user should
-                    # give us one by specifying categories
-                    raise TypeError("'values' is not ordered, please explicitly specify the "
-                                    "categories order by passing in a categories argument.")
+                    # raise, as we don't have a sortable data structure and so
+                    # the user should give us one by specifying categories
+                    raise TypeError("'values' is not ordered, please "
+                                    "explicitly specify the categories order "
+                                    "by passing in a categories argument.")
            except ValueError:
-                ### FIXME ####
-                raise NotImplementedError("> 1 ndim Categorical are not supported at this time")
+                # FIXME
+                raise NotImplementedError("> 1 ndim Categorical are not "
+                                          "supported at this time")

            categories = self._validate_categories(categories)

        else:
            # there were two ways if categories are present
-            # - the old one, where each value is a int pointer to the levels array -> not anymore
-            #   possible, but code outside of pandas could call us like that, so make some checks
-            # - the new one, where each value is also in the categories array (or np.nan)
-
-            # make sure that we always have the same type here, no matter what we get passed in
+            # - the old one, where each value is a int pointer to the levels
+            #   array -> not anymore possible, but code outside of pandas could
+            #   call us like that, so make some checks
+            # - the new one, where each value is also in the categories array
+            #   (or np.nan)
+
+            # make sure that we always have the same type here, no matter what
+            # we get passed in
            categories = self._validate_categories(categories)
            codes = _get_codes_for_values(values, categories)

-        # TODO: check for old style usage. These warnings should be removes after 0.18/ in 2016
+        # TODO: check for old style usage. These warnings should be removed
+        # after 0.18/ in 2016
        if is_integer_dtype(values) and not is_integer_dtype(categories):
-            warn("Values and categories have different dtypes. Did you mean to use\n"
-                 "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2)
+            warn("Values and categories have different dtypes. Did you "
+                 "mean to use\n'Categorical.from_codes(codes, "
+                 "categories)'?", RuntimeWarning, stacklevel=2)

-        if len(values) and is_integer_dtype(values) and (codes == -1).all():
-            warn("None of the categories were found in values. Did you mean to use\n"
-                 "'Categorical.from_codes(codes, categories)'?", RuntimeWarning, stacklevel=2)
+        if (len(values) and is_integer_dtype(values) and
+                (codes == -1).all()):
+            warn("None of the categories were found in values. Did you "
+                 "mean to use\n'Categorical.from_codes(codes, "
+                 "categories)'?", RuntimeWarning, stacklevel=2)

        self.set_ordered(ordered or False, inplace=True)
        self._categories = categories
@@ -303,8 +329,9 @@ def __init__(self, values, categories=None, ordered=False, name=None, fastpath=F

    def copy(self):
        """ Copy constructor. """
-        return Categorical(values=self._codes.copy(),categories=self.categories,
-                           ordered=self.ordered, fastpath=True)
+        return Categorical(values=self._codes.copy(),
+                           categories=self.categories, ordered=self.ordered,
+                           fastpath=True)

    def astype(self, dtype):
        """ coerce this type to another dtype """
@@ -356,37 +383,45 @@ def from_codes(cls, codes, categories, ordered=False, name=None):
        """
        Make a Categorical type from codes and categories arrays.

-        This constructor is useful if you already have codes and categories and so do not need the
-        (computation intensive) factorization step, which is usually done on the constructor.
+        This constructor is useful if you already have codes and categories and
+        so do not need the (computation intensive) factorization step, which is
+        usually done on the constructor.

-        If your data does not follow this convention, please use the normal constructor.
+        If your data does not follow this convention, please use the normal
+        constructor.

        Parameters
        ----------
        codes : array-like, integers
-            An integer array, where each integer points to a category in categories or -1 for NaN
+            An integer array, where each integer points to a category in
+            categories or -1 for NaN
        categories : index-like
            The categories for the categorical. Items need to be unique.
        ordered : boolean, (default False)
-            Whether or not this categorical is treated as a ordered categorical. If not given,
-            the resulting categorical will be unordered.
-        """
-        if not name is None:
-            msg = "the 'name' keyword is removed, use 'name' with consumers of the " \
-                  "categorical instead (e.g. 'Series(cat, name=\"something\")'"
+            Whether or not this categorical is treated as an ordered
+            categorical. If not given, the resulting categorical will be
+            unordered.
+        """
+        if name is not None:
+            msg = ("the 'name' keyword is removed, use 'name' with consumers "
+                   "of the categorical instead (e.g. 'Series(cat, "
+                   "name=\"something\")'")
            warn(msg, UserWarning, stacklevel=2)

        try:
            codes = np.asarray(codes, np.int64)
        except:
-            raise ValueError("codes need to be convertible to an arrays of integers")
+            raise ValueError(
+                "codes need to be convertible to an array of integers")

        categories = cls._validate_categories(categories)

        if len(codes) and (codes.max() >= len(categories) or codes.min() < -1):
-            raise ValueError("codes need to be between -1 and len(categories)-1")
+            raise ValueError("codes need to be between -1 and "
+                             "len(categories)-1")

-        return Categorical(codes, categories=categories, ordered=ordered, fastpath=True)
+        return Categorical(codes, categories=categories, ordered=ordered,
+                           fastpath=True)

    _codes = None

@@ -416,7 +451,8 @@ def _get_labels(self):

        Deprecated, use .codes!
        """
-        warn("'labels' is deprecated. Use 'codes' instead", FutureWarning, stacklevel=2)
+        warn("'labels' is deprecated. Use 'codes' instead", FutureWarning,
+             stacklevel=2)
        return self.codes

    labels = property(fget=_get_labels, fset=_set_codes)
@@ -438,10 +474,11 @@ def _validate_categories(cls, categories, fastpath=False):
            dtype = None
            if not hasattr(categories, "dtype"):
                categories = _convert_to_list_like(categories)
-                # on categories with NaNs, int values would be converted to float.
-                # Use "object" dtype to prevent this.
+                # On categories with NaNs, int values would be converted to
+                # float. Use "object" dtype to prevent this.
                if isnull(categories).any():
-                    without_na = np.array([x for x in categories if notnull(x)])
+                    without_na = np.array([x for x in categories
+                                           if notnull(x)])
                    with_na = np.array(categories)
                    if with_na.dtype != without_na.dtype:
                        dtype = "object"
@@ -455,7 +492,8 @@ def _validate_categories(cls, categories, fastpath=False):

            # we don't allow NaNs in the categories themselves
            if categories.hasnans:
-                # NaNs in cats deprecated in 0.17, remove in 0.18 or 0.19 GH 10748
+                # NaNs in cats deprecated in 0.17,
+                # remove in 0.18 or 0.19 GH 10748
                msg = ('\nSetting NaNs in `categories` is deprecated and '
                       'will be removed in a future version of pandas.')
                warn(msg, FutureWarning, stacklevel=3)
@@ -478,9 +516,10 @@ def _set_categories(self, categories, fastpath=False):

        """
        categories = self._validate_categories(categories, fastpath=fastpath)
-        if not fastpath and not self._categories is None and len(categories) != len(self._categories):
-            raise ValueError("new categories need to have the same number of items than the old "
-                             "categories!")
+        if (not fastpath and self._categories is not None and
+                len(categories) != len(self._categories)):
+            raise ValueError("new categories need to have the same number of "
+                             "items as the old categories!")

        self._categories = categories

@@ -489,16 +528,19 @@ def _get_categories(self):
        # categories is an Index, which is immutable -> no need to copy
        return self._categories

-    categories = property(fget=_get_categories, fset=_set_categories, doc=_categories_doc)
+    categories = property(fget=_get_categories, fset=_set_categories,
+                          doc=_categories_doc)

    def _set_levels(self, levels):
        """ set new levels (deprecated, use "categories") """
-        warn("Assigning to 'levels' is deprecated, use 'categories'", FutureWarning, stacklevel=2)
+        warn("Assigning to 'levels' is deprecated, use 'categories'",
+             FutureWarning, stacklevel=2)
        self.categories = levels

    def _get_levels(self):
        """ Gets the levels (deprecated, use "categories") """
-        warn("Accessing 'levels' is deprecated, use 'categories'", FutureWarning, stacklevel=2)
+        warn("Accessing 'levels' is deprecated, use 'categories'",
+             FutureWarning, stacklevel=2)
        return self.categories

    # TODO: Remove after deprecation period in 2017/ after 0.18
@@ -508,8 +550,8 @@ def _get_levels(self):

    def _set_ordered(self, value):
        """ Sets the ordered attribute to the boolean value """
-        warn("Setting 'ordered' directly is deprecated, use 'set_ordered'", FutureWarning,
-             stacklevel=2)
+        warn("Setting 'ordered' directly is deprecated, use 'set_ordered'",
+             FutureWarning, stacklevel=2)
        self.set_ordered(value, inplace=True)

    def set_ordered(self, value, inplace=False):
        """
@@ -518,10 +560,11 @@ def set_ordered(self, value, inplace=False):

        Parameters
        ----------
-        value : boolean to set whether this categorical is ordered (True) or not (False)
+        value : boolean to set whether this categorical is ordered (True) or
+            not (False)
        inplace : boolean (default: False)
-            Whether or not to set the ordered attribute inplace or return a copy of this categorical
-            with ordered set to the value
+            Whether or not to set the ordered attribute inplace or return a copy
+            of this categorical with ordered set to the value
        """
        if not is_bool(value):
            raise TypeError("ordered must be a boolean value")
@@ -537,8 +580,8 @@ def as_ordered(self, inplace=False):

        Parameters
        ----------
        inplace : boolean (default: False)
-            Whether or not to set the ordered attribute inplace or return a copy of this categorical
-            with ordered set to True
+            Whether or not to set the ordered attribute inplace or return a copy
+            of this categorical with ordered set to True
        """
        return self.set_ordered(True, inplace=inplace)

@@ -549,8 +592,8 @@ def as_unordered(self, inplace=False):

        Parameters
        ----------
        inplace : boolean (default: False)
-            Whether or not to set the ordered attribute inplace or return a copy of this categorical
-            with ordered set to False
+            Whether or not to set the ordered attribute inplace or return a copy
+            of this categorical with ordered set to False
        """
        return self.set_ordered(False, inplace=inplace)

@@ -560,22 +603,25 @@ def _get_ordered(self):

    ordered = property(fget=_get_ordered, fset=_set_ordered)

-    def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
+    def set_categories(self, new_categories, ordered=None, rename=False,
+                       inplace=False):
        """ Sets the categories to the specified new_categories.

-        `new_categories` can include new categories (which will result in unused categories) or
-        or remove old categories (which results in values set to NaN). If `rename==True`,
-        the categories will simple be renamed (less or more items than in old categories will
-        result in values set to NaN or in unused categories respectively).
+        `new_categories` can include new categories (which will result in
+        unused categories) or remove old categories (which results in values
+        set to NaN). If `rename==True`, the categories will simply be renamed
+        (less or more items than in old categories will result in values set to
+        NaN or in unused categories respectively).

-        This method can be used to perform more than one action of adding, removing,
-        and reordering simultaneously and is therefore faster than performing the individual steps
-        via the more specialised methods.
+        This method can be used to perform more than one action of adding,
+        removing, and reordering simultaneously and is therefore faster than
+        performing the individual steps via the more specialised methods.

-        On the other hand this methods does not do checks (e.g., whether the old categories are
-        included in the new categories on a reorder), which can result in surprising changes, for
-        example when using special string dtypes on python3, which does not considers a S1 string
-        equal to a single char python string.
+        On the other hand this method does not do checks (e.g., whether the
+        old categories are included in the new categories on a reorder), which
+        can result in surprising changes, for example when using special string
+        dtypes on python3, which does not consider a S1 string equal to a
+        single char python string.

        Raises
        ------
@@ -587,14 +633,14 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
        new_categories : Index-like
           The categories in new order.
        ordered : boolean, (default: False)
-           Whether or not the categorical is treated as a ordered categorical. If not given,
-           do not change the ordered information.
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
        rename : boolean (default: False)
-           Whether or not the new_categories should be considered as a rename of the old
-           categories or as reordered categories.
+           Whether or not the new_categories should be considered as a rename
+           of the old categories or as reordered categories.
        inplace : boolean (default: False)
-           Whether or not to reorder the categories inplace or return a copy of this categorical
-           with reordered categories.
+           Whether or not to reorder the categories inplace or return a copy of
+           this categorical with reordered categories.

        Returns
        -------
        cat : Categorical with reordered categories or None if inplace.

@@ -611,7 +657,8 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
        new_categories = self._validate_categories(new_categories)
        cat = self if inplace else self.copy()
        if rename:
-            if not cat._categories is None and len(new_categories) < len(cat._categories):
+            if (cat._categories is not None and
+                    len(new_categories) < len(cat._categories)):
                # remove all _codes which are larger and set to -1/NaN
                self._codes[self._codes >= len(new_categories)] = -1
        else:
@@ -629,22 +676,23 @@ def set_categories(self, new_categories, ordered=None, rename=False, inplace=Fal
    def rename_categories(self, new_categories, inplace=False):
        """ Renames categories.

-        The new categories has to be a list-like object. All items must be unique and the number of
-        items in the new categories must be the same as the number of items in the old categories.
+        The new categories have to be a list-like object. All items must be
+        unique and the number of items in the new categories must be the same
+        as the number of items in the old categories.

        Raises
        ------
        ValueError
-            If the new categories do not have the same number of items than the current categories
-            or do not validate as categories
+            If the new categories do not have the same number of items as the
+            current categories or do not validate as categories

        Parameters
        ----------
        new_categories : Index-like
           The renamed categories.
        inplace : boolean (default: False)
-           Whether or not to rename the categories inplace or return a copy of this categorical
-           with renamed categories.
+           Whether or not to rename the categories inplace or return a copy of
+           this categorical with renamed categories.

        Returns
        -------
@@ -666,23 +714,25 @@ def rename_categories(self, new_categories, inplace=False):
    def reorder_categories(self, new_categories, ordered=None, inplace=False):
        """ Reorders categories as specified in new_categories.

-        `new_categories` need to include all old categories and no new category items.
+        `new_categories` need to include all old categories and no new category
+        items.

        Raises
        ------
        ValueError
-            If the new categories do not contain all old category items or any new ones
+            If the new categories do not contain all old category items or any
+            new ones

        Parameters
        ----------
        new_categories : Index-like
           The categories in new order.
        ordered : boolean, optional
-           Whether or not the categorical is treated as a ordered categorical. If not given,
-           do not change the ordered information.
+           Whether or not the categorical is treated as an ordered categorical.
+           If not given, do not change the ordered information.
        inplace : boolean (default: False)
-           Whether or not to reorder the categories inplace or return a copy of this categorical
-           with reordered categories.
+           Whether or not to reorder the categories inplace or return a copy of
+           this categorical with reordered categories.

        Returns
        -------
        cat : Categorical with reordered categories or None if inplace.

@@ -697,27 +747,30 @@ def reorder_categories(self, new_categories, ordered=None, inplace=False):
        set_categories
        """
        if set(self._categories) != set(new_categories):
-            raise ValueError("items in new_categories are not the same as in old categories")
-        return self.set_categories(new_categories, ordered=ordered, inplace=inplace)
+            raise ValueError("items in new_categories are not the same as in "
+                             "old categories")
+        return self.set_categories(new_categories, ordered=ordered,
+                                   inplace=inplace)

    def add_categories(self, new_categories, inplace=False):
        """ Add new categories.

-        `new_categories` will be included at the last/highest place in the categories and will be
-        unused directly after this call.
+        `new_categories` will be included at the last/highest place in the
+        categories and will be unused directly after this call.

        Raises
        ------
        ValueError
-            If the new categories include old categories or do not validate as categories
+            If the new categories include old categories or do not validate as
+            categories

        Parameters
        ----------
        new_categories : category or list-like of category
           The new categories to be included.
        inplace : boolean (default: False)
-           Whether or not to add the categories inplace or return a copy of this categorical
-           with added categories.
+           Whether or not to add the categories inplace or return a copy of
+           this categorical with added categories.

        Returns
        -------
@@ -735,7 +788,8 @@ def add_categories(self, new_categories, inplace=False):
            new_categories = [new_categories]
        already_included = set(new_categories) & set(self._categories)
        if len(already_included) != 0:
-            msg = "new categories must not include old categories: %s" % str(already_included)
+            msg = ("new categories must not include old categories: %s" %
+                   str(already_included))
            raise ValueError(msg)
        new_categories = list(self._categories) + list(new_categories)
        cat = self if inplace else self.copy()
@@ -747,8 +801,8 @@ def add_categories(self, new_categories, inplace=False):
    def remove_categories(self, removals, inplace=False):
        """ Removes the specified categories.

-        `removals` must be included in the old categories. Values which were in the removed
-        categories will be set to NaN
+        `removals` must be included in the old categories. Values which were in
+        the removed categories will be set to NaN

        Raises
        ------
@@ -760,8 +814,8 @@ def remove_categories(self, removals, inplace=False):
        removals : category or list of categories
           The categories which should be removed.
        inplace : boolean (default: False)
-           Whether or not to remove the categories inplace or return a copy of this categorical
-           with removed categories.
+           Whether or not to remove the categories inplace or return a copy of
+           this categorical with removed categories.

        Returns
        -------
        cat : Categorical with removed categories or None if inplace.

@@ -780,7 +834,7 @@ def remove_categories(self, removals, inplace=False):

        removal_set = set(list(removals))
        not_included = removal_set - set(self._categories)
-        new_categories = [ c for c in self._categories if c not in removal_set ]
+        new_categories = [c for c in self._categories if c not in removal_set]

        # GH 10156
        if any(isnull(removals)):
@@ -788,11 +842,11 @@ def remove_categories(self, removals, inplace=False):
            new_categories = [x for x in new_categories if notnull(x)]

        if len(not_included) != 0:
-            raise ValueError("removals must all be in old categories: %s" % str(not_included))
-
-        return self.set_categories(new_categories, ordered=self.ordered, rename=False,
-                                   inplace=inplace)
+            raise ValueError("removals must all be in old categories: %s" %
+                             str(not_included))

+        return self.set_categories(new_categories, ordered=self.ordered,
+                                   rename=False, inplace=inplace)

    def remove_unused_categories(self, inplace=False):
        """ Removes categories which are not used.

@@ -800,8 +854,8 @@ def remove_unused_categories(self, inplace=False):
        Parameters
        ----------
        inplace : boolean (default: False)
-           Whether or not to drop unused categories inplace or return a copy of this categorical
-           with unused categories dropped.
+           Whether or not to drop unused categories inplace or return a copy of
+           this categorical with unused categories dropped.

        Returns
        -------
@@ -827,7 +881,6 @@ def remove_unused_categories(self, inplace=False):
        if not inplace:
            return cat

-
    __eq__ = _cat_compare_op('__eq__')
    __ne__ = _cat_compare_op('__ne__')
    __lt__ = _cat_compare_op('__lt__')
@@ -874,8 +927,7 @@ def shift(self, periods):
        else:
            codes[periods:] = -1

-        return Categorical.from_codes(codes,
-                                      categories=self.categories,
+        return Categorical.from_codes(codes, categories=self.categories,
                                      ordered=self.ordered)

    def __array__(self, dtype=None):
@@ -885,11 +937,12 @@ def __array__(self, dtype=None):
        Returns
        -------
        values : numpy array
-            A numpy array of either the specified dtype or, if dtype==None (default), the same
-            dtype as categorical.categories.dtype
+            A numpy array of either the specified dtype or,
+            if dtype==None (default), the same dtype as
+            categorical.categories.dtype
        """
        ret = take_1d(self.categories.values, self._codes)
-        if dtype and not is_dtype_equal(dtype,self.categories.dtype):
+        if dtype and not is_dtype_equal(dtype, self.categories.dtype):
            return np.asarray(ret, dtype)
        return ret

@@ -902,8 +955,8 @@ def __setstate__(self, state):
        if '_codes' not in state and 'labels' in state:
            state['_codes'] = state.pop('labels')
        if '_categories' not in state and '_levels' in state:
-            state['_categories'] = \
-                self._validate_categories(state.pop('_levels'))
+            state['_categories'] = self._validate_categories(state.pop(
+                '_levels'))

        # 0.16.0 ordered change
        if '_ordered' not in state:
@@ -960,7 +1013,8 @@ def searchsorted(self, v, side='left', sorter=None):
        Parameters
        ----------
        v : array_like
-            Array-like values or a scalar value, to insert/search for in `self`.
+            Array-like values or a scalar value, to insert/search for in
+            `self`.
        side : {'left', 'right'}, optional
            If 'left', the index of the first suitable location found is given.
            If 'right', return the last such index. If there is no suitable
@@ -996,16 +1050,20 @@ def searchsorted(self, v, side='left', sorter=None):
        array([1, 4])
        >>> x.searchsorted(['bread', 'eggs'], side='right')
        array([3, 4])	    # eggs before milk
-        >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', 'donuts' ])
-        >>> x.searchsorted(['bread', 'eggs'], side='right', sorter=[0, 1, 2, 3, 5, 4])
-        array([3, 5]) # eggs after donuts, after switching milk and donuts
+        >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk',
+                                'donuts' ])
+        >>> x.searchsorted(['bread', 'eggs'], side='right',
+                           sorter=[0, 1, 2, 3, 5, 4])
+        array([3, 5]) # eggs after donuts, after switching milk and donuts
        """
        if not self.ordered:
-            raise ValueError("Categorical not ordered\n"
-                             "you can use .as_ordered() to change the Categorical to an ordered one\n")
+            raise ValueError("Categorical not ordered\nyou can use "
+                             ".as_ordered() to change the Categorical to an "
+                             "ordered one")
        from pandas.core.series import Series
-        values_as_codes = self.categories.values.searchsorted(Series(v).values, side)
+        values_as_codes = self.categories.values.searchsorted(
+            Series(v).values, side)

        return self.codes.searchsorted(values_as_codes, sorter=sorter)

    def isnull(self):
@@ -1031,14 +1089,15 @@ def isnull(self):
        if np.nan in self.categories:
            nan_pos = np.where(isnull(self.categories))[0]
            # we only have one NA in categories
-            ret = np.logical_or(ret , self._codes == nan_pos)
+            ret = np.logical_or(ret, self._codes == nan_pos)
        return ret

    def notnull(self):
        """
        Reverse of isnull

-        Both missing values (-1 in .codes) and NA as a category are detected as null.
+        Both missing values (-1 in .codes) and NA as a category are detected as
+        null.

        Returns
        -------
@@ -1087,9 +1146,8 @@ def value_counts(self, dropna=True):
        from pandas.core.series import Series
        from pandas.core.index import CategoricalIndex

-        obj = self.remove_categories([np.nan]) \
-            if dropna and isnull(self.categories).any() else self
-
+        obj = (self.remove_categories([np.nan]) if dropna and
+               isnull(self.categories).any() else self)
        code, cat = obj._codes, obj.categories
        ncat, mask = len(cat), 0 <= code
        ix, clean = np.arange(ncat), mask.all()
@@ -1101,8 +1159,8 @@ def value_counts(self, dropna=True):
            count = bincount(np.where(mask, code, ncat))
            ix = np.append(ix, -1)

-        ix = Categorical(ix, categories=cat,
-                         ordered=obj.ordered, fastpath=True)
+        ix = Categorical(ix, categories=cat, ordered=obj.ordered,
+                         fastpath=True)

        return Series(count, index=CategoricalIndex(ix), dtype='int64')

@@ -1126,7 +1184,8 @@ def check_for_ordered(self, op):
        """ assert that we are ordered """
        if not self.ordered:
            raise TypeError("Categorical is not ordered for operation {op}\n"
-                            "you can use .as_ordered() to change the Categorical to an ordered one\n".format(op=op))
+                            "you can use .as_ordered() to change the "
+                            "Categorical to an ordered one\n".format(op=op))

    def argsort(self, ascending=True, **kwargs):
        """ Implements ndarray.argsort.
@@ -1145,7 +1204,8 @@ def argsort(self, ascending=True, **kwargs):
        return result

    def sort_values(self, inplace=False, ascending=True, na_position='last'):
-        """ Sorts the Category by category value returning a new Categorical by default.
+        """ Sorts the Category by category value returning a new Categorical by
+        default.

        Only ordered Categoricals can be sorted!
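Not part of the patch: the searchsorted hunk above only reflows the docstring and the "Categorical not ordered" error text, but the contract it states is easy to miss — lookups are defined only for ordered categoricals. A minimal sketch of that behavior, with hypothetical data (any pandas of roughly this vintage or later):

    import pandas as pd

    cat = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk'],
                         ordered=True)
    # categories default to the sorted unique values, so the left
    # insertion point for 'bread' is position 1
    print(cat.searchsorted('bread'))

    unordered = pd.Categorical(['apple', 'bread', 'cheese'])
    try:
        unordered.searchsorted('bread')   # unordered -> raises
    except ValueError as err:
        print(err)                        # "Categorical not ordered ..."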
@@ -1169,7 +1229,7 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
        --------
        Category.sort
        """
-        if na_position not in ['last','first']:
+        if na_position not in ['last', 'first']:
            raise ValueError('invalid na_position: {!r}'.format(na_position))

        codes = np.sort(self._codes)
@@ -1177,19 +1237,19 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
            codes = codes[::-1]

        # NaN handling
-        na_mask = (codes==-1)
+        na_mask = (codes == -1)
        if na_mask.any():
            n_nans = len(codes[na_mask])
-            if na_position=="first" and not ascending:
+            if na_position == "first" and not ascending:
                # in this case sort to the front
                new_codes = codes.copy()
                new_codes[0:n_nans] = -1
                new_codes[n_nans:] = codes[~na_mask]
                codes = new_codes
-            elif na_position=="last" and not ascending:
+            elif na_position == "last" and not ascending:
                # ... and to the end
                new_codes = codes.copy()
-                pos = len(codes)-n_nans
+                pos = len(codes) - n_nans
                new_codes[0:pos] = codes[~na_mask]
                new_codes[pos:] = -1
                codes = new_codes
@@ -1197,14 +1257,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
            self._codes = codes
            return
        else:
-            return Categorical(values=codes,categories=self.categories, ordered=self.ordered,
-                               fastpath=True)
+            return Categorical(values=codes, categories=self.categories,
+                               ordered=self.ordered, fastpath=True)

    def order(self, inplace=False, ascending=True, na_position='last'):
        """
        DEPRECATED: use :meth:`Categorical.sort_values`

-        Sorts the Category by category value returning a new Categorical by default.
+        Sorts the Category by category value returning a new Categorical by
+        default.

        Only ordered Categoricals can be sorted!

@@ -1228,9 +1289,10 @@ def order(self, inplace=False, ascending=True, na_position='last'):
        --------
        Category.sort
        """
-        warn("order is deprecated, use sort_values(...)",
-             FutureWarning, stacklevel=2)
-        return self.sort_values(inplace=inplace, ascending=ascending, na_position=na_position)
+        warn("order is deprecated, use sort_values(...)", FutureWarning,
+             stacklevel=2)
+        return self.sort_values(inplace=inplace, ascending=ascending,
+                                na_position=na_position)

    def sort(self, inplace=True, ascending=True, na_position='last'):
        """ Sorts the Category inplace by category value.
@@ -1340,8 +1402,8 @@ def fillna(self, value=None, method=None, limit=None):

        if method is not None:
            values = self.to_dense().reshape(-1, len(self))
-            values = interpolate_2d(
-                values, method, 0, None, value).astype(self.categories.dtype)[0]
+            values = interpolate_2d(values, method, 0, None,
+                                    value).astype(self.categories.dtype)[0]
            values = _get_codes_for_values(values, self.categories)

        else:
@@ -1349,13 +1411,13 @@ def fillna(self, value=None, method=None, limit=None):
            if not isnull(value) and value not in self.categories:
                raise ValueError("fill value must be in categories")

-            mask = values==-1
+            mask = values == -1
            if mask.any():
                values = values.copy()
                values[mask] = self.categories.get_loc(value)

-        return Categorical(values, categories=self.categories, ordered=self.ordered,
-                           fastpath=True)
+        return Categorical(values, categories=self.categories,
+                           ordered=self.ordered, fastpath=True)

    def take_nd(self, indexer, allow_fill=True, fill_value=None):
        """ Take the codes by the indexer, fill with the fill_value.
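Not part of the patch: the fillna hunks above only rewrap the "fill value must be in categories" check without changing it. A small sketch of the rule it enforces, with hypothetical data (the exception type has varied across pandas versions, so both are caught):

    import pandas as pd

    cat = pd.Categorical(['a', None, 'b'], categories=['a', 'b'])
    print(cat.fillna('a'))     # the missing slot becomes the category 'a'

    try:
        cat.fillna('z')        # 'z' is not an existing category
    except (ValueError, TypeError) as err:
        print(err)             # e.g. "fill value must be in categories"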
@@ -1368,8 +1430,8 @@ def take_nd(self, indexer, allow_fill=True, fill_value=None):
            assert isnull(fill_value)

        codes = take_1d(self._codes, indexer, allow_fill=True, fill_value=-1)
-        result = Categorical(codes, categories=self.categories, ordered=self.ordered,
-                             fastpath=True)
+        result = Categorical(codes, categories=self.categories,
+                             ordered=self.ordered, fastpath=True)
        return result

    take = take_nd
@@ -1384,12 +1446,13 @@ def _slice(self, slicer):
        # in a 2-d case be passd (slice(None),....)
        if isinstance(slicer, tuple) and len(slicer) == 2:
            if not is_null_slice(slicer[0]):
-                raise AssertionError("invalid slicing for a 1-ndim categorical")
+                raise AssertionError("invalid slicing for a 1-ndim "
+                                     "categorical")
            slicer = slicer[1]

        _codes = self._codes[slicer]
-        return Categorical(values=_codes,categories=self.categories, ordered=self.ordered,
-                           fastpath=True)
+        return Categorical(values=_codes, categories=self.categories,
+                           ordered=self.ordered, fastpath=True)

    def __len__(self):
        """The length of this Categorical."""
@@ -1400,11 +1463,12 @@ def __iter__(self):
        return iter(self.get_values())

    def _tidy_repr(self, max_vals=10, footer=True):
-        """ a short repr displaying only max_vals and an optional (but default footer) """
+        """ a short repr displaying only max_vals and an optional (but default
+        footer)
+        """
        num = max_vals // 2
        head = self[:num]._get_repr(length=False, footer=False)
-        tail = self[-(max_vals - num):]._get_repr(length=False,
-                                                  footer=False)
+        tail = self[-(max_vals - num):]._get_repr(length=False, footer=False)

        result = '%s, ..., %s' % (head[:-1], tail[1:])
        if footer:
@@ -1414,8 +1478,8 @@ def _tidy_repr(self, max_vals=10, footer=True):

    def _repr_categories(self):
        """ return the base repr for the categories """
-        max_categories = (10 if get_option("display.max_categories") == 0
-                          else get_option("display.max_categories"))
+        max_categories = (10 if get_option("display.max_categories") == 0 else
+                          get_option("display.max_categories"))
        from pandas.core import format as fmt
        if len(self.categories) > max_categories:
            num = max_categories // 2
@@ -1433,7 +1497,8 @@ def _repr_categories_info(self):
        """ Returns a string representation of the footer."""

        category_strs = self._repr_categories()
-        dtype = getattr(self.categories, 'dtype_str', str(self.categories.dtype))
+        dtype = getattr(self.categories, 'dtype_str',
+                        str(self.categories.dtype))

        levheader = "Categories (%d, %s): " % (len(self.categories), dtype)
        width, height = get_terminal_size()
@@ -1443,20 +1508,20 @@ def _repr_categories_info(self):
            max_width = 0
        levstring = ""
        start = True
-        cur_col_len = len(levheader) # header
+        cur_col_len = len(levheader)  # header
        sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
-        linesep = sep.rstrip() + "\n" # remove whitespace
+        linesep = sep.rstrip() + "\n"  # remove whitespace
        for val in category_strs:
            if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
                levstring += linesep + (" " * (len(levheader) + 1))
-                cur_col_len = len(levheader) + 1 # header + a whitespace
+                cur_col_len = len(levheader) + 1  # header + a whitespace
            elif not start:
                levstring += sep
                cur_col_len += len(val)
            levstring += val
            start = False
        # replace to simple save space by
-        return levheader + "["+levstring.replace(" < ... < ", " ... ")+"]"
+        return levheader + "[" + levstring.replace(" < ... < ", " ... ") + "]"

    def _repr_footer(self):
@@ -1464,10 +1529,8 @@ def _repr_footer(self):

    def _get_repr(self, length=True, na_rep='NaN', footer=True):
        from pandas.core import format as fmt
-        formatter = fmt.CategoricalFormatter(self,
-                                             length=length,
-                                             na_rep=na_rep,
-                                             footer=footer)
+        formatter = fmt.CategoricalFormatter(self, length=length,
+                                             na_rep=na_rep, footer=footer)
        result = formatter.to_string()
        return compat.text_type(result)

@@ -1479,9 +1542,9 @@ def __unicode__(self):
        elif len(self._codes) > 0:
            result = self._get_repr(length=len(self) > _maxlen)
        else:
-            result = '[], %s' % self._get_repr(length=False,
-                                               footer=True,
-                                               ).replace("\n",", ")
+            result = ('[], %s' %
+                      self._get_repr(length=False,
+                                     footer=True, ).replace("\n", ", "))

        return result

@@ -1500,8 +1563,10 @@ def __getitem__(self, key):
            else:
                return self.categories[i]
        else:
-            return Categorical(values=self._codes[key], categories=self.categories,
-                               ordered=self.ordered, fastpath=True)
+            return Categorical(values=self._codes[key],
+                               categories=self.categories,
+                               ordered=self.ordered,
+                               fastpath=True)

    def __setitem__(self, key, value):
        """ Item assignment.
@@ -1510,26 +1575,26 @@ def __setitem__(self, key, value):
        Raises
        ------
        ValueError
-            If (one or more) Value is not in categories or if a assigned `Categorical` has not the
-            same categories
-
+            If (one or more) Value is not in categories or if an assigned
+            `Categorical` does not have the same categories
        """

        # require identical categories set
        if isinstance(value, Categorical):
            if not value.categories.equals(self.categories):
-                raise ValueError("Cannot set a Categorical with another, without identical "
-                                 "categories")
+                raise ValueError("Cannot set a Categorical with another, "
+                                 "without identical categories")

        rvalue = value if is_list_like(value) else [value]

        from pandas import Index
        to_add = Index(rvalue).difference(self.categories)

-        # no assignments of values not in categories, but it's always ok to set something to np.nan
+        # no assignments of values not in categories, but it's always ok to set
+        # something to np.nan
        if len(to_add) and not isnull(to_add).all():
-            raise ValueError("cannot setitem on a Categorical with a new category,"
-                             " set the categories first")
+            raise ValueError("Cannot setitem on a Categorical with a new "
+                             "category, set the categories first")

        # set by position
        if isinstance(key, (int, np.integer)):
@@ -1541,12 +1606,14 @@ def __setitem__(self, key, value):
            # in a 2-d case be passd (slice(None),....)
            if len(key) == 2:
                if not is_null_slice(key[0]):
-                    raise AssertionError("invalid slicing for a 1-ndim categorical")
+                    raise AssertionError("invalid slicing for a 1-ndim "
+                                         "categorical")
                key = key[1]
            elif len(key) == 1:
                key = key[0]
            else:
-                raise AssertionError("invalid slicing for a 1-ndim categorical")
+                raise AssertionError("invalid slicing for a 1-ndim "
+                                     "categorical")

        # slicing in Series or Categorical
        elif isinstance(key, slice):
@@ -1554,18 +1621,20 @@ def __setitem__(self, key, value):

        # Array of True/False in Series or Categorical
        else:
-            # There is a bug in numpy, which does not accept a Series as a indexer
+            # There is a bug in numpy, which does not accept a Series as a
+            # indexer
            # https://github.com/pydata/pandas/issues/6168
            # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9
-            # FIXME: remove when numpy 1.9 is the lowest numpy version pandas accepts...
+            # FIXME: remove when numpy 1.9 is the lowest numpy version pandas
+            # accepts...
            key = np.asarray(key)

        lindexer = self.categories.get_indexer(rvalue)

-        # FIXME: the following can be removed after https://github.com/pydata/pandas/issues/7820
-        # is fixed.
-        # float categories do currently return -1 for np.nan, even if np.nan is included in the
-        # index -> "repair" this here
+        # FIXME: the following can be removed after GH7820 is fixed:
+        # https://github.com/pydata/pandas/issues/7820
+        # float categories do currently return -1 for np.nan, even if np.nan is
+        # included in the index -> "repair" this here
        if isnull(rvalue).any() and isnull(self.categories).any():
            nan_pos = np.where(isnull(self.categories))[0]
            lindexer[lindexer == -1] = nan_pos
@@ -1573,13 +1642,14 @@ def __setitem__(self, key, value):
        lindexer = self._maybe_coerce_indexer(lindexer)
        self._codes[key] = lindexer

-    #### reduction ops ####
+    # reduction ops #
    def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
                filter_type=None, **kwds):
        """ perform the reduction type operation """
-        func = getattr(self,name,None)
+        func = getattr(self, name, None)
        if func is None:
-            raise TypeError("Categorical cannot perform the operation {op}".format(op=name))
+            raise TypeError("Categorical cannot perform the operation "
+                            "{op}".format(op=name))
        return func(numeric_only=numeric_only, **kwds)

    def min(self, numeric_only=None, **kwargs):
@@ -1607,7 +1677,6 @@ def min(self, numeric_only=None, **kwargs):
        else:
            return self.categories[pointer]

-
    def max(self, numeric_only=None, **kwargs):
        """ The maximum value of the object.

@@ -1637,8 +1706,8 @@ def mode(self):
        """
        Returns the mode(s) of the Categorical.

-        Empty if nothing occurs at least 2 times. Always returns `Categorical` even
-        if only one value.
+        Empty if nothing occurs at least 2 times. Always returns `Categorical`
+        even if only one value.

        Returns
        -------
@@ -1647,14 +1716,15 @@ def mode(self):
        import pandas.hashtable as htable
        good = self._codes != -1
-        result = Categorical(sorted(htable.mode_int64(_ensure_int64(self._codes[good]))),
-                             categories=self.categories,ordered=self.ordered, fastpath=True)
+        values = sorted(htable.mode_int64(_ensure_int64(self._codes[good])))
+        result = Categorical(values=values, categories=self.categories,
+                             ordered=self.ordered, fastpath=True)
        return result

    def unique(self):
        """
-        Return the ``Categorical`` which ``categories`` and ``codes`` are unique.
-        Unused categories are NOT returned.
+        Return the ``Categorical`` which ``categories`` and ``codes`` are
+        unique. Unused categories are NOT returned.

        - unordered category: values and categories are sorted by appearance
          order.
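Not part of the patch: the __setitem__ hunks above reword the "Cannot setitem on a Categorical with a new category" error, whose rule is that only existing categories (or NaN) may be assigned. A sketch with hypothetical data (the exception type differs by pandas version, so both are caught):

    import numpy as np
    import pandas as pd

    cat = pd.Categorical(['a', 'b', 'a'], categories=['a', 'b'])
    cat[0] = 'b'        # allowed: 'b' is an existing category
    cat[1] = np.nan     # allowed: missing values can always be assigned

    try:
        cat[2] = 'c'    # not a category -> add it via add_categories first
    except (ValueError, TypeError) as err:
        print(err)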
@@ -1690,7 +1760,8 @@ def equals(self, other):
        -------
        are_equal : boolean
        """
-        return self.is_dtype_equal(other) and np.array_equal(self._codes, other._codes)
+        return (self.is_dtype_equal(other) and
+                np.array_equal(self._codes, other._codes))

    def is_dtype_equal(self, other):
        """
@@ -1707,7 +1778,8 @@ def is_dtype_equal(self, other):
        """

        try:
-            return self.categories.equals(other.categories) and self.ordered == other.ordered
+            return (self.categories.equals(other.categories) and
+                    self.ordered == other.ordered)
        except (AttributeError, TypeError):
            return False

@@ -1723,8 +1795,8 @@ def describe(self):
        freqs = counts / float(counts.sum())

        from pandas.tools.merge import concat
-        result = concat([counts,freqs],axis=1)
-        result.columns = ['counts','freqs']
+        result = concat([counts, freqs], axis=1)
+        result.columns = ['counts', 'freqs']
        result.index.name = 'categories'

        return result
@@ -1742,15 +1814,16 @@ def repeat(self, repeats):
        return Categorical(values=codes, categories=self.categories,
                           ordered=self.ordered, fastpath=True)

+# The Series.cat accessor

-##### The Series.cat accessor #####

class CategoricalAccessor(PandasDelegate, NoNewAttributesMixin):
    """
    Accessor object for categorical properties of the Series values.

-    Be aware that assigning to `categories` is a inplace operation, while all methods return
-    new categorical data per default (but can be called with `inplace=True`).
+    Be aware that assigning to `categories` is an inplace operation, while all
+    methods return new categorical data per default (but can be called with
+    `inplace=True`).

    Examples
    --------
@@ -1787,24 +1860,21 @@ def _delegate_method(self, name, *args, **kwargs):
        from pandas import Series
        method = getattr(self.categorical, name)
        res = method(*args, **kwargs)
-        if not res is None:
+        if res is not None:
            return Series(res, index=self.index)

+
CategoricalAccessor._add_delegate_accessors(delegate=Categorical,
-                                            accessors=["categories", "ordered"],
+                                            accessors=["categories",
+                                                       "ordered"],
                                            typ='property')
-CategoricalAccessor._add_delegate_accessors(delegate=Categorical,
-                                            accessors=["rename_categories",
-                                                       "reorder_categories",
-                                                       "add_categories",
-                                                       "remove_categories",
-                                                       "remove_unused_categories",
-                                                       "set_categories",
-                                                       "as_ordered",
-                                                       "as_unordered"],
-                                            typ='method')
-
-##### utility routines #####
+CategoricalAccessor._add_delegate_accessors(delegate=Categorical, accessors=[
+    "rename_categories", "reorder_categories", "add_categories",
+    "remove_categories", "remove_unused_categories", "set_categories",
+    "as_ordered", "as_unordered"], typ='method')
+
+# utility routines
+

def _get_codes_for_values(values, categories):
    """
@@ -1812,7 +1882,7 @@ def _get_codes_for_values(values, categories):
    """

    from pandas.core.algorithms import _get_data_algo, _hashtables
-    if not is_dtype_equal(values.dtype,categories.dtype):
+    if not is_dtype_equal(values.dtype, categories.dtype):
        values = _ensure_object(values)
        categories = _ensure_object(categories)

@@ -1822,13 +1892,14 @@ def _get_codes_for_values(values, categories):
    t.map_locations(cats)
    return _coerce_indexer_dtype(t.lookup(vals), cats)

+
def _convert_to_list_like(list_like):
    if hasattr(list_like, "dtype"):
        return list_like
    if isinstance(list_like, list):
        return list_like
-    if (is_sequence(list_like) or isinstance(list_like, tuple)
-        or isinstance(list_like, types.GeneratorType)):
+    if (is_sequence(list_like) or isinstance(list_like, tuple) or
+            isinstance(list_like, types.GeneratorType)):
        return list(list_like)
    elif np.isscalar(list_like):
        return [list_like]
@@ -1836,6 +1907,7 @@ def _convert_to_list_like(list_like):
    # is this reached?
    return [list_like]

+
def _concat_compat(to_concat, axis=0):
    """Concatenate an object/categorical array of arrays, each of which is a
    single dtype
@@ -1882,9 +1954,9 @@ def convert_categorical(x):
    if len(categoricals) == len(to_concat):
        # concating numeric types is much faster than concating object types
        # and fastpath takes a shorter path through the constructor
-        return Categorical(np.concatenate([x.codes for x in to_concat], axis=0),
-                           rawcats,
-                           ordered=categoricals[0].ordered,
+        return Categorical(np.concatenate([x.codes for x in to_concat],
+                                          axis=0),
+                           rawcats, ordered=categoricals[0].ordered,
                           fastpath=True)
    else:
        concatted = np.concatenate(list(map(convert_categorical, to_concat)),
diff --git a/pandas/core/common.py b/pandas/core/common.py
index 3d320199e04d9..b80b7eecaeb11 100644
--- a/pandas/core/common.py
+++ b/pandas/core/common.py
@@ -17,10 +17,13 @@
 import pandas.lib as lib
 import pandas.tslib as tslib
 from pandas import compat
-from pandas.compat import BytesIO, range, long, u, zip, map, string_types, iteritems
-from pandas.core.dtypes import CategoricalDtype, CategoricalDtypeType, DatetimeTZDtype, DatetimeTZDtypeType
+from pandas.compat import (BytesIO, range, long, u, zip, map, string_types,
+                           iteritems)
+from pandas.core.dtypes import (CategoricalDtype, CategoricalDtypeType,
+                                DatetimeTZDtype, DatetimeTZDtypeType)
 from pandas.core.config import get_option
 
+
 class PandasError(Exception):
     pass
 
@@ -41,70 +44,78 @@ class AbstractMethodError(NotImplementedError):
     """Raise this error instead of NotImplementedError for abstract methods
     while keeping compatibility with Python 2 and Python 3.
     """
+
     def __init__(self, class_instance):
         self.class_instance = class_instance
 
     def __str__(self):
-        return "This method must be defined in the concrete class of " \
-            + self.class_instance.__class__.__name__
+        return ("This method must be defined in the concrete class of %s" %
+                self.class_instance.__class__.__name__)
+
 
 _POSSIBLY_CAST_DTYPES = set([np.dtype(t).name
-                             for t in ['O', 'int8',
-                                       'uint8', 'int16', 'uint16', 'int32',
-                                       'uint32', 'int64', 'uint64']])
+                             for t in ['O', 'int8', 'uint8', 'int16', 'uint16',
+                                       'int32', 'uint32', 'int64', 'uint64']])
 
 _NS_DTYPE = np.dtype('M8[ns]')
 _TD_DTYPE = np.dtype('m8[ns]')
 _INT64_DTYPE = np.dtype(np.int64)
-_DATELIKE_DTYPES = set([np.dtype(t) for t in ['M8[ns]', 'M8[ns]',
-                                              'm8[ns]', 'm8[ns]']])
+_DATELIKE_DTYPES = set([np.dtype(t)
+                        for t in ['M8[ns]', 'M8[ns]',
+                                  'm8[ns]', 'm8[ns]']])
 _int8_max = np.iinfo(np.int8).max
 _int16_max = np.iinfo(np.int16).max
 _int32_max = np.iinfo(np.int32).max
 _int64_max = np.iinfo(np.int64).max
 
+
 # define abstract base classes to enable isinstance type checking on our
 # objects
 def create_pandas_abc_type(name, attr, comp):
     @classmethod
     def _check(cls, inst):
         return getattr(inst, attr, '_typ') in comp
-    dct = dict(__instancecheck__=_check,
-               __subclasscheck__=_check)
-    meta = type("ABCBase", (type,), dct)
+
+    dct = dict(__instancecheck__=_check, __subclasscheck__=_check)
+    meta = type("ABCBase", (type, ), dct)
     return meta(name, tuple(), dct)
 
-ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index",))
-ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", ("int64index",))
-ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", ("float64index",))
-ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", ("multiindex",))
-ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", ("datetimeindex",))
-ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", ("timedeltaindex",)) -ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", ("periodindex",)) -ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", ("categoricalindex",)) -ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", ("index", - "int64index", - "float64index", - "multiindex", - "datetimeindex", - "timedeltaindex", - "periodindex", - "categoricalindex")) - -ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series",)) -ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe",)) -ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel",)) +ABCIndex = create_pandas_abc_type("ABCIndex", "_typ", ("index", )) +ABCInt64Index = create_pandas_abc_type("ABCInt64Index", "_typ", + ("int64index", )) +ABCFloat64Index = create_pandas_abc_type("ABCFloat64Index", "_typ", + ("float64index", )) +ABCMultiIndex = create_pandas_abc_type("ABCMultiIndex", "_typ", + ("multiindex", )) +ABCDatetimeIndex = create_pandas_abc_type("ABCDatetimeIndex", "_typ", + ("datetimeindex", )) +ABCTimedeltaIndex = create_pandas_abc_type("ABCTimedeltaIndex", "_typ", + ("timedeltaindex", )) +ABCPeriodIndex = create_pandas_abc_type("ABCPeriodIndex", "_typ", + ("periodindex", )) +ABCCategoricalIndex = create_pandas_abc_type("ABCCategoricalIndex", "_typ", + ("categoricalindex", )) +ABCIndexClass = create_pandas_abc_type("ABCIndexClass", "_typ", + ("index", "int64index", "float64index", + "multiindex", "datetimeindex", + "timedeltaindex", "periodindex", + "categoricalindex")) + +ABCSeries = create_pandas_abc_type("ABCSeries", "_typ", ("series", )) +ABCDataFrame = create_pandas_abc_type("ABCDataFrame", "_typ", ("dataframe", )) +ABCPanel = create_pandas_abc_type("ABCPanel", "_typ", ("panel", )) ABCSparseSeries = create_pandas_abc_type("ABCSparseSeries", "_subtyp", ('sparse_series', 'sparse_time_series')) ABCSparseArray = create_pandas_abc_type("ABCSparseArray", "_subtyp", ('sparse_array', 'sparse_series')) -ABCCategorical = create_pandas_abc_type("ABCCategorical","_typ",("categorical")) -ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period",)) +ABCCategorical = create_pandas_abc_type("ABCCategorical", "_typ", + ("categorical")) +ABCPeriod = create_pandas_abc_type("ABCPeriod", "_typ", ("period", )) -class _ABCGeneric(type): +class _ABCGeneric(type): def __instancecheck__(cls, inst): return hasattr(inst, "_data") @@ -136,6 +147,7 @@ class to receive bound method else: setattr(cls, name, func) + def isnull(obj): """Detect missing values (NaN in numeric arrays, None/NaN in object arrays) @@ -198,6 +210,7 @@ def _isnull_old(obj): else: return obj is None + _isnull = _isnull_new @@ -263,6 +276,7 @@ def _isnull_ndarraylike(obj): return result + def _isnull_ndarraylike_old(obj): values = getattr(obj, 'values', obj) dtype = values.dtype @@ -316,6 +330,7 @@ def notnull(obj): return not res return ~res + def is_null_datelike_scalar(other): """ test whether the object is a null datelike, e.g. Nat but guard against passing a non-scalar """ @@ -324,18 +339,19 @@ def is_null_datelike_scalar(other): elif np.isscalar(other): # a timedelta - if hasattr(other,'dtype'): + if hasattr(other, 'dtype'): return other.view('i8') == tslib.iNaT elif is_integer(other) and other == tslib.iNaT: return True return isnull(other) return False + def array_equivalent(left, right, strict_nan=False): """ - True if two arrays, left and right, have equal non-NaN elements, and NaNs in - corresponding locations. 
False otherwise. It is assumed that left and right - are NumPy arrays of the same dtype. The behavior of this function + True if two arrays, left and right, have equal non-NaN elements, and NaNs + in corresponding locations. False otherwise. It is assumed that left and + right are NumPy arrays of the same dtype. The behavior of this function (particularly with respect to NaNs) is not defined if the dtypes are different. @@ -363,22 +379,25 @@ def array_equivalent(left, right, strict_nan=False): """ left, right = np.asarray(left), np.asarray(right) - if left.shape != right.shape: return False + if left.shape != right.shape: + return False # Object arrays can contain None, NaN and NaT. - if issubclass(left.dtype.type, np.object_) or issubclass(right.dtype.type, np.object_): + if (issubclass(left.dtype.type, np.object_) or + issubclass(right.dtype.type, np.object_)): if not strict_nan: # pd.isnull considers NaN and None to be equivalent. - return lib.array_equivalent_object(_ensure_object(left.ravel()), - _ensure_object(right.ravel())) + return lib.array_equivalent_object( + _ensure_object(left.ravel()), _ensure_object(right.ravel())) for left_value, right_value in zip(left, right): if left_value is tslib.NaT and right_value is not tslib.NaT: return False elif isinstance(left_value, float) and np.isnan(left_value): - if not isinstance(right_value, float) or not np.isnan(right_value): + if (not isinstance(right_value, float) or + not np.isnan(right_value)): return False else: if left_value != right_value: @@ -396,6 +415,7 @@ def array_equivalent(left, right, strict_nan=False): # NaNs cannot occur otherwise. return np.array_equal(left, right) + def _iterable_not_string(x): return (isinstance(x, collections.Iterable) and not isinstance(x, compat.string_types)) @@ -502,6 +522,7 @@ def wrapper(arr, indexer, out, fill_value=np.nan): if fill_wrap is not None: fill_value = fill_wrap(fill_value) f(arr, indexer, out, fill_value=fill_value) + return wrapper @@ -509,6 +530,7 @@ def _convert_wrapper(f, conv_dtype): def wrapper(arr, indexer, out, fill_value=np.nan): arr = arr.astype(conv_dtype) f(arr, indexer, out, fill_value=fill_value) + return wrapper @@ -569,15 +591,14 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): ('float32', 'float64'): algos.take_1d_float32_float64, ('float64', 'float64'): algos.take_1d_float64_float64, ('object', 'object'): algos.take_1d_object_object, - ('bool', 'bool'): - _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), - ('bool', 'object'): - _view_wrapper(algos.take_1d_bool_object, np.uint8, None), - ('datetime64[ns]', 'datetime64[ns]'): - _view_wrapper(algos.take_1d_int64_int64, np.int64, np.int64, np.int64) + ('bool', 'bool'): _view_wrapper(algos.take_1d_bool_bool, np.uint8, + np.uint8), + ('bool', 'object'): _view_wrapper(algos.take_1d_bool_object, np.uint8, + None), + ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper( + algos.take_1d_int64_int64, np.int64, np.int64, np.int64) } - _take_2d_axis0_dict = { ('int8', 'int8'): algos.take_2d_axis0_int8_int8, ('int8', 'int32'): algos.take_2d_axis0_int8_int32, @@ -596,16 +617,15 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): ('float32', 'float64'): algos.take_2d_axis0_float32_float64, ('float64', 'float64'): algos.take_2d_axis0_float64_float64, ('object', 'object'): algos.take_2d_axis0_object_object, - ('bool', 'bool'): - _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), - ('bool', 'object'): - _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, 
None), + ('bool', 'bool'): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, + np.uint8), + ('bool', 'object'): _view_wrapper(algos.take_2d_axis0_bool_object, + np.uint8, None), ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64) } - _take_2d_axis1_dict = { ('int8', 'int8'): algos.take_2d_axis1_int8_int8, ('int8', 'int32'): algos.take_2d_axis1_int8_int32, @@ -624,16 +644,15 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): ('float32', 'float64'): algos.take_2d_axis1_float32_float64, ('float64', 'float64'): algos.take_2d_axis1_float64_float64, ('object', 'object'): algos.take_2d_axis1_object_object, - ('bool', 'bool'): - _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), - ('bool', 'object'): - _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), + ('bool', 'bool'): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, + np.uint8), + ('bool', 'object'): _view_wrapper(algos.take_2d_axis1_bool_object, + np.uint8, None), ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64) } - _take_2d_multi_dict = { ('int8', 'int8'): algos.take_2d_multi_int8_int8, ('int8', 'int32'): algos.take_2d_multi_int8_int32, @@ -652,10 +671,10 @@ def _take_nd_generic(arr, indexer, out, axis, fill_value, mask_info): ('float32', 'float64'): algos.take_2d_multi_float32_float64, ('float64', 'float64'): algos.take_2d_multi_float64_float64, ('object', 'object'): algos.take_2d_multi_object_object, - ('bool', 'bool'): - _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), - ('bool', 'object'): - _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), + ('bool', 'bool'): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, + np.uint8), + ('bool', 'object'): _view_wrapper(algos.take_2d_multi_bool_object, + np.uint8, None), ('datetime64[ns]', 'datetime64[ns]'): _view_wrapper(algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64) @@ -689,13 +708,14 @@ def _get_take_nd_function(ndim, arr_dtype, out_dtype, axis=0, mask_info=None): def func(arr, indexer, out, fill_value=np.nan): indexer = _ensure_int64(indexer) - _take_nd_generic(arr, indexer, out, axis=axis, - fill_value=fill_value, mask_info=mask_info) + _take_nd_generic(arr, indexer, out, axis=axis, fill_value=fill_value, + mask_info=mask_info) + return func -def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, - mask_info=None, allow_fill=True): +def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, + allow_fill=True): """ Specialized Cython take which sets NaN values in one pass @@ -786,8 +806,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, else: out = np.empty(out_shape, dtype=dtype) - func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, - axis=axis, mask_info=mask_info) + func = _get_take_nd_function(arr.ndim, arr.dtype, out.dtype, axis=axis, + mask_info=mask_info) indexer = _ensure_int64(indexer) func(arr, indexer, out, fill_value) @@ -799,8 +819,8 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, take_1d = take_nd -def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, - mask_info=None, allow_fill=True): +def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, mask_info=None, + allow_fill=True): """ Specialized Cython take which sets NaN values in one pass """ @@ -858,12 +878,15 @@ def take_2d_multi(arr, indexer, out=None, fill_value=np.nan, if func 
is not None: func = _convert_wrapper(func, out.dtype) if func is None: + def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_generic(arr, indexer, out, - fill_value=fill_value, mask_info=mask_info) + _take_2d_multi_generic(arr, indexer, out, fill_value=fill_value, + mask_info=mask_info) + func(arr, indexer, out=out, fill_value=fill_value) return out + _diff_special = { 'float64': algos.diff_2d_float64, 'float32': algos.diff_2d_float32, @@ -873,6 +896,7 @@ def func(arr, indexer, out, fill_value=np.nan): 'int8': algos.diff_2d_int8, } + def diff(arr, n, axis=0): """ difference of n between self, analagoust to s-s.shift(n) """ @@ -931,10 +955,12 @@ def diff(arr, n, axis=0): if is_timedelta: from pandas import TimedeltaIndex - out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape(out_arr.shape).astype('timedelta64[ns]') + out_arr = TimedeltaIndex(out_arr.ravel().astype('int64')).asi8.reshape( + out_arr.shape).astype('timedelta64[ns]') return out_arr + def _coerce_indexer_dtype(indexer, categories): """ coerce the indexer input array to the smallest dtype possible """ l = len(categories) @@ -946,6 +972,7 @@ def _coerce_indexer_dtype(indexer, categories): return _ensure_int32(indexer) return _ensure_int64(indexer) + def _coerce_to_dtypes(result, dtypes): """ given a dtypes and a result set, coerce the result elements to the dtypes @@ -965,7 +992,7 @@ def conv(r, dtype): r = _coerce_scalar_to_timedelta_type(r) elif dtype == np.bool_: # messy. non 0/1 integers do not get converted. - if is_integer(r) and r not in [0,1]: + if is_integer(r) and r not in [0, 1]: return int(r) r = bool(r) elif dtype.kind == 'f': @@ -982,22 +1009,22 @@ def conv(r, dtype): def _infer_fill_value(val): """ - infer the fill value for the nan/NaT from the provided scalar/ndarray/list-like - if we are a NaT, return the correct dtyped element to provide proper block construction - + infer the fill value for the nan/NaT from the provided + scalar/ndarray/list-like if we are a NaT, return the correct dtyped + element to provide proper block construction """ if not is_list_like(val): val = [val] - val = np.array(val,copy=False) + val = np.array(val, copy=False) if is_datetimelike(val): - return np.array('NaT',dtype=val.dtype) + return np.array('NaT', dtype=val.dtype) elif is_object_dtype(val.dtype): dtype = lib.infer_dtype(_ensure_object(val)) - if dtype in ['datetime','datetime64']: - return np.array('NaT',dtype=_NS_DTYPE) - elif dtype in ['timedelta','timedelta64']: - return np.array('NaT',dtype=_TD_DTYPE) + if dtype in ['datetime', 'datetime64']: + return np.array('NaT', dtype=_NS_DTYPE) + elif dtype in ['timedelta', 'timedelta64']: + return np.array('NaT', dtype=_TD_DTYPE) return np.nan @@ -1025,12 +1052,13 @@ def _infer_dtype_from_scalar(val): dtype = np.object_ - elif isinstance(val, (np.datetime64, datetime)) and getattr(val,'tzinfo',None) is None: + elif isinstance(val, (np.datetime64, + datetime)) and getattr(val, 'tzinfo', None) is None: val = lib.Timestamp(val).value dtype = np.dtype('M8[ns]') elif isinstance(val, (np.timedelta64, timedelta)): - val = tslib.convert_to_timedelta(val,'ns') + val = tslib.convert_to_timedelta(val, 'ns') dtype = np.dtype('m8[ns]') elif is_bool(val): @@ -1252,7 +1280,9 @@ def _possibly_downcast_to_dtype(result, dtype): if np.isscalar(result): return result - trans = lambda x: x + def trans(x): + return x + if isinstance(dtype, compat.string_types): if dtype == 'infer': inferred_type = lib.infer_dtype(_ensure_object(result.ravel())) @@ -1269,8 +1299,9 @@ def 
_possibly_downcast_to_dtype(result, dtype): elif inferred_type == 'floating': dtype = 'int64' if issubclass(result.dtype.type, np.number): - trans = lambda x: x.round() + def trans(x): + return x.round() else: dtype = 'object' @@ -1281,7 +1312,8 @@ def _possibly_downcast_to_dtype(result, dtype): # don't allow upcasts here (except if empty) if dtype.kind == result.dtype.kind: - if result.dtype.itemsize <= dtype.itemsize and np.prod(result.shape): + if (result.dtype.itemsize <= dtype.itemsize and + np.prod(result.shape)): return result if issubclass(dtype.type, np.floating): @@ -1317,7 +1349,7 @@ def _possibly_downcast_to_dtype(result, dtype): return new_result # a datetimelike - elif dtype.kind in ['M','m'] and result.dtype.kind in ['i']: + elif dtype.kind in ['M', 'm'] and result.dtype.kind in ['i']: try: result = result.astype(dtype) except: @@ -1339,7 +1371,7 @@ def _maybe_convert_string_to_object(values): if isinstance(values, string_types): values = np.array([values], dtype=object) elif (isinstance(values, np.ndarray) and - issubclass(values.dtype.type, (np.string_, np.unicode_))): + issubclass(values.dtype.type, (np.string_, np.unicode_))): values = values.astype(object) return values @@ -1386,9 +1418,9 @@ def _fill_zeros(result, x, y, name, fill): return result if name.startswith(('r', '__r')): - x,y = y,x + x, y = y, x - is_typed_variable = (hasattr(y, 'dtype') or hasattr(y,'type')) + is_typed_variable = (hasattr(y, 'dtype') or hasattr(y, 'type')) is_scalar = lib.isscalar(y) if not is_typed_variable and not is_scalar: @@ -1433,30 +1465,32 @@ def _consensus_name_attr(objs): return None return name - -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Lots of little utilities + def _validate_date_like_dtype(dtype): try: typ = np.datetime_data(dtype)[0] except ValueError as e: raise TypeError('%s' % e) if typ != 'generic' and typ != 'ns': - raise ValueError('%r is too specific of a frequency, try passing %r' - % (dtype.name, dtype.type.__name__)) + raise ValueError('%r is too specific of a frequency, try passing %r' % + (dtype.name, dtype.type.__name__)) def _invalidate_string_dtypes(dtype_set): - """Change string like dtypes to object for ``DataFrame.select_dtypes()``.""" + """Change string like dtypes to object for + ``DataFrame.select_dtypes()``. + """ non_string_dtypes = dtype_set - _string_dtypes if non_string_dtypes != dtype_set: raise TypeError("string dtypes are not allowed, use 'object' instead") def _get_dtype_from_object(dtype): - """Get a numpy dtype.type-style object. This handles the - datetime64[ns] and datetime64[ns, TZ] compat + """Get a numpy dtype.type-style object. 
This handles the datetime64[ns] + and datetime64[ns, TZ] compat Notes ----- @@ -1523,6 +1557,7 @@ def _maybe_box_datetimelike(value): return value + _values_from_object = lib.values_from_object @@ -1569,39 +1604,43 @@ def _possibly_cast_to_datetime(value, dtype, errors='raise'): if is_datetime64 or is_datetime64tz or is_timedelta64: # force the dtype if needed - if is_datetime64 and not is_dtype_equal(dtype,_NS_DTYPE): + if is_datetime64 and not is_dtype_equal(dtype, _NS_DTYPE): if dtype.name == 'datetime64[ns]': dtype = _NS_DTYPE else: - raise TypeError( - "cannot convert datetimelike to dtype [%s]" % dtype) + raise TypeError("cannot convert datetimelike to " + "dtype [%s]" % dtype) elif is_datetime64tz: pass - elif is_timedelta64 and not is_dtype_equal(dtype,_TD_DTYPE): + elif is_timedelta64 and not is_dtype_equal(dtype, _TD_DTYPE): if dtype.name == 'timedelta64[ns]': dtype = _TD_DTYPE else: - raise TypeError( - "cannot convert timedeltalike to dtype [%s]" % dtype) + raise TypeError("cannot convert timedeltalike to " + "dtype [%s]" % dtype) if np.isscalar(value): if value == tslib.iNaT or isnull(value): value = tslib.iNaT else: - value = np.array(value,copy=False) + value = np.array(value, copy=False) # have a scalar array-like (e.g. NaT) if value.ndim == 0: value = tslib.iNaT # we have an array of datetime or timedeltas & nulls - elif np.prod(value.shape) or not is_dtype_equal(value.dtype, dtype): + elif np.prod(value.shape) or not is_dtype_equal(value.dtype, + dtype): try: if is_datetime64: value = to_datetime(value, errors=errors)._values elif is_datetime64tz: - # input has to be UTC at this point, so just localize - value = to_datetime(value, errors=errors).tz_localize(dtype.tz) + # input has to be UTC at this point, so just + # localize + value = to_datetime( + value, + errors=errors).tz_localize(dtype.tz) elif is_timedelta64: value = to_timedelta(value, errors=errors)._values except (AttributeError, ValueError): @@ -1670,7 +1709,7 @@ def _possibly_infer_to_datetimelike(value, convert_dates=False): v = value if not is_list_like(v): v = [v] - v = np.array(v,copy=False) + v = np.array(v, copy=False) shape = v.shape if not v.ndim == 1: v = v.ravel() @@ -1684,8 +1723,8 @@ def _try_datetime(v): except ValueError: # we might have a sequence of the same-datetimes with tz's - # if so coerce to a DatetimeIndex; if they are not the same, then - # these stay as object dtype + # if so coerce to a DatetimeIndex; if they are not the same, + # then these stay as object dtype try: from pandas import to_datetime return to_datetime(v) @@ -1708,17 +1747,18 @@ def _try_timedelta(v): return v # do a quick inference for perf - sample = v[:min(3,len(v))] + sample = v[:min(3, len(v))] inferred_type = lib.infer_dtype(sample) - if inferred_type in ['datetime', 'datetime64'] or (convert_dates and inferred_type in ['date']): + if (inferred_type in ['datetime', 'datetime64'] or + (convert_dates and inferred_type in ['date'])): value = _try_datetime(v) elif inferred_type in ['timedelta', 'timedelta64']: value = _try_timedelta(v) - # its possible to have nulls intermixed within the datetime or timedelta - # these will in general have an inferred_type of 'mixed', so have to try - # both datetime and timedelta + # It's possible to have nulls intermixed within the datetime or + # timedelta. These will in general have an inferred_type of 'mixed', + # so have to try both datetime and timedelta. # try timedelta first to avoid spurious datetime conversions # e.g. 
'00:00:01' is a timedelta but technically is also a datetime @@ -1758,7 +1798,7 @@ def is_bool_indexer(key): def _default_index(n): from pandas.core.index import Int64Index values = np.arange(n, dtype=np.int64) - result = Int64Index(values,name=None) + result = Int64Index(values, name=None) result.is_unique = True return result @@ -1785,6 +1825,7 @@ def _mut_exclusive(**kwargs): def _not_none(*args): return (arg for arg in args if arg is not None) + def _any_none(*args): for arg in args: if arg is None: @@ -1810,12 +1851,10 @@ def _try_sort(iterable): def _count_not_none(*args): return sum(x is not None for x in args) -#------------------------------------------------------------------------------ +# ----------------------------------------------------------------------------- # miscellaneous python tools - - def adjoin(space, *lists, **kwargs): """ Glues together two sets of strings using the amount of space requested. @@ -1850,6 +1889,7 @@ def adjoin(space, *lists, **kwargs): out_lines.append(_join_unicode(lines)) return _join_unicode(out_lines, sep='\n') + def _justify(texts, max_len, mode='right'): """ Perform ljust, center, rjust against string or list-like @@ -1861,6 +1901,7 @@ def _justify(texts, max_len, mode='right'): else: return [x.rjust(max_len) for x in texts] + def _join_unicode(lines, sep=''): try: return sep.join(lines) @@ -1874,7 +1915,7 @@ def iterpairs(seq): """ Parameters ---------- - seq: sequence + seq : sequence Returns ------- @@ -1933,7 +1974,6 @@ def _long_prod(vals): class groupby(dict): - """ A simple groupby different from the one in itertools. @@ -1945,6 +1985,7 @@ def __init__(self, seq, key=lambda x: x): for value in seq: k = key(value) self.setdefault(k, []).append(value) + try: __iter__ = dict.iteritems except AttributeError: # pragma: no cover @@ -1986,8 +2027,7 @@ def intersection(*seqs): def _asarray_tuplesafe(values, dtype=None): from pandas.core.index import Index - if not (isinstance(values, (list, tuple)) - or hasattr(values, '__array__')): + if not (isinstance(values, (list, tuple)) or hasattr(values, '__array__')): values = list(values) elif isinstance(values, Index): return values.values @@ -2036,25 +2076,21 @@ def _maybe_make_list(obj): return [obj] return obj -######################## -##### TYPE TESTING ##### -######################## +# TYPE TESTING is_bool = lib.is_bool - is_integer = lib.is_integer - is_float = lib.is_float - is_complex = lib.is_complex def is_string_like(obj): return isinstance(obj, (compat.text_type, compat.string_types)) + def is_iterator(obj): # python 3 generators have __next__ instead of next return hasattr(obj, 'next') or hasattr(obj, '__next__') @@ -2063,6 +2099,7 @@ def is_iterator(obj): def is_number(obj): return isinstance(obj, (numbers.Number, np.number)) + def is_period_arraylike(arr): """ return if we are period arraylike / PeriodIndex """ if isinstance(arr, pd.PeriodIndex): @@ -2071,6 +2108,7 @@ def is_period_arraylike(arr): return arr.dtype == object and lib.infer_dtype(arr) == 'period' return getattr(arr, 'inferred_type', None) == 'period' + def is_datetime_arraylike(arr): """ return if we are datetime arraylike / DatetimeIndex """ if isinstance(arr, ABCDatetimeIndex): @@ -2079,8 +2117,11 @@ def is_datetime_arraylike(arr): return arr.dtype == object and lib.infer_dtype(arr) == 'datetime' return getattr(arr, 'inferred_type', None) == 'datetime' + def is_datetimelike(arr): - return arr.dtype in _DATELIKE_DTYPES or isinstance(arr, ABCPeriodIndex) or is_datetimetz(arr) + return (arr.dtype in _DATELIKE_DTYPES or 
isinstance(arr, ABCPeriodIndex) or + is_datetimetz(arr)) + def _coerce_to_dtype(dtype): """ coerce a string / np.dtype to a dtype """ @@ -2092,6 +2133,7 @@ def _coerce_to_dtype(dtype): dtype = np.dtype(dtype) return dtype + def _get_dtype(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype @@ -2111,6 +2153,7 @@ def _get_dtype(arr_or_dtype): arr_or_dtype = arr_or_dtype.dtype return np.dtype(arr_or_dtype) + def _get_dtype_type(arr_or_dtype): if isinstance(arr_or_dtype, np.dtype): return arr_or_dtype.type @@ -2131,6 +2174,7 @@ def _get_dtype_type(arr_or_dtype): except AttributeError: return type(None) + def is_dtype_equal(source, target): """ return a boolean if the dtypes are equal """ try: @@ -2143,6 +2187,7 @@ def is_dtype_equal(source, target): # object == category will hit this return False + def is_any_int_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.integer) @@ -2153,15 +2198,18 @@ def is_integer_dtype(arr_or_dtype): return (issubclass(tipo, np.integer) and not issubclass(tipo, (np.datetime64, np.timedelta64))) + def is_int64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.int64) + def is_int_or_datetime_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return (issubclass(tipo, np.integer) or issubclass(tipo, (np.datetime64, np.timedelta64))) + def is_datetime64_dtype(arr_or_dtype): try: tipo = _get_dtype_type(arr_or_dtype) @@ -2169,11 +2217,15 @@ def is_datetime64_dtype(arr_or_dtype): return False return issubclass(tipo, np.datetime64) + def is_datetime64tz_dtype(arr_or_dtype): return DatetimeTZDtype.is_dtype(arr_or_dtype) + def is_datetime64_any_dtype(arr_or_dtype): - return is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype) + return (is_datetime64_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) + def is_datetime64_ns_dtype(arr_or_dtype): try: @@ -2182,6 +2234,7 @@ def is_datetime64_ns_dtype(arr_or_dtype): return False return tipo == _NS_DTYPE + def is_timedelta64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.timedelta64) @@ -2215,62 +2268,77 @@ def is_numeric_v_string_like(a, b): is_a_scalar_string_like = not is_a_array and is_string_like(a) is_b_scalar_string_like = not is_b_array and is_string_like(b) - return ( - is_a_numeric_array and is_b_scalar_string_like) or ( - is_b_numeric_array and is_a_scalar_string_like) or ( - is_a_numeric_array and is_b_string_array) or ( - is_b_numeric_array and is_a_string_array - ) + return ((is_a_numeric_array and is_b_scalar_string_like) or + (is_b_numeric_array and is_a_scalar_string_like) or + (is_a_numeric_array and is_b_string_array) or + (is_b_numeric_array and is_a_string_array)) + def is_datetimelike_v_numeric(a, b): - # return if we have an i8 convertible and numeric comparision - if not hasattr(a,'dtype'): + # return if we have an i8 convertible and numeric comparison + if not hasattr(a, 'dtype'): a = np.asarray(a) if not hasattr(b, 'dtype'): b = np.asarray(b) - is_numeric = lambda x: is_integer_dtype(x) or is_float_dtype(x) + + def is_numeric(x): + return is_integer_dtype(x) or is_float_dtype(x) + is_datetimelike = needs_i8_conversion - return (is_datetimelike(a) and is_numeric(b)) or ( - is_datetimelike(b) and is_numeric(a)) + return ((is_datetimelike(a) and is_numeric(b)) or + (is_datetimelike(b) and is_numeric(a))) + def is_datetimelike_v_object(a, b): - # return if we have an i8 convertible and object comparision - if not hasattr(a,'dtype'): + # return 
if we have an i8 convertible and object comparsion + if not hasattr(a, 'dtype'): a = np.asarray(a) if not hasattr(b, 'dtype'): b = np.asarray(b) - f = lambda x: is_object_dtype(x) - is_object = lambda x: is_integer_dtype(x) or is_float_dtype(x) + + def f(x): + return is_object_dtype(x) + + def is_object(x): + return is_integer_dtype(x) or is_float_dtype(x) + is_datetimelike = needs_i8_conversion - return (is_datetimelike(a) and is_object(b)) or ( - is_datetimelike(b) and is_object(a)) + return ((is_datetimelike(a) and is_object(b)) or + (is_datetimelike(b) and is_object(a))) + + +def needs_i8_conversion(arr_or_dtype): + return (is_datetime_or_timedelta_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)) -needs_i8_conversion = lambda arr_or_dtype: is_datetime_or_timedelta_dtype(arr_or_dtype) or \ - is_datetime64tz_dtype(arr_or_dtype) def i8_boxer(arr_or_dtype): """ return the scalar boxer for the dtype """ - if is_datetime64_dtype(arr_or_dtype) or is_datetime64tz_dtype(arr_or_dtype): + if (is_datetime64_dtype(arr_or_dtype) or + is_datetime64tz_dtype(arr_or_dtype)): return lib.Timestamp elif is_timedelta64_dtype(arr_or_dtype): - return lambda x: lib.Timedelta(x,unit='ns') + return lambda x: lib.Timedelta(x, unit='ns') raise ValueError("cannot find a scalar boxer for {0}".format(arr_or_dtype)) + def is_numeric_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, (np.number, np.bool_)) - and not issubclass(tipo, (np.datetime64, np.timedelta64))) + return (issubclass(tipo, (np.number, np.bool_)) and + not issubclass(tipo, (np.datetime64, np.timedelta64))) def is_string_dtype(arr_or_dtype): dtype = _get_dtype(arr_or_dtype) return dtype.kind in ('O', 'S', 'U') + def is_string_like_dtype(arr_or_dtype): # exclude object as its a mixed dtype dtype = _get_dtype(arr_or_dtype) return dtype.kind in ('S', 'U') + def is_float_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.floating) @@ -2289,13 +2357,18 @@ def is_bool_dtype(arr_or_dtype): return False return issubclass(tipo, np.bool_) + def is_sparse(array): """ return if we are a sparse array """ return isinstance(array, (ABCSparseArray, ABCSparseSeries)) + def is_datetimetz(array): """ return if we are a datetime with tz array """ - return (isinstance(array, ABCDatetimeIndex) and getattr(array,'tz',None) is not None) or is_datetime64tz_dtype(array) + return ((isinstance(array, ABCDatetimeIndex) and + getattr(array, 'tz', None) is not None) or + is_datetime64tz_dtype(array)) + def is_internal_type(value): """ @@ -2310,13 +2383,16 @@ def is_internal_type(value): return True return False + def is_categorical(array): """ return if we are a categorical possibility """ return isinstance(array, ABCCategorical) or is_categorical_dtype(array) + def is_categorical_dtype(arr_or_dtype): return CategoricalDtype.is_dtype(arr_or_dtype) + def is_complex_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) return issubclass(tipo, np.complexfloating) @@ -2341,21 +2417,25 @@ def is_re_compilable(obj): def is_list_like(arg): - return (hasattr(arg, '__iter__') and + return (hasattr(arg, '__iter__') and not isinstance(arg, compat.string_and_binary_types)) + def is_named_tuple(arg): return isinstance(arg, tuple) and hasattr(arg, '_fields') + def is_null_slice(obj): """ we have a null slice """ return (isinstance(obj, slice) and obj.start is None and obj.stop is None and obj.step is None) + def is_full_slice(obj, l): """ we have a full length slice """ - return (isinstance(obj, slice) and 
obj.start == 0 and - obj.stop == l and obj.step is None) + return (isinstance(obj, slice) and obj.start == 0 and obj.stop == l and + obj.step is None) + def is_hashable(arg): """Return True if hash(arg) will succeed, False otherwise. @@ -2414,10 +2494,10 @@ def _get_callable_name(obj): # distinguishing between no name and a name of '' return None + _string_dtypes = frozenset(map(_get_dtype_from_object, (compat.binary_type, compat.text_type))) - _ensure_float64 = algos.ensure_float64 _ensure_float32 = algos.ensure_float32 _ensure_int64 = algos.ensure_int64 @@ -2456,7 +2536,7 @@ def _astype_nansafe(arr, dtype, copy=True): # in py3, timedelta64[ns] are int64 elif ((compat.PY3 and dtype not in [_INT64_DTYPE, _TD_DTYPE]) or - (not compat.PY3 and dtype != _TD_DTYPE)): + (not compat.PY3 and dtype != _TD_DTYPE)): # allow frequency conversions if dtype.kind == 'm': @@ -2524,11 +2604,13 @@ def get_dtype_kinds(l): typs.add(typ) return typs + def _concat_compat(to_concat, axis=0): """ provide concatenation of an array of arrays each of which is a single - 'normalized' dtypes (in that for example, if its object, then it is a non-datetimelike - provde a combined dtype for the resulting array the preserves the overall dtype if possible) + 'normalized' dtypes (in that for example, if it's object, then it is a + non-datetimelike and provide a combined dtype for the resulting array that + preserves the overall dtype if possible) Parameters ---------- @@ -2547,6 +2629,7 @@ def is_nonempty(x): return x.shape[axis] > 0 except Exception: return True + nonempty = [x for x in to_concat if is_nonempty(x)] # If all arrays are empty, there's nothing to convert, just short-cut to @@ -2572,20 +2655,22 @@ def is_nonempty(x): return _concat_compat(to_concat, axis=axis) if not nonempty: - - # we have all empties, but may need to coerce the result dtype to object if we - # have non-numeric type operands (numpy would otherwise cast this to float) + # we have all empties, but may need to coerce the result dtype to + # object if we have non-numeric type operands (numpy would otherwise + # cast this to float) typs = get_dtype_kinds(to_concat) if len(typs) != 1: - if not len(typs-set(['i','u','f'])) or not len(typs-set(['bool','i','u'])): + if (not len(typs - set(['i', 'u', 'f'])) or + not len(typs - set(['bool', 'i', 'u']))): # let numpy coerce pass else: # coerce to object - to_concat = [ x.astype('object') for x in to_concat ] + to_concat = [x.astype('object') for x in to_concat] + + return np.concatenate(to_concat, axis=axis) - return np.concatenate(to_concat,axis=axis) def _where_compat(mask, arr1, arr2): if arr1.dtype == _NS_DTYPE and arr2.dtype == _NS_DTYPE: @@ -2600,6 +2685,7 @@ def _where_compat(mask, arr1, arr2): return np.where(mask, arr1, arr2) + def _dict_compat(d): """ Helper function to convert datetimelike-keyed dicts to Timestamp-keyed dict @@ -2613,20 +2699,23 @@ def _dict_compat(d): dict """ - return dict((_maybe_box_datetimelike(key), value) for key, value in iteritems(d)) + return dict((_maybe_box_datetimelike(key), value) + for key, value in iteritems(d)) -def sentinel_factory(): +def sentinel_factory(): class Sentinel(object): pass return Sentinel() + def in_interactive_session(): """ check if we're running in an interactive shell returns True if running under python/ipython interactive shell """ + def check_main(): import __main__ as main return (not hasattr(main, '__file__') or @@ -2648,8 +2737,7 @@ def in_qtconsole(): ip = get_ipython() front_end = ( ip.config.get('KernelApp', 
{}).get('parent_appname', "") or - ip.config.get('IPKernelApp', {}).get('parent_appname', "") - ) + ip.config.get('IPKernelApp', {}).get('parent_appname', "")) if 'qtconsole' in front_end.lower(): return True except: @@ -2668,8 +2756,7 @@ def in_ipnb(): ip = get_ipython() front_end = ( ip.config.get('KernelApp', {}).get('parent_appname', "") or - ip.config.get('IPKernelApp', {}).get('parent_appname', "") - ) + ip.config.get('IPKernelApp', {}).get('parent_appname', "")) if 'notebook' in front_end.lower(): return True except: @@ -2738,7 +2825,8 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): s = iter(seq) r = [] for i in range(min(nitems, len(seq))): # handle sets, no slicing - r.append(pprint_thing(next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) + r.append(pprint_thing( + next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) body = ", ".join(r) if nitems < len(seq): @@ -2765,8 +2853,11 @@ def _pprint_dict(seq, _nest_lvl=0, max_seq_items=None, **kwds): nitems = max_seq_items or get_option("max_seq_items") or len(seq) for k, v in list(seq.items())[:nitems]: - pairs.append(pfmt % (pprint_thing(k, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds), - pprint_thing(v, _nest_lvl + 1, max_seq_items=max_seq_items, **kwds))) + pairs.append(pfmt % + (pprint_thing(k, _nest_lvl + 1, + max_seq_items=max_seq_items, **kwds), + pprint_thing(v, _nest_lvl + 1, + max_seq_items=max_seq_items, **kwds))) if nitems < len(seq): return fmt % (", ".join(pairs) + ", ...") @@ -2802,6 +2893,7 @@ def pprint_thing(thing, _nest_lvl=0, escape_chars=None, default_escapes=False, result - unicode object on py2, str on py3. Always Unicode. """ + def as_escaped_unicode(thing, escape_chars=escape_chars): # Unicode is fine, else we try to decode using utf-8 and 'replace' # if that's not it either, we have no way of knowing and the user @@ -2813,10 +2905,7 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): # either utf-8 or we replace errors result = str(thing).decode('utf-8', "replace") - translate = {'\t': r'\t', - '\n': r'\n', - '\r': r'\r', - } + translate = {'\t': r'\t', '\n': r'\n', '\r': r'\r', } if isinstance(escape_chars, dict): if default_escapes: translate.update(escape_chars) @@ -2834,11 +2923,13 @@ def as_escaped_unicode(thing, escape_chars=escape_chars): return compat.text_type(thing) elif (isinstance(thing, dict) and _nest_lvl < get_option("display.pprint_nest_depth")): - result = _pprint_dict(thing, _nest_lvl, quote_strings=True, max_seq_items=max_seq_items) - elif is_sequence(thing) and _nest_lvl < \ - get_option("display.pprint_nest_depth"): + result = _pprint_dict(thing, _nest_lvl, quote_strings=True, + max_seq_items=max_seq_items) + elif (is_sequence(thing) and + _nest_lvl < get_option("display.pprint_nest_depth")): result = _pprint_seq(thing, _nest_lvl, escape_chars=escape_chars, - quote_strings=quote_strings, max_seq_items=max_seq_items) + quote_strings=quote_strings, + max_seq_items=max_seq_items) elif isinstance(thing, compat.string_types) and quote_strings: if compat.PY3: fmt = "'%s'" @@ -2864,8 +2955,8 @@ def console_encode(object, **kwds): set in display.encoding. Use this everywhere where you output to the console. 
""" - return pprint_thing_encoded(object, - get_option("display.encoding")) + return pprint_thing_encoded(object, get_option("display.encoding")) + def _maybe_match_name(a, b): a_has = hasattr(a, 'name') @@ -2881,6 +2972,7 @@ def _maybe_match_name(a, b): return b.name return None + def _random_state(state=None): """ Helper function for processing random_state arguments. @@ -2906,4 +2998,5 @@ def _random_state(state=None): elif state is None: return np.random.RandomState() else: - raise ValueError("random_state must be an integer, a numpy RandomState, or None") + raise ValueError("random_state must be an integer, a numpy " + "RandomState, or None") diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b27c4268796dd..273166db12142 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -23,17 +23,15 @@ import numpy as np import numpy.ma as ma -from pandas.core.common import (isnull, notnull, PandasError, _try_sort, - _default_index, _maybe_upcast, is_sequence, - _infer_dtype_from_scalar, _values_from_object, - is_list_like, _maybe_box_datetimelike, - is_categorical_dtype, is_object_dtype, - is_internal_type, is_datetimetz, - _possibly_infer_to_datetimelike, _dict_compat) +from pandas.core.common import ( + isnull, notnull, PandasError, _try_sort, _default_index, _maybe_upcast, + is_sequence, _infer_dtype_from_scalar, _values_from_object, is_list_like, + _maybe_box_datetimelike, is_categorical_dtype, is_object_dtype, + is_internal_type, is_datetimetz, _possibly_infer_to_datetimelike, + _dict_compat) from pandas.core.generic import NDFrame, _shared_docs from pandas.core.index import Index, MultiIndex, _ensure_index -from pandas.core.indexing import (maybe_droplevels, - convert_to_index_sliceable, +from pandas.core.indexing import (maybe_droplevels, convert_to_index_sliceable, check_bool_indexer) from pandas.core.internals import (BlockManager, create_block_manager_from_arrays, @@ -43,11 +41,11 @@ import pandas.computation.expressions as expressions from pandas.computation.eval import eval as _eval from numpy import percentile as _quantile -from pandas.compat import(range, map, zip, lrange, lmap, lzip, StringIO, u, - OrderedDict, raise_with_traceback) +from pandas.compat import (range, map, zip, lrange, lmap, lzip, StringIO, u, + OrderedDict, raise_with_traceback) from pandas import compat -from pandas.util.decorators import (deprecate, Appender, - Substitution, deprecate_kwarg) +from pandas.util.decorators import (deprecate, Appender, Substitution, + deprecate_kwarg) from pandas.tseries.period import PeriodIndex from pandas.tseries.index import DatetimeIndex @@ -59,6 +57,7 @@ import pandas.core.format as fmt import pandas.core.nanops as nanops import pandas.core.ops as ops +import pandas.tools.plotting as gfx import pandas.lib as lib import pandas.algos as _algos @@ -66,7 +65,7 @@ from pandas.core.config import get_option from pandas import _np_version_under1p9 -# ---------------------------------------------------------------------- +# --------------------------------------------------------------------- # Docstring templates _shared_doc_kwargs = dict(axes='index, columns', klass='DataFrame', @@ -156,12 +155,11 @@ of DataFrame. """ -#---------------------------------------------------------------------- +# ----------------------------------------------------------------------- # DataFrame class class DataFrame(NDFrame): - """ Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). 
Arithmetic operations align on both row and column labels. Can be thought of as a dict-like @@ -359,11 +357,9 @@ def _init_dict(self, data, index, columns, dtype=None): columns = data_names = Index(keys) arrays = [data[k] for k in keys] - return _arrays_to_mgr(arrays, data_names, index, columns, - dtype=dtype) + return _arrays_to_mgr(arrays, data_names, index, columns, dtype=dtype) - def _init_ndarray(self, values, index, columns, dtype=None, - copy=False): + def _init_ndarray(self, values, index, columns, dtype=None, copy=False): # input must be a ndarray, list, Series, index if isinstance(values, Series): @@ -396,20 +392,20 @@ def _get_axes(N, K, index=index, columns=columns): # we could have a categorical type passed or coerced to 'category' # recast this to an _arrays_to_mgr - if is_categorical_dtype(getattr(values,'dtype',None)) or is_categorical_dtype(dtype): + if (is_categorical_dtype(getattr(values, 'dtype', None)) or + is_categorical_dtype(dtype)): - if not hasattr(values,'dtype'): + if not hasattr(values, 'dtype'): values = _prep_ndarray(values, copy=copy) values = values.ravel() elif copy: values = values.copy() - index, columns = _get_axes(len(values),1) - return _arrays_to_mgr([ values ], columns, index, columns, + index, columns = _get_axes(len(values), 1) + return _arrays_to_mgr([values], columns, index, columns, dtype=dtype) elif is_datetimetz(values): - return self._init_dict({ 0 : values }, index, columns, - dtype=dtype) + return self._init_dict({0: values}, index, columns, dtype=dtype) # by definition an array here # the dtypes will be coerced to a single dtype @@ -449,7 +445,7 @@ def shape(self): """ Return a tuple representing the dimensionality of the DataFrame. """ - return (len(self.index), len(self.columns)) + return len(self.index), len(self.columns) def _repr_fits_vertical_(self): """ @@ -478,9 +474,9 @@ def _repr_fits_horizontal_(self, ignore_width=False): ((not ignore_width) and width and nb_columns > (width // 2))): return False - if (ignore_width # used by repr_html under IPython notebook - # scripts ignore terminal dims - or not com.in_interactive_session()): + # used by repr_html under IPython notebook or scripts ignore terminal + # dims + if ignore_width or not com.in_interactive_session(): return True if (get_option('display.width') is not None or @@ -514,9 +510,8 @@ def _repr_fits_horizontal_(self, ignore_width=False): def _info_repr(self): """True if the repr should show the info view.""" info_repr_option = (get_option("display.large_repr") == "info") - return info_repr_option and not ( - self._repr_fits_horizontal_() and self._repr_fits_vertical_() - ) + return info_repr_option and not (self._repr_fits_horizontal_() and + self._repr_fits_vertical_()) def __unicode__(self): """ @@ -551,8 +546,8 @@ def _repr_html_(self): # behaves badly when outputting an HTML table # that doesn't fit the window, so disable it. # XXX: In IPython 3.x and above, the Qt console will not attempt to - # display HTML, so this check can be removed when support for IPython 2.x - # is no longer needed. + # display HTML, so this check can be removed when support for + # IPython 2.x is no longer needed. if com.in_qtconsole(): # 'HTML output is disabled in QtConsole' return None @@ -561,8 +556,8 @@ def _repr_html_(self): buf = StringIO(u("")) self.info(buf=buf) # need to escape the <class>, should be the first line. - val = buf.getvalue().replace('<', r'&lt;', 1).replace('>', - r'&gt;', 1) + val = buf.getvalue().replace('<', r'&lt;', 1) + val = val.replace('>', r'&gt;', 1) return '<div>' + val + '</div>
' if get_option("display.notebook_repr_html"): @@ -571,8 +566,7 @@ def _repr_html_(self): show_dimensions = get_option("display.show_dimensions") return self.to_html(max_rows=max_rows, max_cols=max_cols, - show_dimensions=show_dimensions, - notebook=True) + show_dimensions=show_dimensions, notebook=True) else: return None @@ -602,8 +596,8 @@ def iteritems(self): See also -------- - iterrows : Iterate over the rows of a DataFrame as (index, Series) pairs. - itertuples : Iterate over the rows of a DataFrame as namedtuples of the values. + iterrows : Iterate over DataFrame rows as (index, Series) pairs. + itertuples : Iterate over DataFrame rows as namedtuples of the values. """ if self.columns.is_unique and hasattr(self, '_item_cache'): @@ -611,11 +605,11 @@ def iteritems(self): yield k, self._get_item_cache(k) else: for i, k in enumerate(self.columns): - yield k, self._ixs(i,axis=1) + yield k, self._ixs(i, axis=1) def iterrows(self): """ - Iterate over the rows of a DataFrame as (index, Series) pairs. + Iterate over DataFrame rows as (index, Series) pairs. Notes ----- @@ -651,7 +645,7 @@ def iterrows(self): See also -------- - itertuples : Iterate over the rows of a DataFrame as namedtuples of the values. + itertuples : Iterate over DataFrame rows as namedtuples of the values. iteritems : Iterate over (column name, Series) pairs. """ @@ -662,15 +656,16 @@ def iterrows(self): def itertuples(self, index=True, name="Pandas"): """ - Iterate over the rows of DataFrame as namedtuples, with index value - as first element of the tuple. + Iterate over DataFrame rows as namedtuples, with index value as first + element of the tuple. Parameters ---------- index : boolean, default True If True, return the index as the first element of the tuple. name : string, default "Pandas" - The name of the returned namedtuples or None to return regular tuples. + The name of the returned namedtuples or None to return regular + tuples. Notes ----- @@ -680,13 +675,14 @@ def itertuples(self, index=True, name="Pandas"): See also -------- - iterrows : Iterate over the rows of a DataFrame as (index, Series) pairs. + iterrows : Iterate over DataFrame rows as (index, Series) pairs. iteritems : Iterate over (column name, Series) pairs. 
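A side note on the iteration trade-off these cross-references describe: iterrows boxes every row into a Series, which upcasts mixed dtypes, while itertuples hands back per-column values unchanged. A hedged illustration, constructed for this note rather than taken from the patch (the doctest-style Examples resume below):

    import pandas as pd

    df = pd.DataFrame({'i': [1], 'f': [0.5]}, columns=['i', 'f'])
    row = next(df.iterrows())[1]      # a Series; the int is upcast to float64
    print(row['i'])                   # 1.0
    tup = next(iter(df.itertuples(index=False)))
    print(tup[0])                     # 1, still an integer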
Examples -------- - >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]}, index=['a', 'b']) + >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [0.1, 0.2]}, + index=['a', 'b']) >>> df col1 col2 a 1 0.1 @@ -712,8 +708,9 @@ def itertuples(self, index=True, name="Pandas"): if name is not None and len(self.columns) + index < 256: # `rename` is unsupported in Python 2.6 try: - itertuple = collections.namedtuple( - name, fields+list(self.columns), rename=True) + itertuple = collections.namedtuple(name, + fields + list(self.columns), + rename=True) return map(itertuple._make, zip(*arrays)) except Exception: pass @@ -759,8 +756,7 @@ def dot(self, other): (lvals.shape, rvals.shape)) if isinstance(other, DataFrame): - return self._constructor(np.dot(lvals, rvals), - index=left.index, + return self._constructor(np.dot(lvals, rvals), index=left.index, columns=other.columns) elif isinstance(other, Series): return Series(np.dot(lvals, rvals), index=left.index) @@ -773,7 +769,7 @@ def dot(self, other): else: # pragma: no cover raise TypeError('unsupported type: %s' % type(other)) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # IO methods (to / from other formats) @classmethod @@ -847,12 +843,15 @@ def to_dict(self, orient='dict'): elif orient.lower().startswith('sp'): return {'index': self.index.tolist(), 'columns': self.columns.tolist(), - 'data': lib.map_infer(self.values.ravel(), _maybe_box_datetimelike) + 'data': lib.map_infer(self.values.ravel(), + _maybe_box_datetimelike) .reshape(self.values.shape).tolist()} elif orient.lower().startswith('s'): - return dict((k, _maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) + return dict((k, _maybe_box_datetimelike(v)) + for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [dict((k, _maybe_box_datetimelike(v)) for k, v in zip(self.columns, row)) + return [dict((k, _maybe_box_datetimelike(v)) + for k, v in zip(self.columns, row)) for row in self.values] elif orient.lower().startswith('i'): return dict((k, v.to_dict()) for k, v in self.iterrows()) @@ -890,8 +889,8 @@ def to_gbq(self, destination_table, project_id, chunksize=10000, from pandas.io import gbq return gbq.to_gbq(self, destination_table, project_id=project_id, - chunksize=chunksize, verbose=verbose, - reauth=reauth, if_exists=if_exists) + chunksize=chunksize, verbose=verbose, reauth=reauth, + if_exists=if_exists) @classmethod def from_records(cls, data, index=None, exclude=None, columns=None, @@ -1015,8 +1014,7 @@ def from_records(cls, data, index=None, exclude=None, columns=None, arr_columns = arr_columns.drop(arr_exclude) columns = columns.drop(exclude) - mgr = _arrays_to_mgr(arrays, arr_columns, result_index, - columns) + mgr = _arrays_to_mgr(arrays, arr_columns, result_index, columns) return cls(mgr) @@ -1126,11 +1124,12 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): return cls(mgr) @classmethod - def from_csv(cls, path, header=0, sep=',', index_col=0, - parse_dates=True, encoding=None, tupleize_cols=False, + def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True, + encoding=None, tupleize_cols=False, infer_datetime_format=False): """ - Read CSV file (DISCOURAGED, please use :func:`pandas.read_csv` instead). + Read CSV file (DISCOURAGED, please use :func:`pandas.read_csv` + instead). 
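Since the from_csv docstring starting above (and continuing below) steers readers to read_csv, here is a hedged sketch of the equivalent call, mirroring the defaults in the new signature; the file name is hypothetical:

    import pandas as pd

    # roughly what DataFrame.from_csv('data.csv') does with its defaults
    df = pd.read_csv('data.csv', header=0, sep=',', index_col=0,
                     parse_dates=True)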
It is preferable to use the more powerful :func:`pandas.read_csv` for most general purposes, but ``from_csv`` makes for an easy @@ -1197,8 +1196,8 @@ def to_sparse(self, fill_value=None, kind='block'): y : SparseDataFrame """ from pandas.core.sparse import SparseDataFrame - return SparseDataFrame(self._series, index=self.index, columns=self.columns, - default_kind=kind, + return SparseDataFrame(self._series, index=self.index, + columns=self.columns, default_kind=kind, default_fill_value=fill_value) def to_panel(self): @@ -1246,7 +1245,8 @@ def to_panel(self): # create new manager new_mgr = selfsorted._data.reshape_nd(axes=new_axes, - labels=[major_labels, minor_labels], + labels=[major_labels, + minor_labels], shape=shape, ref_items=selfsorted.columns) @@ -1316,26 +1316,25 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, date_format : string, default None Format string for datetime objects decimal: string, default '.' - Character recognized as decimal separator. E.g. use ',' for European data + Character recognized as decimal separator. E.g. use ',' for + European data .. versionadded:: 0.16.0 """ formatter = fmt.CSVFormatter(self, path_or_buf, - line_terminator=line_terminator, - sep=sep, encoding=encoding, - compression=compression, - quoting=quoting, na_rep=na_rep, - float_format=float_format, cols=columns, - header=header, index=index, + line_terminator=line_terminator, sep=sep, + encoding=encoding, + compression=compression, quoting=quoting, + na_rep=na_rep, float_format=float_format, + cols=columns, header=header, index=index, index_label=index_label, mode=mode, chunksize=chunksize, quotechar=quotechar, engine=kwds.get("engine"), tupleize_cols=tupleize_cols, date_format=date_format, doublequote=doublequote, - escapechar=escapechar, - decimal=decimal) + escapechar=escapechar, decimal=decimal) formatter.save() if path_or_buf is None: @@ -1344,8 +1343,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep='inf', - verbose=True): + merge_cells=True, encoding=None, inf_rep='inf', verbose=True): """ Write DataFrame to a excel sheet @@ -1410,12 +1408,9 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', excel_writer = ExcelWriter(excel_writer, engine=engine) need_save = True - formatter = fmt.ExcelFormatter(self, - na_rep=na_rep, - cols=columns, + formatter = fmt.ExcelFormatter(self, na_rep=na_rep, cols=columns, header=header, - float_format=float_format, - index=index, + float_format=float_format, index=index, index_label=index_label, merge_cells=merge_cells, inf_rep=inf_rep) @@ -1425,9 +1420,9 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', if need_save: excel_writer.save() - def to_stata( - self, fname, convert_dates=None, write_index=True, encoding="latin-1", - byteorder=None, time_stamp=None, data_label=None): + def to_stata(self, fname, convert_dates=None, write_index=True, + encoding="latin-1", byteorder=None, time_stamp=None, + data_label=None): """ A class for writing Stata binary dta files from array-like objects @@ -1464,10 +1459,10 @@ def to_stata( writer.write_file() @Appender(fmt.docstring_to_string, indents=1) - def to_string(self, buf=None, columns=None, col_space=None, - header=True, index=True, na_rep='NaN', formatters=None, - float_format=None, sparsify=None, 
index_names=True, - justify=None, line_width=None, max_rows=None, max_cols=None, + def to_string(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, justify=None, + line_width=None, max_rows=None, max_cols=None, show_dimensions=False): """ Render a DataFrame to a console-friendly tabular output. @@ -1477,8 +1472,7 @@ def to_string(self, buf=None, columns=None, col_space=None, col_space=col_space, na_rep=na_rep, formatters=formatters, float_format=float_format, - sparsify=sparsify, - justify=justify, + sparsify=sparsify, justify=justify, index_names=index_names, header=header, index=index, line_width=line_width, @@ -1527,12 +1521,10 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, col_space=col_space, na_rep=na_rep, formatters=formatters, float_format=float_format, - sparsify=sparsify, - justify=justify, + sparsify=sparsify, justify=justify, index_names=index_names, header=header, index=index, - bold_rows=bold_rows, - escape=escape, + bold_rows=bold_rows, escape=escape, max_rows=max_rows, max_cols=max_cols, show_dimensions=show_dimensions) @@ -1546,8 +1538,8 @@ def to_html(self, buf=None, columns=None, col_space=None, colSpace=None, def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, header=True, index=True, na_rep='NaN', formatters=None, float_format=None, sparsify=None, index_names=True, - bold_rows=True, column_format=None, - longtable=None, escape=None, encoding=None): + bold_rows=True, column_format=None, longtable=None, + escape=None, encoding=None): """ Render a DataFrame to a tabular environment table. You can splice this into a LaTeX document. Requires \\usepackage{booktabs}. @@ -1558,7 +1550,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, Make the row labels bold in the output column_format : str, default None The columns format as specified in `LaTeX table format - `__ e.g 'rcl' for 3 columns + `__ e.g 'rcl' for 3 + columns longtable : boolean, default will be read from the pandas config module default: False Use a longtable environment instead of tabular. Requires adding @@ -1596,7 +1589,8 @@ def to_latex(self, buf=None, columns=None, col_space=None, colSpace=None, if buf is None: return formatter.buf.getvalue() - def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): + def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, + null_counts=None): """ Concise summary of a DataFrame. @@ -1619,9 +1613,10 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co human-readable units (base-2 representation). null_counts : boolean, default None Whether to show the non-null counts - If None, then only show if the frame is smaller than max_info_rows and max_info_columns. - If True, always show counts. - If False, never show counts. + - If None, then only show if the frame is smaller than + max_info_rows and max_info_columns. + - If True, always show counts. + - If False, never show counts. 
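A hedged usage sketch of the info() options itemized above (toy frame; the exact report depends on the display options in effect):

    import pandas as pd

    df = pd.DataFrame({'a': range(5), 'b': list('abcde')})
    df.info(null_counts=True)       # force per-column non-null counts
    df.info(memory_usage='deep')    # introspect object values for real sizes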
""" from pandas.core.format import _put_lines @@ -1643,8 +1638,8 @@ def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_co # hack if max_cols is None: - max_cols = get_option( - 'display.max_info_columns', len(self.columns) + 1) + max_cols = get_option('display.max_info_columns', + len(self.columns) + 1) max_rows = get_option('display.max_info_rows', len(self) + 1) @@ -1665,8 +1660,8 @@ def _verbose_repr(): if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover - raise AssertionError('Columns must equal counts (%d != %d)' % - (len(cols), len(counts))) + raise AssertionError('Columns must equal counts (%d != %d)' + % (len(cols), len(counts))) tmpl = "%s non-null %s" dtypes = self.dtypes @@ -1678,8 +1673,7 @@ def _verbose_repr(): if show_counts: count = counts.iloc[i] - lines.append(_put_str(col, space) + - tmpl % (count, dtype)) + lines.append(_put_str(col, space) + tmpl % (count, dtype)) def _non_verbose_repr(): lines.append(self.columns.summary(name='Columns')) @@ -1712,18 +1706,17 @@ def _sizeof_fmt(num, size_qualifier): # append memory usage of df to display size_qualifier = '' if memory_usage == 'deep': - deep=True + deep = True else: - # size_qualifier is just a best effort; not guaranteed to catch all - # cases (e.g., it misses categorical data even with object + # size_qualifier is just a best effort; not guaranteed to catch + # all cases (e.g., it misses categorical data even with object # categories) - deep=False + deep = False if 'object' in counts or is_object_dtype(self.index): size_qualifier = '+' mem_usage = self.memory_usage(index=True, deep=deep).sum() lines.append("memory usage: %s\n" % - _sizeof_fmt(mem_usage, size_qualifier) - ) + _sizeof_fmt(mem_usage, size_qualifier)) _put_lines(buf, lines) def memory_usage(self, index=True, deep=False): @@ -1754,11 +1747,11 @@ def memory_usage(self, index=True, deep=False): -------- numpy.ndarray.nbytes """ - result = Series([ c.memory_usage(index=False, deep=deep) for col, c in self.iteritems() ], - index=self.columns) + result = Series([c.memory_usage(index=False, deep=deep) + for col, c in self.iteritems()], index=self.columns) if index: - result = Series(self.index.memory_usage(deep=deep), - index=['Index']).append(result) + result = Series(self.index.memory_usage(deep=deep), + index=['Index']).append(result) return result def transpose(self): @@ -1767,7 +1760,7 @@ def transpose(self): T = property(transpose) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Picklability # legacy pickle formats @@ -1795,15 +1788,13 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover if object_state is not None: ovals, _, ocols = object_state objects = DataFrame(ovals, index=index, - columns=_unpickle_array(ocols), - copy=False) + columns=_unpickle_array(ocols), copy=False) dm = dm.join(objects) self._data = dm._data - #---------------------------------------------------------------------- - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Getting and setting elements def get_value(self, index, col, takeable=False): @@ -1888,7 +1879,6 @@ def _ixs(self, i, axis=0): # irow if axis == 0: - """ Notes ----- @@ -1902,14 +1892,15 @@ def _ixs(self, i, axis=0): if isinstance(label, Index): # a location index by definition result = self.take(i, axis=axis) - copy=True + copy = True else: 
new_values = self._data.fast_xs(i) if lib.isscalar(new_values): return new_values # if we are a copy, mark as such - copy = isinstance(new_values,np.ndarray) and new_values.base is None + copy = (isinstance(new_values, np.ndarray) and + new_values.base is None) result = Series(new_values, index=self.columns, name=self.index[i], dtype=new_values.dtype) result._set_is_copy(self, copy=copy) @@ -1917,7 +1908,6 @@ def _ixs(self, i, axis=0): # icol else: - """ Notes ----- @@ -1943,9 +1933,10 @@ def _ixs(self, i, axis=0): if index_len and not len(values): values = np.array([np.nan] * index_len, dtype=object) - result = self._constructor_sliced.from_array( - values, index=self.index, - name=label, fastpath=True) + result = self._constructor_sliced.from_array(values, + index=self.index, + name=label, + fastpath=True) # this is a cached value, mark it so result._set_as_cached(label, self) @@ -2035,14 +2026,17 @@ def _getitem_multilevel(self, key): else: new_values = self.values[:, loc] result = self._constructor(new_values, index=self.index, - columns=result_columns).__finalize__(self) + columns=result_columns) + result = result.__finalize__(self) if len(result.columns) == 1: top = result.columns[0] if ((type(top) == str and top == '') or (type(top) == tuple and top[0] == '')): result = result[''] if isinstance(result, Series): - result = self._constructor_sliced(result, index=self.index, name=key) + result = self._constructor_sliced(result, + index=self.index, + name=key) result._set_is_copy(self) return result @@ -2274,16 +2268,15 @@ def select_dtypes(self, include=None, exclude=None): 'nonempty') # convert the myriad valid dtypes object to a single representation - include, exclude = map(lambda x: - frozenset(map(com._get_dtype_from_object, x)), - selection) + include, exclude = map( + lambda x: frozenset(map(com._get_dtype_from_object, x)), selection) for dtypes in (include, exclude): com._invalidate_string_dtypes(dtypes) # can't both include AND exclude! 
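        # for illustration (hypothetical frame ``df``): passing the same
        # entry to both sides, e.g.
        #     df.select_dtypes(include=['number'], exclude=['number'])
        # trips the overlap check below, while disjoint selections such as
        #     df.select_dtypes(include=['number'], exclude=['object'])
        # pass straight through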
if not include.isdisjoint(exclude): - raise ValueError('include and exclude overlap on %s' - % (include & exclude)) + raise ValueError('include and exclude overlap on %s' % + (include & exclude)) # empty include/exclude -> defaults to True # three cases (we've already raised if both are empty) @@ -2381,16 +2374,15 @@ def _ensure_valid_index(self, value): """ # GH5632, make sure that we are a Series convertible if not len(self.index) and is_list_like(value): - try: - value = Series(value) - except: - raise ValueError('Cannot set a frame with no defined index ' - 'and a value that cannot be converted to a ' - 'Series') - - self._data = self._data.reindex_axis(value.index.copy(), axis=1, - fill_value=np.nan) + try: + value = Series(value) + except: + raise ValueError('Cannot set a frame with no defined index ' + 'and a value that cannot be converted to a ' + 'Series') + self._data = self._data.reindex_axis(value.index.copy(), axis=1, + fill_value=np.nan) def _set_item(self, key, value): """ @@ -2429,8 +2421,8 @@ def insert(self, loc, column, value, allow_duplicates=False): """ self._ensure_valid_index(value) value = self._sanitize_column(column, value) - self._data.insert( - loc, column, value, allow_duplicates=allow_duplicates) + self._data.insert(loc, column, value, + allow_duplicates=allow_duplicates) def assign(self, **kwargs): """ @@ -2558,7 +2550,7 @@ def reindexer(value): elif isinstance(value, Categorical): value = value.copy() - elif (isinstance(value, Index) or is_sequence(value)): + elif isinstance(value, Index) or is_sequence(value): from pandas.core.series import _sanitize_index # turn me into an ndarray @@ -2589,8 +2581,8 @@ def reindexer(value): # broadcast across multiple columns if necessary if key in self.columns and value.ndim == 1: - if not self.columns.is_unique or isinstance(self.columns, - MultiIndex): + if (not self.columns.is_unique or + isinstance(self.columns, MultiIndex)): existing_piece = self[key] if isinstance(existing_piece, DataFrame): value = np.tile(value, (len(existing_piece.columns), 1)) @@ -2656,11 +2648,11 @@ def lookup(self, row_labels, col_labels): return result - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Reindexing and alignment - def _reindex_axes(self, axes, level, limit, tolerance, method, - fill_value, copy): + def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, + copy): frame = self columns = axes['columns'] @@ -2706,7 +2698,7 @@ def _reindex_multi(self, axes, copy, fill_value): return self._constructor(new_values, index=new_index, columns=new_columns) else: - return self._reindex_with_indexers({0: [new_index, row_indexer], + return self._reindex_with_indexers({0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, copy=copy, fill_value=fill_value) @@ -2715,9 +2707,12 @@ def _reindex_multi(self, axes, copy, fill_value): def align(self, other, join='outer', axis=None, level=None, copy=True, fill_value=None, method=None, limit=None, fill_axis=0, broadcast_axis=None): - return super(DataFrame, self).align(other, join=join, axis=axis, level=level, copy=copy, - fill_value=fill_value, method=method, limit=limit, - fill_axis=fill_axis, broadcast_axis=broadcast_axis) + return super(DataFrame, self).align(other, join=join, axis=axis, + level=level, copy=copy, + fill_value=fill_value, + method=method, limit=limit, + fill_axis=fill_axis, + broadcast_axis=broadcast_axis) @Appender(_shared_docs['reindex'] % 
_shared_doc_kwargs) def reindex(self, index=None, columns=None, **kwargs): @@ -2727,10 +2722,10 @@ def reindex(self, index=None, columns=None, **kwargs): @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) def reindex_axis(self, labels, axis=0, method=None, level=None, copy=True, limit=None, fill_value=np.nan): - return super(DataFrame, self).reindex_axis(labels=labels, axis=axis, - method=method, level=level, - copy=copy, limit=limit, - fill_value=fill_value) + return super(DataFrame, + self).reindex_axis(labels=labels, axis=axis, + method=method, level=level, copy=copy, + limit=limit, fill_value=fill_value) @Appender(_shared_docs['rename'] % _shared_doc_kwargs) def rename(self, index=None, columns=None, **kwargs): @@ -2740,10 +2735,10 @@ def rename(self, index=None, columns=None, **kwargs): @Appender(_shared_docs['fillna'] % _shared_doc_kwargs) def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None, **kwargs): - return super(DataFrame, self).fillna(value=value, method=method, - axis=axis, inplace=inplace, - limit=limit, downcast=downcast, - **kwargs) + return super(DataFrame, + self).fillna(value=value, method=method, axis=axis, + inplace=inplace, limit=limit, + downcast=downcast, **kwargs) @Appender(_shared_docs['shift'] % _shared_doc_kwargs) def shift(self, periods=1, freq=None, axis=0): @@ -2880,8 +2875,7 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, def _maybe_casted_values(index, labels=None): if isinstance(index, PeriodIndex): values = index.asobject.values - elif (isinstance(index, DatetimeIndex) and - index.tz is not None): + elif isinstance(index, DatetimeIndex) and index.tz is not None: values = index else: values = index.values @@ -2893,11 +2887,11 @@ def _maybe_casted_values(index, labels=None): mask = labels == -1 values = values.take(labels) if mask.any(): - values, changed = com._maybe_upcast_putmask(values, - mask, np.nan) + values, changed = com._maybe_upcast_putmask(values, mask, + np.nan) return values - new_index = np.arange(len(new_obj),dtype='int64') + new_index = np.arange(len(new_obj), dtype='int64') if isinstance(self.index, MultiIndex): if level is not None: if not isinstance(level, (tuple, list)): @@ -2918,8 +2912,7 @@ def _maybe_casted_values(index, labels=None): if multi_col: if col_fill is None: - col_name = tuple([col_name] * - self.columns.nlevels) + col_name = tuple([col_name] * self.columns.nlevels) else: name_lst = [col_fill] * self.columns.nlevels lev_num = self.columns._get_level_number(col_level) @@ -2950,8 +2943,7 @@ def _maybe_casted_values(index, labels=None): if not inplace: return new_obj - - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Reindex-based selection methods def dropna(self, axis=0, how='any', thresh=None, subset=None, @@ -2982,8 +2974,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, if isinstance(axis, (tuple, list)): result = self for ax in axis: - result = result.dropna(how=how, thresh=thresh, - subset=subset, axis=ax) + result = result.dropna(how=how, thresh=thresh, subset=subset, + axis=ax) else: axis = self._get_axis_number(axis) agg_axis = 1 - axis @@ -2994,8 +2986,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, indices = ax.get_indexer_for(subset) check = indices == -1 if check.any(): - raise KeyError(list(np.compress(check,subset))) - agg_obj = self.take(indices,axis=agg_axis) + raise 
KeyError(list(np.compress(check, subset))) + agg_obj = self.take(indices, axis=agg_axis) count = agg_obj.count(axis=agg_axis) @@ -3018,7 +3010,8 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, else: return result - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', + False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset', stacklevel=3) def drop_duplicates(self, subset=None, keep='first', inplace=False): """ @@ -3052,7 +3045,8 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): else: return self[-duplicated] - @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', False: 'first'}) + @deprecate_kwarg('take_last', 'keep', mapping={True: 'last', + False: 'first'}) @deprecate_kwarg(old_arg_name='cols', new_arg_name='subset', stacklevel=3) def duplicated(self, subset=None, keep='first'): """ @@ -3082,23 +3076,24 @@ def duplicated(self, subset=None, keep='first'): from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT def f(vals): - labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT)) - return labels.astype('i8',copy=False), len(shape) + labels, shape = factorize(vals, size_hint=min(len(self), + _SIZE_HINT_LIMIT)) + return labels.astype('i8', copy=False), len(shape) if subset is None: subset = self.columns - elif not np.iterable(subset) or \ - isinstance(subset, compat.string_types) or \ - isinstance(subset, tuple) and subset in self.columns: + elif (not np.iterable(subset) or + isinstance(subset, compat.string_types) or + isinstance(subset, tuple) and subset in self.columns): subset = subset, vals = (self[col].values for col in subset) - labels, shape = map(list, zip( * map(f, vals))) + labels, shape = map(list, zip(*map(f, vals))) ids = get_group_index(labels, shape, sort=False, xnull=False) return Series(duplicated_int64(ids, keep), index=self.index) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Sorting @Appender(_shared_docs['sort_values'] % _shared_doc_kwargs) @@ -3106,16 +3101,14 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'): axis = self._get_axis_number(axis) - labels = self._get_axis(axis) if axis != 0: - raise ValueError('When sorting by column, axis must be 0 ' - '(rows)') + raise ValueError('When sorting by column, axis must be 0 (rows)') if not isinstance(by, list): by = [by] if com.is_sequence(ascending) and len(by) != len(ascending): - raise ValueError('Length of ascending (%d) != length of by' - ' (%d)' % (len(ascending), len(by))) + raise ValueError('Length of ascending (%d) != length of by (%d)' % + (len(ascending), len(by))) if len(by) > 1: from pandas.core.groupby import _lexsort_indexer @@ -3123,11 +3116,13 @@ def trans(v): if com.needs_i8_conversion(v): return v.view('i8') return v + keys = [] for x in by: k = self[x].values if k.ndim == 2: - raise ValueError('Cannot sort by duplicate column %s' % str(x)) + raise ValueError('Cannot sort by duplicate column %s' % + str(x)) keys.append(trans(k)) indexer = _lexsort_indexer(keys, orders=ascending, na_position=na_position) @@ -3141,19 +3136,20 @@ def trans(v): # try to be helpful if isinstance(self.columns, MultiIndex): - raise ValueError('Cannot sort by column %s in a multi-index' - ' you need to explicity provide all the levels' - % str(by)) + raise ValueError('Cannot sort by column 
%s in a ' + 'multi-index you need to explicitly ' + 'provide all the levels' % str(by)) - raise ValueError('Cannot sort by duplicate column %s' - % str(by)) + raise ValueError('Cannot sort by duplicate column %s' % + str(by)) if isinstance(ascending, (tuple, list)): ascending = ascending[0] indexer = _nargsort(k, kind=kind, ascending=ascending, na_position=na_position) - new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), + new_data = self._data.take(indexer, + axis=self._get_block_manager_axis(axis), convert=False, verify=False) if inplace: @@ -3161,8 +3157,8 @@ def trans(v): else: return self._constructor(new_data).__finalize__(self) - def sort(self, columns=None, axis=0, ascending=True, - inplace=False, kind='quicksort', na_position='last'): + def sort(self, columns=None, axis=0, ascending=True, inplace=False, + kind='quicksort', na_position='last'): """ DEPRECATED: use :meth:`DataFrame.sort_values` @@ -3183,7 +3179,8 @@ def sort(self, columns=None, axis=0, ascending=True, inplace : boolean, default False Sort the DataFrame without creating a new instance kind : {'quicksort', 'mergesort', 'heapsort'}, optional - This option is only applied when sorting on a single column or label. + This option is only applied when sorting on a single column or + label. na_position : {'first', 'last'} (optional, default='last') 'first' puts NaNs at the beginning 'last' puts NaNs at the end @@ -3200,25 +3197,28 @@ def sort(self, columns=None, axis=0, ascending=True, if columns is None: warnings.warn("sort(....) is deprecated, use sort_index(.....)", FutureWarning, stacklevel=2) - return self.sort_index(axis=axis, ascending=ascending, inplace=inplace) + return self.sort_index(axis=axis, ascending=ascending, + inplace=inplace) - warnings.warn("sort(columns=....) is deprecated, use sort_values(by=.....)", - FutureWarning, stacklevel=2) + warnings.warn("sort(columns=....)
is deprecated, use " + "sort_values(by=.....)", FutureWarning, stacklevel=2) return self.sort_values(by=columns, axis=axis, ascending=ascending, - inplace=inplace, kind=kind, na_position=na_position) + inplace=inplace, kind=kind, + na_position=na_position) @Appender(_shared_docs['sort_index'] % _shared_doc_kwargs) def sort_index(self, axis=0, level=None, ascending=True, inplace=False, - kind='quicksort', na_position='last', sort_remaining=True, by=None): + kind='quicksort', na_position='last', sort_remaining=True, + by=None): # 10726 if by is not None: - warnings.warn("by argument to sort_index is deprecated, pls use .sort_values(by=...)", - FutureWarning, stacklevel=2) + warnings.warn("by argument to sort_index is deprecated, please use " + ".sort_values(by=...)", FutureWarning, stacklevel=2) if level is not None: raise ValueError("unable to simultaneously sort by and level") - return self.sort_values(by, axis=axis, ascending=ascending, inplace=inplace) - + return self.sort_values(by, axis=axis, ascending=ascending, + inplace=inplace) axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -3243,9 +3243,10 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, from pandas.core.groupby import _nargsort # GH11080 - Check monotonic-ness before sort an index - # if monotonic (already sorted), return None or copy() according to 'inplace' - if (ascending and labels.is_monotonic_increasing) or \ - (not ascending and labels.is_monotonic_decreasing): + # if monotonic (already sorted), return None or copy() according + # to 'inplace' + if ((ascending and labels.is_monotonic_increasing) or + (not ascending and labels.is_monotonic_decreasing)): if inplace: return else: @@ -3254,7 +3255,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, indexer = _nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) - new_data = self._data.take(indexer, axis=self._get_block_manager_axis(axis), + new_data = self._data.take(indexer, + axis=self._get_block_manager_axis(axis), convert=False, verify=False) if inplace: @@ -3262,8 +3264,8 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, else: return self._constructor(new_data).__finalize__(self) - def sortlevel(self, level=0, axis=0, ascending=True, - inplace=False, sort_remaining=True): + def sortlevel(self, level=0, axis=0, ascending=True, inplace=False, + sort_remaining=True): """ Sort multilevel index by chosen axis and primary level.
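        For example (a minimal sketch with a hypothetical two-level index)::

            import pandas as pd
            idx = pd.MultiIndex.from_product([['b', 'a'], [2, 1]])
            df = pd.DataFrame({'v': range(4)}, index=idx)
            df.sortlevel(level=0)                        # sort by the outer level
            df.sortlevel(level=1, sort_remaining=False)  # only the inner level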
Data will be lexicographically sorted by the chosen level followed by the other @@ -3291,7 +3293,6 @@ def sortlevel(self, level=0, axis=0, ascending=True, return self.sort_index(level=level, axis=axis, ascending=ascending, inplace=inplace, sort_remaining=sort_remaining) - def _nsorted(self, columns, n, method, keep): if not com.is_list_like(columns): columns = [columns] @@ -3421,7 +3422,7 @@ def reorder_levels(self, order, axis=0): result.columns = result.columns.reorder_levels(order) return result - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Arithmetic / combination related def _combine_frame(self, other, func, fill_value=None, level=None): @@ -3473,18 +3474,21 @@ def f(i): else: result = _arith_op(this.values, other.values) - return self._constructor(result, index=new_index, - columns=new_columns, copy=False) + return self._constructor(result, index=new_index, columns=new_columns, + copy=False) def _combine_series(self, other, func, fill_value=None, axis=None, level=None): if axis is not None: axis = self._get_axis_name(axis) if axis == 'index': - return self._combine_match_index(other, func, level=level, fill_value=fill_value) + return self._combine_match_index(other, func, level=level, + fill_value=fill_value) else: - return self._combine_match_columns(other, func, level=level, fill_value=fill_value) - return self._combine_series_infer(other, func, level=level, fill_value=fill_value) + return self._combine_match_columns(other, func, level=level, + fill_value=fill_value) + return self._combine_series_infer(other, func, level=level, + fill_value=fill_value) def _combine_series_infer(self, other, func, level=None, fill_value=None): if len(other) == 0: @@ -3495,48 +3499,56 @@ def _combine_series_infer(self, other, func, level=None, fill_value=None): return self._constructor(data=self._series, index=self.index, columns=self.columns) - return self._combine_match_columns(other, func, level=level, fill_value=fill_value) + return self._combine_match_columns(other, func, level=level, + fill_value=fill_value) def _combine_match_index(self, other, func, level=None, fill_value=None): - left, right = self.align(other, join='outer', axis=0, level=level, copy=False) + left, right = self.align(other, join='outer', axis=0, level=level, + copy=False) if fill_value is not None: raise NotImplementedError("fill_value %r not supported." 
% fill_value) return self._constructor(func(left.values.T, right.values).T, - index=left.index, - columns=self.columns, copy=False) + index=left.index, columns=self.columns, + copy=False) def _combine_match_columns(self, other, func, level=None, fill_value=None): - left, right = self.align(other, join='outer', axis=1, level=level, copy=False) + left, right = self.align(other, join='outer', axis=1, level=level, + copy=False) if fill_value is not None: raise NotImplementedError("fill_value %r not supported" % fill_value) - new_data = left._data.eval( - func=func, other=right, axes=[left.columns, self.index]) + new_data = left._data.eval(func=func, other=right, + axes=[left.columns, self.index]) return self._constructor(new_data) def _combine_const(self, other, func, raise_on_error=True): if self.empty: return self - new_data = self._data.eval(func=func, other=other, raise_on_error=raise_on_error) + new_data = self._data.eval(func=func, other=other, + raise_on_error=raise_on_error) return self._constructor(new_data) def _compare_frame_evaluate(self, other, func, str_rep): # unique if self.columns.is_unique: + def _compare(a, b): return dict([(col, func(a[col], b[col])) for col in a.columns]) + new_data = expressions.evaluate(_compare, str_rep, self, other) return self._constructor(data=new_data, index=self.index, columns=self.columns, copy=False) # non-unique else: + def _compare(a, b): return dict([(i, func(a.iloc[:, i], b.iloc[:, i])) for i, col in enumerate(a.columns)]) + new_data = expressions.evaluate(_compare, str_rep, self, other) result = self._constructor(data=new_data, index=self.index, copy=False) @@ -3640,8 +3652,7 @@ def combine(self, other, func, fill_value=None, overwrite=True): result[col] = arr # convert_objects just in case - return self._constructor(result, - index=new_index, + return self._constructor(result, index=new_index, columns=new_columns)._convert(datetime=True, copy=False) @@ -3666,6 +3677,7 @@ def combine_first(self, other): ------- combined : DataFrame """ + def combiner(x, y, needs_i8_conversion=False): x_values = x.values if hasattr(x, 'values') else x y_values = y.values if hasattr(y, 'values') else y @@ -3730,10 +3742,10 @@ def update(self, other, join='left', overwrite=True, filter_func=None, else: mask = notnull(this) - self[col] = expressions.where( - mask, this, that, raise_on_error=True) + self[col] = expressions.where(mask, this, that, + raise_on_error=True) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Misc methods def first_valid_index(self): @@ -3748,7 +3760,7 @@ def last_valid_index(self): """ return self.index[self.count(1) > 0][-1] - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Data reshaping def pivot(self, index=None, columns=None, values=None): @@ -3899,7 +3911,7 @@ def unstack(self, level=-1): from pandas.core.reshape import unstack return unstack(self, level) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Time series-related def diff(self, periods=1, axis=0): @@ -3923,7 +3935,7 @@ def diff(self, periods=1, axis=0): new_data = self._data.diff(n=periods, axis=bm_axis) return self._constructor(new_data) - #---------------------------------------------------------------------- + # 
---------------------------------------------------------------------- # Function application def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, @@ -3988,7 +4000,9 @@ def apply(self, func, axis=0, broadcast=False, raw=False, reduce=None, """ axis = self._get_axis_number(axis) if kwds or args and not isinstance(func, np.ufunc): - f = lambda x: func(x, *args, **kwds) + + def f(x): + return func(x, *args, **kwds) else: f = func @@ -4036,8 +4050,7 @@ def _apply_raw(self, func, axis): # TODO: mixed type case if result.ndim == 2: - return DataFrame(result, index=self.index, - columns=self.columns) + return DataFrame(result, index=self.index, columns=self.columns) else: return Series(result, index=self._get_agg_axis(axis)) @@ -4045,8 +4058,9 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): # skip if we are mixed datelike and trying reduce across axes # GH6125 - if reduce and axis==1 and self._is_mixed_type and self._is_datelike_mixed_type: - reduce=False + if (reduce and axis == 1 and self._is_mixed_type and + self._is_datelike_mixed_type): + reduce = False # try to reduce first (by default) # this only matters if the reduction in values is of different dtype @@ -4071,16 +4085,18 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): dtype = object if self._is_mixed_type else None if axis == 0: - series_gen = (self._ixs(i,axis=1) for i in range(len(self.columns))) + series_gen = (self._ixs(i, axis=1) + for i in range(len(self.columns))) res_index = self.columns res_columns = self.index elif axis == 1: res_index = self.index res_columns = self.columns values = self.values - series_gen = (Series.from_array(arr, index=res_columns, name=name, dtype=dtype) - for i, (arr, name) in - enumerate(zip(values, res_index))) + series_gen = (Series.from_array(arr, index=res_columns, name=name, + dtype=dtype) + for i, (arr, name) in enumerate(zip(values, + res_index))) else: # pragma : no cover raise AssertionError('Axis must be 0 or 1, got %s' % str(axis)) @@ -4110,7 +4126,7 @@ def _apply_standard(self, func, axis, ignore_failures=False, reduce=True): if i is not None: k = res_index[i] e.args = e.args + ('occurred at index %s' % - com.pprint_thing(k),) + com.pprint_thing(k), ) raise if len(results) > 0 and is_sequence(results[0]): @@ -4197,9 +4213,10 @@ def infer(x): f = com.i8_boxer(x) x = lib.map_infer(_values_from_object(x), f) return lib.map_infer(_values_from_object(x), func) + return self.apply(infer) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Merging / joining methods def append(self, other, ignore_index=False, verify_integrity=False): @@ -4222,9 +4239,9 @@ def append(self, other, ignore_index=False, verify_integrity=False): Notes ----- - If a list of dict/series is passed and the keys are all contained in the - DataFrame's index, the order of the columns in the resulting DataFrame - will be unchanged. + If a list of dict/series is passed and the keys are all contained in + the DataFrame's index, the order of the columns in the resulting + DataFrame will be unchanged. 
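        A minimal sketch (hypothetical frame; ``ignore_index=True`` is needed
        because the appended Series has no name, per the check below)::

            import pandas as pd
            df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
            row = pd.Series({'a': 5, 'b': 6})
            df.append(row, ignore_index=True)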
See also -------- @@ -4265,11 +4282,11 @@ def append(self, other, ignore_index=False, verify_integrity=False): ' or if the Series has a name') index = None if other.name is None else [other.name] - combined_columns = self.columns.tolist() + self.columns.union(other.index).difference(self.columns).tolist() + combined_columns = self.columns.tolist() + self.columns.union( + other.index).difference(self.columns).tolist() other = other.reindex(combined_columns, copy=False) other = DataFrame(other.values.reshape((1, len(other))), - index=index, - columns=combined_columns) + index=index, columns=combined_columns) other = other._convert(datetime=True, timedelta=True) if not self.columns.equals(combined_columns): @@ -4370,8 +4387,8 @@ def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='', joined = frames[0] for frame in frames[1:]: - joined = merge(joined, frame, how=how, - left_index=True, right_index=True) + joined = merge(joined, frame, how=how, left_index=True, + right_index=True) return joined @@ -4381,10 +4398,10 @@ def merge(self, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False): from pandas.tools.merge import merge - return merge(self, right, how=how, on=on, - left_on=left_on, right_on=right_on, - left_index=left_index, right_index=right_index, sort=sort, - suffixes=suffixes, copy=copy, indicator=indicator) + return merge(self, right, how=how, on=on, left_on=left_on, + right_on=right_on, left_index=left_index, + right_index=right_index, sort=sort, suffixes=suffixes, + copy=copy, indicator=indicator) def round(self, decimals=0, out=None): """ @@ -4462,8 +4479,8 @@ def _series_round(s, decimals): new_cols = [_series_round(v, decimals) for _, v in self.iteritems()] else: - raise TypeError("decimals must be an integer, " - "a dict-like, or a Series") + raise TypeError("decimals must be an integer, a dict-like or a " + "Series") if len(new_cols) > 0: return self._constructor(concat(new_cols, axis=1), @@ -4472,7 +4489,7 @@ def _series_round(s, decimals): else: return self - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Statistical methods, etc. 
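As a quick check on the ``round`` dispatch above (a sketch; the frame and values are hypothetical):

    import pandas as pd
    df = pd.DataFrame({'a': [1.234, 5.678], 'b': [9.8765, 4.321]})
    df.round(2)                 # every column to two decimals
    df.round({'a': 1, 'b': 3})  # per-column precision from a dict-like
    df.round(decimals='2')      # raises the TypeError shown above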
def corr(self, method='pearson', min_periods=1): @@ -4499,8 +4516,7 @@ def corr(self, method='pearson', min_periods=1): mat = numeric_df.values if method == 'pearson': - correl = _algos.nancorr(com._ensure_float64(mat), - minp=min_periods) + correl = _algos.nancorr(com._ensure_float64(mat), minp=min_periods) elif method == 'spearman': correl = _algos.nancorr_spearman(com._ensure_float64(mat), minp=min_periods) @@ -4617,7 +4633,7 @@ def corrwith(self, other, axis=0, drop=False): return correl - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # ndarray-like stats methods def count(self, axis=0, level=None, numeric_only=False): @@ -4694,8 +4710,7 @@ def _count_level(self, level, axis=0, numeric_only=False): labels = com._ensure_int64(count_axis.labels[level]) counts = lib.count_level_2d(mask, labels, len(level_index), axis=0) - result = DataFrame(counts, index=level_index, - columns=agg_axis) + result = DataFrame(counts, index=level_index, columns=agg_axis) if axis == 1: # Undo our earlier transpose @@ -4706,7 +4721,10 @@ def _count_level(self, level, axis=0, numeric_only=False): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): axis = self._get_axis_number(axis) - f = lambda x: op(x, axis=axis, skipna=skipna, **kwds) + + def f(x): + return op(x, axis=axis, skipna=skipna, **kwds) + labels = self._get_agg_axis(axis) # exclude timedelta/datetime unless we are uniform types @@ -4726,7 +4744,7 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, # this can end up with a non-reduction # but not always. if the types are mixed # with datelike then need to make sure a series - result = self.apply(f,reduce=False) + result = self.apply(f, reduce=False) if result.ndim == self.ndim: result = result.iloc[0] return result @@ -4739,8 +4757,8 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, data = self._get_bool_data() else: # pragma: no cover e = NotImplementedError("Handling exception with filter_" - "type %s not implemented." - % filter_type) + "type %s not implemented." % + filter_type) raise_with_traceback(e) result = f(data.values) labels = data._get_agg_axis(axis) @@ -4847,14 +4865,14 @@ def _get_agg_axis(self, axis_num): def mode(self, axis=0, numeric_only=False): """ - Gets the mode(s) of each element along the axis selected. Empty if nothing - has 2+ occurrences. Adds a row for each mode per label, fills in gaps - with nan. + Gets the mode(s) of each element along the axis selected. Empty if + nothing has 2+ occurrences. Adds a row for each mode per label, fills + in gaps with nan. Note that there could be multiple values returned for the selected - axis (when more than one item share the maximum frequency), which is the - reason why a dataframe is returned. If you want to impute missing values - with the mode in a dataframe ``df``, you can just do this: + axis (when more than one item shares the maximum frequency), which is + the reason why a dataframe is returned.
If you want to impute missing + values with the mode in a dataframe ``df``, you can just do this: ``df.fillna(df.mode().iloc[0])`` Parameters @@ -4878,7 +4896,10 @@ def mode(self, axis=0, numeric_only=False): 1 2 """ data = self if not numeric_only else self._get_numeric_data() - f = lambda s: s.mode() + + def f(s): + return s.mode() + return data.apply(f, axis=axis) def quantile(self, q=0.5, axis=0, numeric_only=True, @@ -5154,26 +5175,25 @@ def isin(self, values): for i, col in enumerate(self.columns)), axis=1) elif isinstance(values, Series): if not values.index.is_unique: - raise ValueError("ValueError: cannot compute isin with" - " a duplicate axis.") + raise ValueError("cannot compute isin with " + "a duplicate axis.") return self.eq(values.reindex_like(self), axis='index') elif isinstance(values, DataFrame): if not (values.columns.is_unique and values.index.is_unique): - raise ValueError("ValueError: cannot compute isin with" - " a duplicate axis.") + raise ValueError("cannot compute isin with " + "a duplicate axis.") return self.eq(values.reindex_like(self)) else: if not is_list_like(values): - raise TypeError("only list-like or dict-like objects are" - " allowed to be passed to DataFrame.isin(), " + raise TypeError("only list-like or dict-like objects are " + "allowed to be passed to DataFrame.isin(), " "you passed a " "{0!r}".format(type(values).__name__)) return DataFrame(lib.ismember(self.values.ravel(), - set(values)).reshape(self.shape), - self.index, + set(values)).reshape(self.shape), self.index, self.columns) - #---------------------------------------------------------------------- + # ---------------------------------------------------------------------- # Deprecated stuff def combineAdd(self, other): @@ -5237,6 +5257,7 @@ def combineMult(self, other): _EMPTY_SERIES = Series([]) + def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): """ Segregate Series based on type and coerce into matrices. 
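Expanding the imputation one-liner from the ``mode`` docstring above (a sketch; the frame is hypothetical):

    import numpy as np
    import pandas as pd
    df = pd.DataFrame({'a': [1.0, 2.0, 2.0, np.nan], 'b': [3, 3, 4, 4]})
    modes = df.mode()         # one row per mode, NaN-padded per column
    df.fillna(modes.iloc[0])  # fill each column with its first mode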
@@ -5300,8 +5321,8 @@ def extract_index(data): if have_series: if lengths[0] != len(index): - msg = ('array length %d does not match index length %d' - % (lengths[0], len(index))) + msg = ('array length %d does not match index length %d' % + (lengths[0], len(index))) raise ValueError(msg) else: index = Index(np.arange(lengths[0])) @@ -5349,11 +5370,11 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): """ if isinstance(data, DataFrame): if columns is not None: - arrays = [data._ixs(i,axis=1).values for i, col in enumerate(data.columns) - if col in columns] + arrays = [data._ixs(i, axis=1).values + for i, col in enumerate(data.columns) if col in columns] else: columns = data.columns - arrays = [data._ixs(i,axis=1).values for i in range(len(columns))] + arrays = [data._ixs(i, axis=1).values for i in range(len(columns))] return arrays, columns @@ -5368,8 +5389,7 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): dtype=dtype) elif isinstance(data[0], collections.Mapping): return _list_of_dict_to_arrays(data, columns, - coerce_float=coerce_float, - dtype=dtype) + coerce_float=coerce_float, dtype=dtype) elif isinstance(data[0], Series): return _list_of_series_to_arrays(data, columns, coerce_float=coerce_float, @@ -5378,8 +5398,8 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): if columns is None: columns = _default_index(len(data)) return data, columns - elif (isinstance(data, (np.ndarray, Series, Index)) - and data.dtype.names is not None): + elif (isinstance(data, (np.ndarray, Series, Index)) and + data.dtype.names is not None): columns = list(data.dtype.names) arrays = [data[k] for k in columns] @@ -5387,8 +5407,7 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): else: # last ditch effort data = lmap(tuple, data) - return _list_to_arrays(data, columns, - coerce_float=coerce_float, + return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype) @@ -5433,10 +5452,8 @@ def _reorder_arrays(arrays, arr_columns, columns): # reorder according to the columns if (columns is not None and len(columns) and arr_columns is not None and len(arr_columns)): - indexer = _ensure_index( - arr_columns).get_indexer(columns) - arr_columns = _ensure_index( - [arr_columns[i] for i in indexer]) + indexer = _ensure_index(arr_columns).get_indexer(columns) + arr_columns = _ensure_index([arr_columns[i] for i in indexer]) arrays = [arrays[i] for i in indexer] return arrays, arr_columns @@ -5515,7 +5532,7 @@ def convert(arr): arr = com._possibly_cast_to_datetime(arr, dtype) return arr - arrays = [ convert(arr) for arr in content ] + arrays = [convert(arr) for arr in content] return arrays, columns @@ -5583,29 +5600,26 @@ def _from_nested_dict(data): def _put_str(s, space): return ('%s' % s)[:space].ljust(space) - -#---------------------------------------------------------------------- +# ---------------------------------------------------------------------- # Add plotting methods to DataFrame -import pandas.tools.plotting as gfx - -DataFrame.plot = base.AccessorProperty(gfx.FramePlotMethods, gfx.FramePlotMethods) +DataFrame.plot = base.AccessorProperty(gfx.FramePlotMethods, + gfx.FramePlotMethods) DataFrame.hist = gfx.hist_frame @Appender(_shared_docs['boxplot'] % _shared_doc_kwargs) -def boxplot(self, column=None, by=None, ax=None, fontsize=None, - rot=0, grid=True, figsize=None, layout=None, return_type=None, - **kwds): +def boxplot(self, column=None, by=None, ax=None, fontsize=None, rot=0, + grid=True, figsize=None, layout=None, 
return_type=None, **kwds): import pandas.tools.plotting as plots import matplotlib.pyplot as plt - ax = plots.boxplot(self, column=column, by=by, ax=ax, - fontsize=fontsize, grid=grid, rot=rot, - figsize=figsize, layout=layout, return_type=return_type, - **kwds) + ax = plots.boxplot(self, column=column, by=by, ax=ax, fontsize=fontsize, + grid=grid, rot=rot, figsize=figsize, layout=layout, + return_type=return_type, **kwds) plt.draw_if_interactive() return ax + DataFrame.boxplot = boxplot ops.add_flex_arithmetic_methods(DataFrame, **ops.frame_flex_funcs) diff --git a/setup.cfg b/setup.cfg index 5c07a44ff4f7f..f69e256b80869 100644 --- a/setup.cfg +++ b/setup.cfg @@ -13,3 +13,9 @@ parentdir_prefix = pandas- [flake8] ignore = E731 + +[yapf] +based_on_style = pep8 +split_before_named_assigns = false +split_penalty_after_opening_bracket = 1000000 +split_penalty_logical_operator = 30
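With these sections in place, both tools pick up ``setup.cfg`` on their own when run from the repository root, e.g. (hypothetical invocation):

    flake8 pandas/core/frame.py
    yapf --in-place pandas/core/frame.py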