`__ e.g 'rcl'
- for 3 columns
+ Right alignment for numbers and left alignment for strings.
"""
- buf.write(f"\\begin{{longtable}}{{{column_format}}}\n")
- if self.caption is not None or self.label is not None:
- if self.caption is None:
- pass
- else:
- buf.write(f"\\caption{{{self.caption}}}")
+ def get_col_type(dtype):
+ if issubclass(dtype.type, np.number):
+ return "r"
+ return "l"
- if self.label is None:
- pass
- else:
- buf.write(f"\\label{{{self.label}}}")
+ dtypes = self.frame.dtypes._values
+ return "".join(map(get_col_type, dtypes))
- # a double-backslash is required at the end of the line
- # as discussed here:
- # https://tex.stackexchange.com/questions/219138
- buf.write("\\\\\n")
- else:
- pass
+ def _get_index_format(self) -> str:
+ """Get index column format."""
+ return "l" * self.frame.index.nlevels if self.fmt.index else ""
- @staticmethod
- def _write_longtable_end(buf):
- """
- Write the end of a longtable environment.
- Parameters
- ----------
- buf : string or file handle
- File path or object. If not specified, the result is returned as
- a string.
+def _escape_symbols(row: List[str]) -> List[str]:
+ """Carry out string replacements for special symbols.
- """
- buf.write("\\end{longtable}\n")
+ Parameters
+ ----------
+ row : list
+ List of strings that may contain special symbols.
+
+ Returns
+ -------
+ list
+ list of strings with the special symbols replaced.
+ """
+ return [
+ (
+ x.replace("\\", "\\textbackslash ")
+ .replace("_", "\\_")
+ .replace("%", "\\%")
+ .replace("$", "\\$")
+ .replace("#", "\\#")
+ .replace("{", "\\{")
+ .replace("}", "\\}")
+ .replace("~", "\\textasciitilde ")
+ .replace("^", "\\textasciicircum ")
+ .replace("&", "\\&")
+ if (x and x != "{}")
+ else "{}"
+ )
+ for x in row
+ ]
+
+
+def _convert_to_bold(crow: List[str], ilevels: int) -> List[str]:
+ """Convert elements in ``crow`` to bold."""
+ return [
+ f"\\textbf{{{x}}}" if j < ilevels and x.strip() not in ["", "{}"] else x
+ for j, x in enumerate(crow)
+ ]
+
+
+if __name__ == "__main__":
+ import doctest
+
+ doctest.testmod()
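For orientation, a minimal standalone sketch of the two behaviors introduced above, the dtype-based column format and the LaTeX escaping (the frame and helper names here are illustrative, not part of the diff):

    import numpy as np
    import pandas as pd

    def get_col_type(dtype) -> str:
        # numbers are right-aligned, everything else left-aligned
        return "r" if issubclass(dtype.type, np.number) else "l"

    df = pd.DataFrame({"n": [1, 2], "x": [1.5, 2.5], "s": ["a_b", "c"]})
    print("".join(get_col_type(t) for t in df.dtypes))  # -> "rrl"
    print("a_b".replace("_", "\\_"))                    # -> "a\_b"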
diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py
index 36e774305b577..0d2ca83f1012e 100644
--- a/pandas/io/formats/printing.py
+++ b/pandas/io/formats/printing.py
@@ -243,7 +243,7 @@ def pprint_thing_encoded(
return value.encode(encoding, errors)
-def _enable_data_resource_formatter(enable: bool) -> None:
+def enable_data_resource_formatter(enable: bool) -> None:
if "IPython" not in sys.modules:
# definitely not in IPython
return
@@ -276,9 +276,13 @@ class TableSchemaFormatter(BaseFormatter):
formatters[mimetype].enabled = False
-default_pprint = lambda x, max_seq_items=None: pprint_thing(
- x, escape_chars=("\t", "\r", "\n"), quote_strings=True, max_seq_items=max_seq_items
-)
+def default_pprint(thing: Any, max_seq_items: Optional[int] = None) -> str:
+ return pprint_thing(
+ thing,
+ escape_chars=("\t", "\r", "\n"),
+ quote_strings=True,
+ max_seq_items=max_seq_items,
+ )
def format_object_summary(
@@ -317,7 +321,7 @@ def format_object_summary(
summary string
"""
from pandas.io.formats.console import get_console_size
- from pandas.io.formats.format import _get_adjustment
+ from pandas.io.formats.format import get_adjustment
display_width, _ = get_console_size()
if display_width is None:
@@ -346,7 +350,7 @@ def format_object_summary(
is_truncated = n > max_seq_items
# adj can optionally handle unicode eastern asian width
- adj = _get_adjustment()
+ adj = get_adjustment()
def _extend_line(
s: str, line: str, value: str, display_width: int, next_line_prefix: str
@@ -495,7 +499,7 @@ def _justify(
# error: Incompatible return value type (got "Tuple[List[Sequence[str]],
# List[Sequence[str]]]", expected "Tuple[List[Tuple[str, ...]],
# List[Tuple[str, ...]]]")
- return head, tail # type: ignore
+ return head, tail # type: ignore[return-value]
def format_object_attrs(
@@ -520,14 +524,16 @@ def format_object_attrs(
attrs: List[Tuple[str, Union[str, int]]] = []
if hasattr(obj, "dtype") and include_dtype:
# error: "Sequence[Any]" has no attribute "dtype"
- attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore
+ attrs.append(("dtype", f"'{obj.dtype}'")) # type: ignore[attr-defined]
if getattr(obj, "name", None) is not None:
# error: "Sequence[Any]" has no attribute "name"
- attrs.append(("name", default_pprint(obj.name))) # type: ignore
+ attrs.append(("name", default_pprint(obj.name))) # type: ignore[attr-defined]
# error: "Sequence[Any]" has no attribute "names"
- elif getattr(obj, "names", None) is not None and any(obj.names): # type: ignore
+ elif getattr(obj, "names", None) is not None and any(
+ obj.names # type: ignore[attr-defined]
+ ):
# error: "Sequence[Any]" has no attribute "names"
- attrs.append(("names", default_pprint(obj.names))) # type: ignore
+ attrs.append(("names", default_pprint(obj.names))) # type: ignore[attr-defined]
max_seq_items = get_option("display.max_seq_items") or len(obj)
if len(obj) > max_seq_items:
attrs.append(("length", len(obj)))
diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py
index f7ba4750bc2ad..1df37da3da8d0 100644
--- a/pandas/io/formats/style.py
+++ b/pandas/io/formats/style.py
@@ -18,7 +18,7 @@
Tuple,
Union,
)
-from uuid import uuid1
+from uuid import uuid4
import numpy as np
@@ -36,14 +36,14 @@
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.generic import NDFrame
-from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice
+from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice
jinja2 = import_optional_dependency("jinja2", extra="DataFrame.style requires jinja2.")
try:
- import matplotlib.pyplot as plt
from matplotlib import colors
+ import matplotlib.pyplot as plt
has_mpl = True
except ImportError:
@@ -89,6 +89,12 @@ class Styler:
.. versionadded:: 1.0.0
+ uuid_len : int, default 5
+ If ``uuid`` is not specified, the length of the ``uuid`` to randomly generate,
+ expressed in hex characters, in range [0, 32].
+
+ .. versionadded:: 1.2.0
+
Attributes
----------
env : Jinja2 jinja2.Environment
@@ -144,6 +150,7 @@ def __init__(
table_attributes: Optional[str] = None,
cell_ids: bool = True,
na_rep: Optional[str] = None,
+ uuid_len: int = 5,
):
self.ctx: DefaultDict[Tuple[int, int], List[str]] = defaultdict(list)
self._todo: List[Tuple[Callable, Tuple, Dict]] = []
@@ -159,7 +166,10 @@ def __init__(
self.index = data.index
self.columns = data.columns
- self.uuid = uuid
+ if not isinstance(uuid_len, int) or not uuid_len >= 0:
+ raise TypeError("``uuid_len`` must be an integer in range [0, 32].")
+ self.uuid_len = min(32, uuid_len)
+ self.uuid = (uuid or uuid4().hex[: self.uuid_len]) + "_"
self.table_styles = table_styles
self.caption = caption
if precision is None:
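A short usage sketch of the new ``uuid_len`` argument (a hedged illustration; the frame is arbitrary):

    import pandas as pd
    from pandas.io.formats.style import Styler

    df = pd.DataFrame({"A": [1, 2]})
    s = Styler(df, uuid_len=3)
    # three random hex characters plus the trailing "_" separator
    assert len(s.uuid) == 4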
@@ -171,6 +181,8 @@ def __init__(
self.cell_ids = cell_ids
self.na_rep = na_rep
+ self.cell_context: Dict[str, Any] = {}
+
# display_funcs maps (row, col) -> formatting function
def default_display_func(x):
@@ -246,7 +258,7 @@ def _translate(self):
precision = self.precision
hidden_index = self.hidden_index
hidden_columns = self.hidden_columns
- uuid = self.uuid or str(uuid1()).replace("-", "_")
+ uuid = self.uuid
ROW_HEADING_CLASS = "row_heading"
COL_HEADING_CLASS = "col_heading"
INDEX_NAME_CLASS = "index_name"
@@ -262,7 +274,7 @@ def format_attr(pair):
idx_lengths = _get_level_lengths(self.index)
col_lengths = _get_level_lengths(self.columns, hidden_columns)
- cell_context = dict()
+ cell_context = self.cell_context
n_rlvls = self.data.index.nlevels
n_clvls = self.data.columns.nlevels
@@ -327,7 +339,7 @@ def format_attr(pair):
colspan = col_lengths.get((r, c), 0)
if colspan > 1:
es["attributes"] = [
- format_attr({"key": "colspan", "value": colspan})
+ format_attr({"key": "colspan", "value": f'"{colspan}"'})
]
row_es.append(es)
head.append(row_es)
@@ -390,16 +402,16 @@ def format_attr(pair):
"is_visible": (c not in hidden_columns),
}
# only add an id if the cell has a style
- if self.cell_ids or not (len(ctx[r, c]) == 1 and ctx[r, c][0] == ""):
+ props = []
+ if self.cell_ids or (r, c) in ctx:
row_dict["id"] = "_".join(cs[1:])
+ for x in ctx[r, c]:
+ # have to handle empty styles like ['']
+ if x.count(":"):
+ props.append(tuple(x.split(":")))
+ else:
+ props.append(("", ""))
row_es.append(row_dict)
- props = []
- for x in ctx[r, c]:
- # have to handle empty styles like ['']
- if x.count(":"):
- props.append(tuple(x.split(":")))
- else:
- props.append(("", ""))
cellstyle_map[tuple(props)].append(f"row{r}_col{c}")
body.append(row_es)
@@ -475,7 +487,7 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style
row_locs = range(len(self.data))
col_locs = range(len(self.data.columns))
else:
- subset = _non_reducing_slice(subset)
+ subset = non_reducing_slice(subset)
if len(subset) == 1:
subset = subset, self.data.columns
@@ -499,6 +511,70 @@ def format(self, formatter, subset=None, na_rep: Optional[str] = None) -> "Style
self._display_funcs[(i, j)] = formatter
return self
+ def set_td_classes(self, classes: DataFrame) -> "Styler":
+ """
+ Add string based CSS class names to data cells that will appear within the
+ `Styler` HTML result. These classes are added within specified `<td>` elements.
+
+ Parameters
+ ----------
+ classes : DataFrame
+ DataFrame containing strings that will be translated to CSS classes,
+ mapped by identical column and index values that must exist on the
+ underlying `Styler` data. None, NaN values, and empty strings will
+ be ignored and not affect the rendered HTML.
+
+ Returns
+ -------
+ self : Styler
+
+ Examples
+ --------
+ >>> df = pd.DataFrame(data=[[1, 2, 3], [4, 5, 6]], columns=["A", "B", "C"])
+ >>> classes = pd.DataFrame([
+ ... ["min-val red", "", "blue"],
+ ... ["red", None, "blue max-val"]
+ ... ], index=df.index, columns=df.columns)
+ >>> df.style.set_td_classes(classes)
+
+ Using `MultiIndex` columns and a `classes` `DataFrame` as a subset of the
+ underlying data,
+
+ >>> df = pd.DataFrame([[1,2],[3,4]], index=["a", "b"],
+ ... columns=[["level0", "level0"], ["level1a", "level1b"]])
+ >>> classes = pd.DataFrame(["min-val"], index=["a"],
+ ... columns=[["level0"],["level1a"]])
+ >>> df.style.set_td_classes(classes)
+
+ Form of the output with the new additional CSS classes,
+
+ >>> df = pd.DataFrame([[1]])
+ >>> css = pd.DataFrame(["other-class"])
+ >>> s = Styler(df, uuid="_", cell_ids=False).set_td_classes(css)
+ >>> s.hide_index().render()
+ '<style  type="text/css" ></style>'
+ '<table id="T__" >'
+ '  <thead>'
+ '    <tr><th class="col_heading level0 col0" >0</th></tr>'
+ '  </thead>'
+ '  <tbody>'
+ '    <tr><td  class="data row0 col0 other-class" >1</td></tr>'
+ '  </tbody>'
+ '</table>'
+
+ """
+ classes = classes.reindex_like(self.data)
+
+ mask = (classes.isna()) | (classes.eq(""))
+ data: Dict[int, Dict[int, List[str]]] = {}
+ for r in range(len(classes.index)):
+     for c in range(len(classes.columns)):
+         if not mask.iloc[r, c]:
+             # nest per (row, col): a dict comprehension keyed on the row
+             # alone would keep only the last styled column in each row
+             data.setdefault(r, {})[c] = [str(classes.iloc[r, c])]
+ self.cell_context["data"] = data
+
+ return self
+
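For reference, the assumed shape of the mapping ``set_td_classes`` stores on ``self.cell_context`` (row and column positions map to lists of class strings, matching the first example above):

    # cell_context["data"][row][col] -> list of CSS class strings
    cell_context = {
        "data": {
            0: {0: ["min-val red"], 2: ["blue"]},
            1: {0: ["red"], 2: ["blue max-val"]},
        }
    }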
def render(self, **kwargs) -> str:
"""
Render the built up styles to HTML.
@@ -561,11 +637,19 @@ def _update_ctx(self, attrs: DataFrame) -> None:
Whitespace shouldn't matter and the final trailing ';' shouldn't
matter.
"""
- for row_label, v in attrs.iterrows():
- for col_label, col in v.items():
- i = self.index.get_indexer([row_label])[0]
- j = self.columns.get_indexer([col_label])[0]
- for pair in col.rstrip(";").split(";"):
+ coli = {k: i for i, k in enumerate(self.columns)}
+ rowi = {k: i for i, k in enumerate(self.index)}
+ for jj in range(len(attrs.columns)):
+ cn = attrs.columns[jj]
+ j = coli[cn]
+ for rn, c in attrs[[cn]].itertuples():
+ if not c:
+ continue
+ c = c.rstrip(";")
+ if not c:
+ continue
+ i = rowi[rn]
+ for pair in c.split(";"):
self.ctx[(i, j)].append(pair)
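A minimal illustration of the parsing step above: each applied style string is split on ";" and the raw pieces are appended under the integer cell position:

    style = "color: red; background-color: yellow;"
    pairs = style.rstrip(";").split(";")
    # ctx[(i, j)] ends up as ["color: red", " background-color: yellow"]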
def _copy(self, deepcopy: bool = False) -> "Styler":
@@ -601,6 +685,7 @@ def clear(self) -> None:
Returns None.
"""
self.ctx.clear()
+ self.cell_context = {}
self._todo = []
def _compute(self):
@@ -625,7 +710,7 @@ def _apply(
**kwargs,
) -> "Styler":
subset = slice(None) if subset is None else subset
- subset = _non_reducing_slice(subset)
+ subset = non_reducing_slice(subset)
data = self.data.loc[subset]
if axis is not None:
result = data.apply(func, axis=axis, result_type="expand", **kwargs)
@@ -717,7 +802,7 @@ def _applymap(self, func: Callable, subset=None, **kwargs) -> "Styler":
func = partial(func, **kwargs) # applymap doesn't take kwargs?
if subset is None:
subset = pd.IndexSlice[:]
- subset = _non_reducing_slice(subset)
+ subset = non_reducing_slice(subset)
result = self.data.loc[subset].applymap(func)
self._update_ctx(result)
return self
@@ -952,8 +1037,6 @@ def hide_index(self) -> "Styler":
"""
Hide any indices from rendering.
- .. versionadded:: 0.23.0
-
Returns
-------
self : Styler
@@ -965,8 +1048,6 @@ def hide_columns(self, subset) -> "Styler":
"""
Hide columns from rendering.
- .. versionadded:: 0.23.0
-
Parameters
----------
subset : IndexSlice
@@ -977,7 +1058,7 @@ def hide_columns(self, subset) -> "Styler":
-------
self : Styler
"""
- subset = _non_reducing_slice(subset)
+ subset = non_reducing_slice(subset)
hidden_df = self.data.loc[subset]
self.hidden_columns = self.columns.get_indexer_for(hidden_df.columns)
return self
@@ -1079,8 +1160,8 @@ def background_gradient(
of the data is extended by ``low * (x.max() - x.min())`` and ``high *
(x.max() - x.min())`` before normalizing.
"""
- subset = _maybe_numeric_slice(self.data, subset)
- subset = _non_reducing_slice(subset)
+ subset = maybe_numeric_slice(self.data, subset)
+ subset = non_reducing_slice(subset)
self.apply(
self._background_gradient,
cmap=cmap,
@@ -1314,8 +1395,8 @@ def bar(
"(eg: color=['#d65f5f', '#5fba7d'])"
)
- subset = _maybe_numeric_slice(self.data, subset)
- subset = _non_reducing_slice(subset)
+ subset = maybe_numeric_slice(self.data, subset)
+ subset = non_reducing_slice(subset)
self.apply(
self._bar,
subset=subset,
@@ -1382,7 +1463,7 @@ def _highlight_handler(
axis: Optional[Axis] = None,
max_: bool = True,
) -> "Styler":
- subset = _non_reducing_slice(_maybe_numeric_slice(self.data, subset))
+ subset = non_reducing_slice(maybe_numeric_slice(self.data, subset))
self.apply(
self._highlight_extrema, color=color, axis=axis, subset=subset, max_=max_
)
@@ -1524,7 +1605,10 @@ def _get_level_lengths(index, hidden_elements=None):
Result is a dictionary of (level, initial_position): span
"""
- levels = index.format(sparsify=lib.no_default, adjoin=False, names=False)
+ if isinstance(index, pd.MultiIndex):
+ levels = index.format(sparsify=lib.no_default, adjoin=False)
+ else:
+ levels = index.format()
if hidden_elements is None:
hidden_elements = []
diff --git a/pandas/io/html.py b/pandas/io/html.py
index c4ffe332e3020..9a91b16e52723 100644
--- a/pandas/io/html.py
+++ b/pandas/io/html.py
@@ -8,7 +8,9 @@
import numbers
import os
import re
+from typing import Dict, List, Optional, Pattern, Sequence, Union
+from pandas._typing import FilePathOrBuffer
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError, EmptyDataError
from pandas.util._decorators import deprecate_nonkeyword_arguments
@@ -16,6 +18,7 @@
from pandas.core.dtypes.common import is_list_like
from pandas.core.construction import create_series_with_explicit_dtype
+from pandas.core.frame import DataFrame
from pandas.io.common import is_url, urlopen, validate_header_arg
from pandas.io.formats.printing import pprint_thing
@@ -158,8 +161,6 @@ class _HtmlFrameParser:
displayed_only : bool
Whether or not items with "display:none" should be ignored
- .. versionadded:: 0.23.0
-
Attributes
----------
io : str or file-like
@@ -178,8 +179,6 @@ class _HtmlFrameParser:
displayed_only : bool
Whether or not items with "display:none" should be ignored
- .. versionadded:: 0.23.0
-
Notes
-----
To subclass this class effectively you must override the following methods:
@@ -704,8 +703,8 @@ def _build_doc(self):
--------
pandas.io.html._HtmlFrameParser._build_doc
"""
- from lxml.html import parse, fromstring, HTMLParser
from lxml.etree import XMLSyntaxError
+ from lxml.html import HTMLParser, fromstring, parse
parser = HTMLParser(recover=True, encoding=self.encoding)
@@ -720,7 +719,7 @@ def _build_doc(self):
r = r.getroot()
except AttributeError:
pass
- except (UnicodeDecodeError, IOError) as e:
+ except (UnicodeDecodeError, OSError) as e:
# if the input is a blob of html goop
if not is_url(self.io):
r = fromstring(self.io, parser=parser)
@@ -924,22 +923,22 @@ def _parse(flavor, io, match, attrs, encoding, displayed_only, **kwargs):
@deprecate_nonkeyword_arguments(version="2.0")
def read_html(
- io,
- match=".+",
- flavor=None,
- header=None,
- index_col=None,
- skiprows=None,
- attrs=None,
- parse_dates=False,
- thousands=",",
- encoding=None,
- decimal=".",
- converters=None,
+ io: FilePathOrBuffer,
+ match: Union[str, Pattern] = ".+",
+ flavor: Optional[str] = None,
+ header: Optional[Union[int, Sequence[int]]] = None,
+ index_col: Optional[Union[int, Sequence[int]]] = None,
+ skiprows: Optional[Union[int, Sequence[int], slice]] = None,
+ attrs: Optional[Dict[str, str]] = None,
+ parse_dates: bool = False,
+ thousands: Optional[str] = ",",
+ encoding: Optional[str] = None,
+ decimal: str = ".",
+ converters: Optional[Dict] = None,
na_values=None,
- keep_default_na=True,
- displayed_only=True,
-):
+ keep_default_na: bool = True,
+ displayed_only: bool = True,
+) -> List[DataFrame]:
r"""
Read HTML tables into a ``list`` of ``DataFrame`` objects.
@@ -958,26 +957,26 @@ def read_html(
This value is converted to a regular expression so that there is
consistent behavior between Beautiful Soup and lxml.
- flavor : str or None
+ flavor : str, optional
The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
each other, they are both there for backwards compatibility. The
default of ``None`` tries to use ``lxml`` to parse and if that fails it
falls back on ``bs4`` + ``html5lib``.
- header : int or list-like or None, optional
+ header : int or list-like, optional
The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
make the columns headers.
- index_col : int or list-like or None, optional
+ index_col : int or list-like, optional
The column (or list of columns) to use to create the index.
- skiprows : int or list-like or slice or None, optional
+ skiprows : int, list-like or slice, optional
Number of rows to skip after parsing the column integer. 0-based. If a
sequence of integers or a slice is given, will skip the rows indexed by
that sequence. Note that a single element sequence means 'skip the nth
row' whereas an integer means 'skip n rows'.
- attrs : dict or None, optional
+ attrs : dict, optional
This is a dictionary of attributes that you can pass to use to identify
the table in the HTML. These are not checked for validity before being
passed to lxml or Beautiful Soup. However, these attributes must be
@@ -1005,7 +1004,7 @@ def read_html(
thousands : str, optional
Separator to use to parse thousands. Defaults to ``','``.
- encoding : str or None, optional
+ encoding : str, optional
The encoding used to decode the web page. Defaults to ``None``. ``None``
preserves the previous encoding behavior, which depends on the
underlying parser library (e.g., the parser library will try to use
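A hedged usage sketch of the now-typed ``read_html`` signature (URL and table attributes are hypothetical):

    import pandas as pd

    tables = pd.read_html(
        "https://example.com/stats.html",
        match="Population",            # str or compiled re.Pattern
        header=0,
        attrs={"class": "wikitable"},
    )
    df = tables[0]  # read_html always returns a list of DataFrames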
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index ff37c36962aec..a0ceb18c8bd20 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -3,13 +3,13 @@
from io import BytesIO, StringIO
from itertools import islice
import os
-from typing import Any, Callable, Optional, Type
+from typing import IO, Any, Callable, List, Optional, Type
import numpy as np
import pandas._libs.json as json
from pandas._libs.tslibs import iNaT
-from pandas._typing import JSONSerializable
+from pandas._typing import CompressionOptions, JSONSerializable, StorageOptions
from pandas.errors import AbstractMethodError
from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments
@@ -19,10 +19,10 @@
from pandas.core.construction import create_series_with_explicit_dtype
from pandas.core.reshape.concat import concat
-from pandas.io.common import get_filepath_or_buffer, get_handle, infer_compression
+from pandas.io.common import get_compression_method, get_filepath_or_buffer, get_handle
from pandas.io.json._normalize import convert_to_line_delimits
from pandas.io.json._table_schema import build_table_schema, parse_table_schema
-from pandas.io.parsers import _validate_integer
+from pandas.io.parsers import validate_integer
loads = json.loads
dumps = json.dumps
@@ -41,9 +41,10 @@ def to_json(
date_unit: str = "ms",
default_handler: Optional[Callable[[Any], JSONSerializable]] = None,
lines: bool = False,
- compression: Optional[str] = "infer",
+ compression: CompressionOptions = "infer",
index: bool = True,
indent: int = 0,
+ storage_options: StorageOptions = None,
):
if not index and orient not in ["split", "table"]:
@@ -52,9 +53,15 @@ def to_json(
)
if path_or_buf is not None:
- path_or_buf, _, _, _ = get_filepath_or_buffer(
- path_or_buf, compression=compression, mode="w"
+ ioargs = get_filepath_or_buffer(
+ path_or_buf,
+ compression=compression,
+ mode="wt",
+ storage_options=storage_options,
)
+ path_or_buf = ioargs.filepath_or_buffer
+ should_close = ioargs.should_close
+ compression = ioargs.compression
if lines and orient != "records":
raise ValueError("'lines' keyword only valid when 'orient' is records")
@@ -93,10 +100,14 @@ def to_json(
fh.write(s)
finally:
fh.close()
+ for handle in handles:
+ handle.close()
elif path_or_buf is None:
return s
else:
path_or_buf.write(s)
+ if should_close:
+ path_or_buf.close()
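A sketch of the new ``storage_options`` plumbing on the JSON write path (bucket and credentials are hypothetical; an fsspec-backed URL is assumed):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df.to_json(
        "s3://my-bucket/frame.json",
        orient="records",
        lines=True,
        storage_options={"anon": True},  # forwarded to get_filepath_or_buffer
    )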
class Writer:
@@ -115,7 +126,8 @@ def __init__(
self.obj = obj
if orient is None:
- orient = self._default_orient # type: ignore
+ # error: "Writer" has no attribute "_default_orient"
+ orient = self._default_orient # type: ignore[attr-defined]
self.orient = orient
self.date_format = date_format
@@ -362,8 +374,9 @@ def read_json(
encoding=None,
lines: bool = False,
chunksize: Optional[int] = None,
- compression="infer",
+ compression: CompressionOptions = "infer",
nrows: Optional[int] = None,
+ storage_options: StorageOptions = None,
):
"""
Convert a JSON string to pandas object.
@@ -415,9 +428,6 @@ def read_json(
- The DataFrame columns must be unique for orients ``'index'``,
``'columns'``, and ``'records'``.
- .. versionadded:: 0.23.0
- 'table' as an allowed value for the ``orient`` argument
-
typ : {'frame', 'series'}, default 'frame'
The type of object to recover.
@@ -509,6 +519,16 @@ def read_json(
.. versionadded:: 1.1
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2.0
+
Returns
-------
Series or DataFrame
@@ -589,13 +609,15 @@ def read_json(
if encoding is None:
encoding = "utf-8"
- compression = infer_compression(path_or_buf, compression)
- filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
- path_or_buf, encoding=encoding, compression=compression
+ ioargs = get_filepath_or_buffer(
+ path_or_buf,
+ encoding=encoding,
+ compression=compression,
+ storage_options=storage_options,
)
json_reader = JsonReader(
- filepath_or_buffer,
+ ioargs.filepath_or_buffer,
orient=orient,
typ=typ,
dtype=dtype,
@@ -605,10 +627,10 @@ def read_json(
numpy=numpy,
precise_float=precise_float,
date_unit=date_unit,
- encoding=encoding,
+ encoding=ioargs.encoding,
lines=lines,
chunksize=chunksize,
- compression=compression,
+ compression=ioargs.compression,
nrows=nrows,
)
@@ -616,8 +638,9 @@ def read_json(
return json_reader
result = json_reader.read()
- if should_close:
- filepath_or_buffer.close()
+ if ioargs.should_close:
+ assert not isinstance(ioargs.filepath_or_buffer, str)
+ ioargs.filepath_or_buffer.close()
return result
@@ -646,10 +669,13 @@ def __init__(
encoding,
lines: bool,
chunksize: Optional[int],
- compression,
+ compression: CompressionOptions,
nrows: Optional[int],
):
+ compression_method, compression = get_compression_method(compression)
+ compression = dict(compression, method=compression_method)
+
self.orient = orient
self.typ = typ
self.dtype = dtype
@@ -666,13 +692,14 @@ def __init__(
self.nrows_seen = 0
self.should_close = False
self.nrows = nrows
+ self.file_handles: List[IO] = []
if self.chunksize is not None:
- self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
+ self.chunksize = validate_integer("chunksize", self.chunksize, 1)
if not self.lines:
raise ValueError("chunksize can only be passed if lines=True")
if self.nrows is not None:
- self.nrows = _validate_integer("nrows", self.nrows, 0)
+ self.nrows = validate_integer("nrows", self.nrows, 0)
if not self.lines:
raise ValueError("nrows can only be passed if lines=True")
@@ -714,8 +741,8 @@ def _get_data_from_filepath(self, filepath_or_buffer):
except (TypeError, ValueError):
pass
- if exists or self.compression is not None:
- data, _ = get_handle(
+ if exists or self.compression["method"] is not None:
+ data, self.file_handles = get_handle(
filepath_or_buffer,
"r",
encoding=self.encoding,
@@ -733,8 +760,9 @@ def _combine_lines(self, lines) -> str:
"""
Combines a list of JSON objects into one JSON object.
"""
- lines = filter(None, map(lambda x: x.strip(), lines))
- return "[" + ",".join(lines) + "]"
+ return (
+ f'[{",".join((line for line in (line.strip() for line in lines) if line))}]'
+ )
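What the rewritten ``_combine_lines`` produces for line-delimited JSON, illustratively:

    lines = ['{"a": 1}\n', '   \n', '{"a": 2}\n']
    combined = f'[{",".join(line.strip() for line in lines if line.strip())}]'
    # -> '[{"a": 1},{"a": 2}]'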
def read(self):
"""
@@ -793,8 +821,10 @@ def close(self):
if self.should_close:
try:
self.open_stream.close()
- except (IOError, AttributeError):
+ except (OSError, AttributeError):
pass
+ for file_handle in self.file_handles:
+ file_handle.close()
def __next__(self):
if self.nrows:
diff --git a/pandas/io/json/_normalize.py b/pandas/io/json/_normalize.py
index 44765dbe74b46..3ed0b5851b395 100644
--- a/pandas/io/json/_normalize.py
+++ b/pandas/io/json/_normalize.py
@@ -163,11 +163,11 @@ def _json_normalize(
>>> data = [{'id': 1, 'name': {'first': 'Coleen', 'last': 'Volk'}},
... {'name': {'given': 'Mose', 'family': 'Regner'}},
... {'id': 2, 'name': 'Faye Raker'}]
- >>> pandas.json_normalize(data)
- id name name.family name.first name.given name.last
- 0 1.0 NaN NaN Coleen NaN Volk
- 1 NaN NaN Regner NaN Mose NaN
- 2 2.0 Faye Raker NaN NaN NaN NaN
+ >>> pd.json_normalize(data)
+ id name.first name.last name.given name.family name
+ 0 1.0 Coleen Volk NaN NaN NaN
+ 1 NaN NaN NaN Mose Regner NaN
+ 2 2.0 NaN NaN NaN NaN Faye Raker
>>> data = [{'id': 1,
... 'name': "Cole Volk",
@@ -176,11 +176,11 @@ def _json_normalize(
... 'fitness': {'height': 130, 'weight': 60}},
... {'id': 2, 'name': 'Faye Raker',
... 'fitness': {'height': 130, 'weight': 60}}]
- >>> json_normalize(data, max_level=0)
- fitness id name
- 0 {'height': 130, 'weight': 60} 1.0 Cole Volk
- 1 {'height': 130, 'weight': 60} NaN Mose Reg
- 2 {'height': 130, 'weight': 60} 2.0 Faye Raker
+ >>> pd.json_normalize(data, max_level=0)
+ id name fitness
+ 0 1.0 Cole Volk {'height': 130, 'weight': 60}
+ 1 NaN Mose Reg {'height': 130, 'weight': 60}
+ 2 2.0 Faye Raker {'height': 130, 'weight': 60}
Normalizes nested data up to level 1.
@@ -191,11 +191,11 @@ def _json_normalize(
... 'fitness': {'height': 130, 'weight': 60}},
... {'id': 2, 'name': 'Faye Raker',
... 'fitness': {'height': 130, 'weight': 60}}]
- >>> json_normalize(data, max_level=1)
- fitness.height fitness.weight id name
- 0 130 60 1.0 Cole Volk
- 1 130 60 NaN Mose Reg
- 2 130 60 2.0 Faye Raker
+ >>> pd.json_normalize(data, max_level=1)
+ id name fitness.height fitness.weight
+ 0 1.0 Cole Volk 130 60
+ 1 NaN Mose Reg 130 60
+ 2 2.0 Faye Raker 130 60
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
@@ -208,7 +208,7 @@ def _json_normalize(
... 'info': {'governor': 'John Kasich'},
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
- >>> result = json_normalize(data, 'counties', ['state', 'shortname',
+ >>> result = pd.json_normalize(data, 'counties', ['state', 'shortname',
... ['info', 'governor']])
>>> result
name population state shortname info.governor
@@ -219,7 +219,7 @@ def _json_normalize(
4 Cuyahoga 1337 Ohio OH John Kasich
>>> data = {'A': [1, 2]}
- >>> json_normalize(data, 'A', record_prefix='Prefix.')
+ >>> pd.json_normalize(data, 'A', record_prefix='Prefix.')
Prefix.0
0 1
1 2
diff --git a/pandas/io/orc.py b/pandas/io/orc.py
index ea79efd0579e5..f1b1aa6a43cb5 100644
--- a/pandas/io/orc.py
+++ b/pandas/io/orc.py
@@ -12,7 +12,7 @@
def read_orc(
- path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs,
+ path: FilePathOrBuffer, columns: Optional[List[str]] = None, **kwargs
) -> "DataFrame":
"""
Load an ORC object from the file path, returning a DataFrame.
@@ -50,7 +50,7 @@ def read_orc(
import pyarrow.orc
- path, _, _, _ = get_filepath_or_buffer(path)
- orc_file = pyarrow.orc.ORCFile(path)
+ ioargs = get_filepath_or_buffer(path)
+ orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer)
result = orc_file.read(columns=columns, **kwargs).to_pandas()
return result
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index a0c9242684f0f..07f2078931687 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -1,14 +1,15 @@
""" parquet compat """
-from typing import Any, Dict, Optional
+from typing import Any, AnyStr, Dict, List, Optional
from warnings import catch_warnings
+from pandas._typing import FilePathOrBuffer, StorageOptions
from pandas.compat._optional import import_optional_dependency
from pandas.errors import AbstractMethodError
from pandas import DataFrame, get_option
-from pandas.io.common import _expand_user, get_filepath_or_buffer, is_fsspec_url
+from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, stringify_path
def get_engine(engine: str) -> "BaseImpl":
@@ -85,10 +86,11 @@ def __init__(self):
def write(
self,
df: DataFrame,
- path,
- compression="snappy",
+ path: FilePathOrBuffer[AnyStr],
+ compression: Optional[str] = "snappy",
index: Optional[bool] = None,
- partition_cols=None,
+ storage_options: StorageOptions = None,
+ partition_cols: Optional[List[str]] = None,
**kwargs,
):
self.validate_dataframe(df)
@@ -104,10 +106,14 @@ def write(
import_optional_dependency("fsspec")
import fsspec.core
- fs, path = fsspec.core.url_to_fs(path)
+ fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
kwargs["filesystem"] = fs
else:
- path = _expand_user(path)
+ if storage_options:
+ raise ValueError(
+ "storage_options passed with file object or non-fsspec file path"
+ )
+ path = stringify_path(path)
if partition_cols is not None:
# writes to multiple files under the given path
self.api.parquet.write_to_dataset(
@@ -121,20 +127,28 @@ def write(
# write to single output file
self.api.parquet.write_table(table, path, compression=compression, **kwargs)
- def read(self, path, columns=None, **kwargs):
+ def read(
+ self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+ ):
if is_fsspec_url(path) and "filesystem" not in kwargs:
import_optional_dependency("fsspec")
import fsspec.core
- fs, path = fsspec.core.url_to_fs(path)
+ fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
should_close = False
else:
+ if storage_options:
+ raise ValueError(
+ "storage_options passed with buffer or non-fsspec filepath"
+ )
fs = kwargs.pop("filesystem", None)
should_close = False
- path = _expand_user(path)
+ path = stringify_path(path)
if not fs:
- path, _, _, should_close = get_filepath_or_buffer(path)
+ ioargs = get_filepath_or_buffer(path)
+ path = ioargs.filepath_or_buffer
+ should_close = ioargs.should_close
kwargs["use_pandas_metadata"] = True
result = self.api.parquet.read_table(
@@ -162,6 +176,7 @@ def write(
compression="snappy",
index=None,
partition_cols=None,
+ storage_options: StorageOptions = None,
**kwargs,
):
self.validate_dataframe(df)
@@ -184,9 +199,15 @@ def write(
fsspec = import_optional_dependency("fsspec")
# if filesystem is provided by fsspec, file must be opened in 'wb' mode.
- kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
+ kwargs["open_with"] = lambda path, _: fsspec.open(
+ path, "wb", **(storage_options or {})
+ ).open()
else:
- path, _, _, _ = get_filepath_or_buffer(path)
+ if storage_options:
+ raise ValueError(
+ "storage_options passed with file object or non-fsspec file path"
+ )
+ path = get_filepath_or_buffer(path).filepath_or_buffer
with catch_warnings(record=True):
self.api.write(
@@ -198,14 +219,18 @@ def write(
**kwargs,
)
- def read(self, path, columns=None, **kwargs):
+ def read(
+ self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+ ):
if is_fsspec_url(path):
fsspec = import_optional_dependency("fsspec")
- open_with = lambda path, _: fsspec.open(path, "rb").open()
+ open_with = lambda path, _: fsspec.open(
+ path, "rb", **(storage_options or {})
+ ).open()
parquet_file = self.api.ParquetFile(path, open_with=open_with)
else:
- path, _, _, _ = get_filepath_or_buffer(path)
+ path = get_filepath_or_buffer(path).filepath_or_buffer
parquet_file = self.api.ParquetFile(path)
return parquet_file.to_pandas(columns=columns, **kwargs)
@@ -213,11 +238,12 @@ def read(self, path, columns=None, **kwargs):
def to_parquet(
df: DataFrame,
- path,
+ path: FilePathOrBuffer[AnyStr],
engine: str = "auto",
- compression="snappy",
+ compression: Optional[str] = "snappy",
index: Optional[bool] = None,
- partition_cols=None,
+ storage_options: StorageOptions = None,
+ partition_cols: Optional[List[str]] = None,
**kwargs,
):
"""
@@ -226,9 +252,12 @@ def to_parquet(
Parameters
----------
df : DataFrame
- path : str
- File path or Root Directory path. Will be used as Root Directory path
- while writing a partitioned dataset.
+ path : str or file-like object
+ If a string, it will be used as Root Directory path
+ when writing a partitioned dataset. By file-like object,
+ we refer to objects with a write() method, such as a file handler
+ (e.g. via builtin open function) or io.BytesIO. The engine
+ fastparquet does not accept file-like objects.
.. versionchanged:: 0.24.0
@@ -251,11 +280,22 @@ def to_parquet(
.. versionadded:: 0.24.0
partition_cols : str or list, optional, default None
- Column names by which to partition the dataset
- Columns are partitioned in the order they are given
+ Column names by which to partition the dataset.
+ Columns are partitioned in the order they are given.
+ Must be None if path is not a string.
.. versionadded:: 0.24.0
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2.0
+
kwargs
Additional keyword arguments passed to the engine
"""
@@ -268,6 +308,7 @@ def to_parquet(
compression=compression,
index=index,
partition_cols=partition_cols,
+ storage_options=storage_options,
**kwargs,
)
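The corresponding parquet-side sketch (engine spelled out for clarity; the bucket is hypothetical):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df.to_parquet(
        "s3://my-bucket/frame.parquet",
        engine="pyarrow",
        storage_options={"anon": True},  # passed through to fsspec.core.url_to_fs
    )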
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index c427d3a198b10..54e836419e138 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -20,7 +20,7 @@
import pandas._libs.parsers as parsers
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
-from pandas._typing import FilePathOrBuffer, Union
+from pandas._typing import FilePathOrBuffer, StorageOptions, Union
from pandas.errors import (
AbstractMethodError,
EmptyDataError,
@@ -63,12 +63,7 @@
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools
-from pandas.io.common import (
- get_filepath_or_buffer,
- get_handle,
- infer_compression,
- validate_header_arg,
-)
+from pandas.io.common import get_filepath_or_buffer, get_handle, validate_header_arg
from pandas.io.date_converters import generic_parser
# BOM character (byte order mark)
@@ -343,9 +338,21 @@
option can improve performance because there is no longer any I/O overhead.
float_precision : str, optional
Specifies which converter the C engine should use for floating-point
- values. The options are `None` for the ordinary converter,
- `high` for the high-precision converter, and `round_trip` for the
- round-trip converter.
+ values. The options are ``None`` or 'high' for the ordinary converter,
+ 'legacy' for the original lower precision pandas converter, and
+ 'round_trip' for the round-trip converter.
+
+ .. versionchanged:: 1.2
+
+storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2
Returns
-------
@@ -366,7 +373,7 @@
)
-def _validate_integer(name, val, min_val=0):
+def validate_integer(name, val, min_val=0):
"""
Checks whether the 'name' parameter for parsing is either
an integer OR float that can SAFELY be cast to an integer
@@ -420,21 +427,16 @@ def _validate_names(names):
def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
"""Generic reader of line files."""
encoding = kwds.get("encoding", None)
+ storage_options = kwds.get("storage_options", None)
if encoding is not None:
encoding = re.sub("_", "-", encoding).lower()
kwds["encoding"] = encoding
-
compression = kwds.get("compression", "infer")
- compression = infer_compression(filepath_or_buffer, compression)
-
- # TODO: get_filepath_or_buffer could return
- # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
- # though mypy handling of conditional imports is difficult.
- # See https://github.com/python/mypy/issues/1297
- fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
- filepath_or_buffer, encoding, compression
+
+ ioargs = get_filepath_or_buffer(
+ filepath_or_buffer, encoding, compression, storage_options=storage_options
)
- kwds["compression"] = compression
+ kwds["compression"] = ioargs.compression
if kwds.get("date_parser", None) is not None:
if isinstance(kwds["parse_dates"], bool):
@@ -442,14 +444,14 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
# Extract some of the arguments (pass chunksize on).
iterator = kwds.get("iterator", False)
- chunksize = _validate_integer("chunksize", kwds.get("chunksize", None), 1)
+ chunksize = validate_integer("chunksize", kwds.get("chunksize", None), 1)
nrows = kwds.get("nrows", None)
# Check for duplicates in names.
_validate_names(kwds.get("names", None))
# Create the parser.
- parser = TextFileReader(fp_or_buf, **kwds)
+ parser = TextFileReader(ioargs.filepath_or_buffer, **kwds)
if chunksize or iterator:
return parser
@@ -459,9 +461,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
finally:
parser.close()
- if should_close:
+ if ioargs.should_close:
+ assert not isinstance(ioargs.filepath_or_buffer, str)
try:
- fp_or_buf.close()
+ ioargs.filepath_or_buffer.close()
except ValueError:
pass
@@ -521,7 +524,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds):
"float_precision": None,
}
-_fwf_defaults = {"colspecs": "infer", "infer_nrows": 100, "widths": None}
+_fwf_defaults = {"col_specs": "infer", "infer_nrows": 100, "col_widths": None}
_c_unsupported = {"skipfooter"}
_python_unsupported = {"low_memory", "float_precision"}
@@ -595,6 +598,7 @@ def read_csv(
low_memory=_c_parser_defaults["low_memory"],
memory_map=False,
float_precision=None,
+ storage_options: StorageOptions = None,
):
# gh-23761
#
@@ -681,6 +685,7 @@ def read_csv(
mangle_dupe_cols=mangle_dupe_cols,
infer_datetime_format=infer_datetime_format,
skip_blank_lines=skip_blank_lines,
+ storage_options=storage_options,
)
return _read(filepath_or_buffer, kwds)
@@ -757,8 +762,8 @@ def read_table(
def read_fwf(
filepath_or_buffer: FilePathOrBuffer,
- colspecs="infer",
- widths=None,
+ col_specs="infer",
+ col_widths=None,
infer_nrows=100,
**kwds,
):
@@ -785,18 +790,18 @@ def read_fwf(
By file-like object, we refer to objects with a ``read()`` method,
such as a file handler (e.g. via builtin ``open`` function)
or ``StringIO``.
- colspecs : list of tuple (int, int) or 'infer'. optional
+ col_specs : list of tuple (int, int) or 'infer', optional
A list of tuples giving the extents of the fixed-width
fields of each line as half-open intervals (i.e., [from, to[ ).
String value 'infer' can be used to instruct the parser to try
detecting the column specifications from the first 100 rows of
the data which are not being skipped via skiprows (default='infer').
- widths : list of int, optional
- A list of field widths which can be used instead of 'colspecs' if
+ col_widths : list of int, optional
+ A list of column widths which can be used instead of 'col_specs' if
the intervals are contiguous.
infer_nrows : int, default 100
The number of rows to consider when letting the parser determine the
- `colspecs`.
+ `col_specs`.
.. versionadded:: 0.24.0
**kwds : optional
@@ -818,19 +823,19 @@ def read_fwf(
>>> pd.read_fwf('data.csv') # doctest: +SKIP
"""
# Check input arguments.
- if colspecs is None and widths is None:
- raise ValueError("Must specify either colspecs or widths")
- elif colspecs not in (None, "infer") and widths is not None:
- raise ValueError("You must specify only one of 'widths' and 'colspecs'")
-
- # Compute 'colspecs' from 'widths', if specified.
- if widths is not None:
- colspecs, col = [], 0
- for w in widths:
- colspecs.append((col, col + w))
+ if col_specs is None and col_widths is None:
+ raise ValueError("Must specify either col_specs or col_widths")
+ elif col_specs not in (None, "infer") and col_widths is not None:
+ raise ValueError("You must specify only one of 'col_widths' and 'col specs'")
+
+ # Compute 'col_specs' from 'col_widths', if specified.
+ if col_widths is not None:
+ col_specs, col = [], 0
+ for w in col_widths:
+ col_specs.append((col, col + w))
col += w
- kwds["colspecs"] = colspecs
+ kwds["col_specs"] = col_specs
kwds["infer_nrows"] = infer_nrows
kwds["engine"] = "python-fwf"
return _read(filepath_or_buffer, kwds)
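A hedged sketch of the renamed fixed-width arguments as defined in this changeset ('data.txt' is hypothetical):

    import pandas as pd

    # explicit half-open column extents
    df = pd.read_fwf("data.txt", col_specs=[(0, 6), (6, 12)])
    # or contiguous widths, converted to col_specs internally
    df = pd.read_fwf("data.txt", col_widths=[6, 6])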
@@ -917,7 +922,6 @@ def __init__(self, f, engine=None, **kwds):
# miscellanea
self.engine = engine
- self._engine = None
self._currow = 0
options = self._get_options_with_defaults(engine)
@@ -926,14 +930,13 @@ def __init__(self, f, engine=None, **kwds):
self.nrows = options.pop("nrows", None)
self.squeeze = options.pop("squeeze", False)
- # might mutate self.engine
- self.engine = self._check_file_or_buffer(f, engine)
+ self._check_file_or_buffer(f, engine)
self.options, self.engine = self._clean_options(options, engine)
if "has_index_names" in kwds:
self.options["has_index_names"] = kwds["has_index_names"]
- self._make_engine(self.engine)
+ self._engine = self._make_engine(self.engine)
def close(self):
self._engine.close()
@@ -990,24 +993,21 @@ def _check_file_or_buffer(self, f, engine):
msg = "The 'python' engine cannot iterate through this file buffer."
raise ValueError(msg)
- return engine
-
def _clean_options(self, options, engine):
result = options.copy()
engine_specified = self._engine_specified
fallback_reason = None
- sep = options["delimiter"]
- delim_whitespace = options["delim_whitespace"]
-
# C engine not supported yet
if engine == "c":
if options["skipfooter"] > 0:
fallback_reason = "the 'c' engine does not support skipfooter"
engine = "python"
- encoding = sys.getfilesystemencoding() or "utf-8"
+ sep = options["delimiter"]
+ delim_whitespace = options["delim_whitespace"]
+
if sep is None and not delim_whitespace:
if engine == "c":
fallback_reason = (
@@ -1032,6 +1032,7 @@ def _clean_options(self, options, engine):
result["delimiter"] = r"\s+"
elif sep is not None:
encodeable = True
+ encoding = sys.getfilesystemencoding() or "utf-8"
try:
if len(sep.encode(encoding)) > 1:
encodeable = False
@@ -1164,29 +1165,26 @@ def __next__(self):
raise
def _make_engine(self, engine="c"):
- if engine == "c":
- self._engine = CParserWrapper(self.f, **self.options)
+ mapping = {
+ "c": CParserWrapper,
+ "python": PythonParser,
+ "python-fwf": FixedWidthFieldParser,
+ }
+ try:
+ klass = mapping[engine]
+ except KeyError:
+ raise ValueError(
+ f"Unknown engine: {engine} (valid options are {mapping.keys()})"
+ )
else:
- if engine == "python":
- klass = PythonParser
- elif engine == "python-fwf":
- klass = FixedWidthFieldParser
- else:
- raise ValueError(
- f"Unknown engine: {engine} (valid options "
- 'are "c", "python", or "python-fwf")'
- )
- self._engine = klass(self.f, **self.options)
+ return klass(self.f, **self.options)
def _failover_to_python(self):
raise AbstractMethodError(self)
def read(self, nrows=None):
- nrows = _validate_integer("nrows", nrows)
- ret = self._engine.read(nrows)
-
- # May alter columns / col_dict
- index, columns, col_dict = self._create_index(ret)
+ nrows = validate_integer("nrows", nrows)
+ index, columns, col_dict = self._engine.read(nrows)
if index is None:
if col_dict:
@@ -1206,10 +1204,6 @@ def read(self, nrows=None):
return df[df.columns[0]].copy()
return df
- def _create_index(self, ret):
- index, columns, col_dict = ret
- return index, columns, col_dict
-
def get_chunk(self, size=None):
if size is None:
size = self.chunksize
@@ -1614,7 +1608,7 @@ def extract(r):
# Clean the column names (if we have an index_col).
if len(ic):
col_names = [
- r[0] if (len(r[0]) and r[0] not in self.unnamed_cols) else None
+ r[0] if ((r[0] is not None) and r[0] not in self.unnamed_cols) else None
for r in header
]
else:
@@ -1964,10 +1958,6 @@ def _do_date_conversions(self, names, data):
class CParserWrapper(ParserBase):
- """
-
- """
-
def __init__(self, src, **kwds):
self.kwds = kwds
kwds = kwds.copy()
@@ -1976,6 +1966,10 @@ def __init__(self, src, **kwds):
encoding = kwds.get("encoding")
+ # parsers.TextReader doesn't support compression dicts
+ if isinstance(kwds.get("compression"), dict):
+ kwds["compression"] = kwds["compression"]["method"]
+
if kwds.get("compression") is None and encoding:
if isinstance(src, str):
src = open(src, "rb")
@@ -2158,9 +2152,7 @@ def read(self, nrows=None):
if self.usecols is not None:
columns = self._filter_usecols(columns)
- col_dict = dict(
- filter(lambda item: item[0] in columns, col_dict.items())
- )
+ col_dict = {k: v for k, v in col_dict.items() if k in columns}
return index, columns, col_dict
@@ -2301,9 +2293,11 @@ def TextParser(*args, **kwds):
can be inferred, there often will be a large parsing speed-up.
float_precision : str, optional
Specifies which converter the C engine should use for floating-point
- values. The options are None for the ordinary converter,
- 'high' for the high-precision converter, and 'round_trip' for the
- round-trip converter.
+ values. The options are `None` or `high` for the ordinary converter,
+ `legacy` for the original lower precision pandas converter, and
+ `round_trip` for the round-trip converter.
+
+ .. versionchanged:: 1.2
"""
kwds["engine"] = "python"
return TextFileReader(*args, **kwds)
@@ -2892,14 +2886,12 @@ def _check_for_bom(self, first_row):
# quotation mark.
if len(first_row_bom) > end + 1:
new_row += first_row_bom[end + 1 :]
- return [new_row] + first_row[1:]
- elif len(first_row_bom) > 1:
- return [first_row_bom[1:]]
else:
- # First row is just the BOM, so we
- # return an empty string.
- return [""]
+
+ # No quotation so just remove BOM from first element
+ new_row = first_row_bom[1:]
+ return [new_row] + first_row[1:]
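An illustration of the simplified branch above: when no quotation mark follows the BOM, the BOM is simply stripped from the first field:

    first_row = ["\ufeffname", "value"]
    new_row = first_row[0][1:]  # "name"
    assert [new_row] + first_row[1:] == ["name", "value"]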
def _is_line_empty(self, line):
"""
@@ -3658,30 +3650,32 @@ class FixedWidthReader(abc.Iterator):
A reader of fixed-width lines.
"""
- def __init__(self, f, colspecs, delimiter, comment, skiprows=None, infer_nrows=100):
+ def __init__(
+ self, f, col_specs, delimiter, comment, skiprows=None, infer_nrows=100
+ ):
self.f = f
self.buffer = None
self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
self.comment = comment
- if colspecs == "infer":
- self.colspecs = self.detect_colspecs(
+ if col_specs == "infer":
+ self.col_specs = self.detect_colspecs(
infer_nrows=infer_nrows, skiprows=skiprows
)
else:
- self.colspecs = colspecs
+ self.col_specs = col_specs
- if not isinstance(self.colspecs, (tuple, list)):
+ if not isinstance(self.col_specs, (tuple, list)):
raise TypeError(
"column specifications must be a list or tuple, "
- f"input was a {type(colspecs).__name__}"
+ f"input was a {type(col_specs).__name__}"
)
- for colspec in self.colspecs:
+ for col_spec in self.col_specs:
if not (
- isinstance(colspec, (tuple, list))
- and len(colspec) == 2
- and isinstance(colspec[0], (int, np.integer, type(None)))
- and isinstance(colspec[1], (int, np.integer, type(None)))
+ isinstance(col_spec, (tuple, list))
+ and len(col_spec) == 2
+ and isinstance(col_spec[0], (int, np.integer, type(None)))
+ and isinstance(col_spec[1], (int, np.integer, type(None)))
):
raise TypeError(
"Each column specification must be "
@@ -3755,8 +3749,8 @@ def __next__(self):
line = next(self.f)
else:
line = next(self.f)
- # Note: 'colspecs' is a sequence of half-open intervals.
- return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.colspecs]
+ # Note: 'col_specs' is a sequence of half-open intervals.
+ return [line[fromm:to].strip(self.delimiter) for (fromm, to) in self.col_specs]
class FixedWidthFieldParser(PythonParser):
@@ -3767,14 +3761,14 @@ class FixedWidthFieldParser(PythonParser):
def __init__(self, f, **kwds):
# Support iterators, convert to a list.
- self.colspecs = kwds.pop("colspecs")
+ self.col_specs = kwds.pop("col_specs")
self.infer_nrows = kwds.pop("infer_nrows")
PythonParser.__init__(self, f, **kwds)
def _make_reader(self, f):
self.data = FixedWidthReader(
f,
- self.colspecs,
+ self.col_specs,
self.delimiter,
self.comment,
self.skiprows,
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 3b35b54a6dc16..655deb5ca3779 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,9 +1,9 @@
""" pickle compat """
import pickle
-from typing import Any, Optional
+from typing import Any
import warnings
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import CompressionOptions, FilePathOrBuffer, StorageOptions
from pandas.compat import pickle_compat as pc
from pandas.io.common import get_filepath_or_buffer, get_handle
@@ -12,8 +12,9 @@
def to_pickle(
obj: Any,
filepath_or_buffer: FilePathOrBuffer,
- compression: Optional[str] = "infer",
+ compression: CompressionOptions = "infer",
protocol: int = pickle.HIGHEST_PROTOCOL,
+ storage_options: StorageOptions = None,
):
"""
Pickle (serialize) object to file.
@@ -42,6 +43,16 @@ def to_pickle(
protocol parameter is equivalent to setting its value to
HIGHEST_PROTOCOL.
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2.0
+
.. [1] https://docs.python.org/3/library/pickle.html
See Also
@@ -75,29 +86,37 @@ def to_pickle(
>>> import os
>>> os.remove("./dummy.pkl")
"""
- fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
- filepath_or_buffer, compression=compression, mode="wb"
+ ioargs = get_filepath_or_buffer(
+ filepath_or_buffer,
+ compression=compression,
+ mode="wb",
+ storage_options=storage_options,
+ )
+ f, fh = get_handle(
+ ioargs.filepath_or_buffer, "wb", compression=ioargs.compression, is_text=False
)
- if not isinstance(fp_or_buf, str) and compression == "infer":
- compression = None
- f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False)
if protocol < 0:
protocol = pickle.HIGHEST_PROTOCOL
try:
f.write(pickle.dumps(obj, protocol=protocol))
finally:
- f.close()
+ if f != filepath_or_buffer:
+ # do not close user-provided file objects GH 35679
+ f.close()
for _f in fh:
_f.close()
- if should_close:
+ if ioargs.should_close:
+ assert not isinstance(ioargs.filepath_or_buffer, str)
try:
- fp_or_buf.close()
+ ioargs.filepath_or_buffer.close()
except ValueError:
pass
def read_pickle(
- filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer"
+ filepath_or_buffer: FilePathOrBuffer,
+ compression: CompressionOptions = "infer",
+ storage_options: StorageOptions = None,
):
"""
Load pickled pandas object (or any object) from file.
@@ -121,6 +140,16 @@ def read_pickle(
compression) If 'infer' and 'path_or_url' is not path-like, then use
None (= no decompression).
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2.0
+
Returns
-------
unpickled : same type as object stored in file
@@ -161,12 +190,12 @@ def read_pickle(
>>> import os
>>> os.remove("./dummy.pkl")
"""
- fp_or_buf, _, compression, should_close = get_filepath_or_buffer(
- filepath_or_buffer, compression=compression
+ ioargs = get_filepath_or_buffer(
+ filepath_or_buffer, compression=compression, storage_options=storage_options
+ )
+ f, fh = get_handle(
+ ioargs.filepath_or_buffer, "rb", compression=ioargs.compression, is_text=False
)
- if not isinstance(fp_or_buf, str) and compression == "infer":
- compression = None
- f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False)
# 1) try standard library Pickle
# 2) try pickle_compat (older pandas version) to handle subclass changes
@@ -189,11 +218,14 @@ def read_pickle(
# e.g. can occur for files written in py27; see GH#28645 and GH#31988
return pc.load(f, encoding="latin-1")
finally:
- f.close()
+ if f != filepath_or_buffer:
+ # do not close user-provided file objects GH 35679
+ f.close()
for _f in fh:
_f.close()
- if should_close:
+ if ioargs.should_close:
+ assert not isinstance(ioargs.filepath_or_buffer, str)
try:
- fp_or_buf.close()
+ ioargs.filepath_or_buffer.close()
except ValueError:
pass
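A hedged round-trip sketch with the new ``storage_options`` argument on both pickle entry points (URL is hypothetical):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df.to_pickle("s3://my-bucket/frame.pkl", storage_options={"anon": True})
    out = pd.read_pickle("s3://my-bucket/frame.pkl", storage_options={"anon": True})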
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
index 0e5d7b007bd89..5e5a89d96f0e5 100644
--- a/pandas/io/pytables.py
+++ b/pandas/io/pytables.py
@@ -16,7 +16,7 @@
from pandas._libs import lib, writers as libwriters
from pandas._libs.tslibs import timezones
-from pandas._typing import ArrayLike, FrameOrSeries, Label
+from pandas._typing import ArrayLike, FrameOrSeries, FrameOrSeriesUnion, Label
from pandas.compat._optional import import_optional_dependency
from pandas.compat.pickle_compat import patch_pickle
from pandas.errors import PerformanceWarning
@@ -57,7 +57,7 @@
from pandas.io.formats.printing import adjoin, pprint_thing
if TYPE_CHECKING:
- from tables import File, Node, Col # noqa:F401
+ from tables import Col, File, Node # noqa:F401
# versioning attribute
@@ -99,22 +99,20 @@ def _ensure_str(name):
def _ensure_term(where, scope_level: int):
"""
- ensure that the where is a Term or a list of Term
- this makes sure that we are capturing the scope of variables
- that are passed
- create the terms here with a frame_level=2 (we are 2 levels down)
+ Ensure that the where is a Term or a list of Term.
+
+ This makes sure that we are capturing the scope of variables that are
+ passed. The terms are created here with a frame_level=2 (we are 2 levels down).
"""
# only consider list/tuple here as an ndarray is automatically a coordinate
# list
level = scope_level + 1
if isinstance(where, (list, tuple)):
- wlist = []
- for w in filter(lambda x: x is not None, where):
- if not maybe_expression(w):
- wlist.append(w)
- else:
- wlist.append(Term(w, scope_level=level))
- where = wlist
+ where = [
+ Term(term, scope_level=level + 1) if maybe_expression(term) else term
+ for term in where
+ if term is not None
+ ]
elif maybe_expression(where):
where = Term(where, scope_level=level)
return where if where is None or len(where) else None
@@ -289,7 +287,15 @@ def read_hdf(
Read from the store, close it if we opened it.
Retrieve pandas object stored in file, optionally based on where
- criteria
+ criteria.
+
+ .. warning::
+
+ Pandas uses PyTables for reading and writing HDF5 files, which allows
+ serializing object-dtype data with pickle when using the "fixed" format.
+ Loading pickled data received from untrusted sources can be unsafe.
+
+ See: https://docs.python.org/3/library/pickle.html for more.
Parameters
----------
@@ -312,6 +318,10 @@ def read_hdf(
mode : {'r', 'r+', 'a'}, default 'r'
Mode to use when opening the file. Ignored if path_or_buf is a
:class:`pandas.HDFStore`. Default is 'r'.
+ errors : str, default 'strict'
+ Specifies how encoding and decoding errors are to be handled.
+ See the errors argument for :func:`open` for a full list
+ of options.
where : list, optional
A list of Term (or convertible) objects.
start : int, optional
@@ -324,10 +334,6 @@ def read_hdf(
Return an iterator object.
chunksize : int, optional
Number of rows to include in an iteration when using an iterator.
- errors : str, default 'strict'
- Specifies how encoding and decoding errors are to be handled.
- See the errors argument for :func:`open` for a full list
- of options.
**kwargs
Additional keyword arguments passed to HDFStore.
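A hedged usage sketch of the parameters documented above (the file name is illustrative): ``where`` criteria only apply to data written in the queryable ``table`` format.

>>> import pandas as pd
>>> pd.DataFrame({"A": range(5)}).to_hdf("store.h5", "df", format="table")
>>> pd.read_hdf("store.h5", "df", where="index > 2", mode="r")
   A
3  3
4  4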
@@ -358,7 +364,7 @@ def read_hdf(
if isinstance(path_or_buf, HDFStore):
if not path_or_buf.is_open:
- raise IOError("The HDFStore must be open for reading.")
+ raise OSError("The HDFStore must be open for reading.")
store = path_or_buf
auto_close = False
@@ -445,6 +451,14 @@ class HDFStore:
Either Fixed or Table format.
+ .. warning::
+
+ Pandas uses PyTables for reading and writing HDF5 files, which allows
+ serializing object-dtype data with pickle when using the "fixed" format.
+ Loading pickled data received from untrusted sources can be unsafe.
+
+ See: https://docs.python.org/3/library/pickle.html for more.
+
Parameters
----------
path : str
@@ -679,7 +693,7 @@ def open(self, mode: str = "a", **kwargs):
try:
self._handle = tables.open_file(self._path, self._mode, **kwargs)
- except IOError as err: # pragma: no cover
+ except OSError as err: # pragma: no cover
if "can not be written" in str(err):
print(f"Opening {self._path} in read-only mode")
self._handle = tables.open_file(self._path, "r", **kwargs)
@@ -710,7 +724,7 @@ def open(self, mode: str = "a", **kwargs):
# trying to read from a non-existent file causes an error which
# is not part of IOError, make it one
if self._mode == "r" and "Unable to open/create file" in str(err):
- raise IOError(str(err)) from err
+ raise OSError(str(err)) from err
raise
def close(self):
@@ -789,6 +803,14 @@ def select(
"""
Retrieve pandas object stored in file, optionally based on where criteria.
+ .. warning::
+
+ Pandas uses PyTables for reading and writing HDF5 files, which allows
+ serializing object-dtype data with pickle when using the "fixed" format.
+ Loading pickled data received from untrusted sources can be unsafe.
+
+ See: https://docs.python.org/3/library/pickle.html for more.
+
Parameters
----------
key : str
@@ -852,6 +874,15 @@ def select_as_coordinates(
"""
return the selection as an Index
+ .. warning::
+
+ Pandas uses PyTables for reading and writing HDF5 files, which allows
+ serializing object-dtype data with pickle when using the "fixed" format.
+ Loading pickled data received from untrusted sources can be unsafe.
+
+ See: https://docs.python.org/3/library/pickle.html for more.
+
Parameters
----------
key : str
@@ -876,6 +907,14 @@ def select_column(
return a single column from the table. This is generally only useful to
select an indexable
+ .. warning::
+
+ Pandas uses PyTables for reading and writing HDF5 files, which allows
+ serializing object-dtype data with pickle when using the "fixed" format.
+ Loading pickled data received from untrusted sources can be unsafe.
+
+ See: https://docs.python.org/3/library/pickle.html for more.
+
Parameters
----------
key : str
@@ -912,6 +951,14 @@ def select_as_multiple(
"""
Retrieve pandas objects from multiple tables.
+ .. warning::
+
+ Pandas uses PyTables for reading and writing HDF5 files, which allows
+ serializing object-dtype data with pickle when using the "fixed" format.
+ Loading pickled data received from untrusted sources can be unsafe.
+
+ See: https://docs.python.org/3/library/pickle.html for more.
+
Parameters
----------
keys : a list of the tables
@@ -2280,7 +2327,8 @@ def _get_atom(cls, values: ArrayLike) -> "Col":
Get an appropriately typed and shaped pytables.Col object for values.
"""
dtype = values.dtype
- itemsize = dtype.itemsize # type: ignore
+ # error: "ExtensionDtype" has no attribute "itemsize"
+ itemsize = dtype.itemsize # type: ignore[attr-defined]
shape = values.shape
if values.ndim == 1:
@@ -2518,7 +2566,7 @@ class Fixed:
pandas_kind: str
format_type: str = "fixed" # GH#30962 needed by dask
- obj_type: Type[Union[DataFrame, Series]]
+ obj_type: Type[FrameOrSeriesUnion]
ndim: int
encoding: str
parent: HDFStore
@@ -2883,7 +2931,7 @@ def read_index_node(
# If the index was an empty array write_array_empty() will
# have written a sentinel. Here we replace it with the original.
if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
- data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type,)
+ data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
kind = _ensure_decoded(node._v_attrs.kind)
name = None
@@ -3349,9 +3397,9 @@ def queryables(self) -> Dict[str, Any]:
(v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
]
- return dict(d1 + d2 + d3) # type: ignore
- # error: List comprehension has incompatible type
- # List[Tuple[Any, None]]; expected List[Tuple[str, IndexCol]]
+ # error: Unsupported operand types for + ("List[Tuple[str, IndexCol]]"
+ # and "List[Tuple[str, None]]")
+ return dict(d1 + d2 + d3) # type: ignore[operator]
def index_cols(self):
""" return a list of my index cols """
@@ -3569,7 +3617,6 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
for c in columns:
v = getattr(table.cols, c, None)
if v is not None:
-
# remove the index if the kind/optlevel have changed
if v.is_indexed:
index = v.index
@@ -3597,6 +3644,13 @@ def create_index(self, columns=None, optlevel=None, kind: Optional[str] = None):
"data_columns when initializing the table."
)
v.create_index(**kw)
+ elif c in self.non_index_axes[0][1]:
+ # GH 28156
+ raise AttributeError(
+ f"column {c} is not a data_column.\n"
+ f"In order to read column {c} you must reload the dataframe \n"
+ f"into HDFStore and include {c} with the data_columns argument."
+ )
def _read_axes(
self, where, start: Optional[int] = None, stop: Optional[int] = None
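A sketch of the new error path added above (GH 28156); the store and column names are illustrative. Requesting an index on a column that was not declared via ``data_columns`` now raises instead of silently doing nothing.

>>> import pandas as pd
>>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
>>> with pd.HDFStore("demo.h5") as store:
...     store.append("df", df, data_columns=["B"])
...     store.create_table_index("df", columns=["B"])  # ok: B is a data_column
...     store.create_table_index("df", columns=["A"])  # A is not
Traceback (most recent call last):
    ...
AttributeError: column A is not a data_column. ...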
@@ -4049,7 +4103,7 @@ def create_description(
return d
def read_coordinates(
- self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
+ self, where=None, start: Optional[int] = None, stop: Optional[int] = None
):
"""
select coordinates (row numbers) from a table; return the
@@ -4320,7 +4374,7 @@ def write_data_chunk(
self.table.flush()
def delete(
- self, where=None, start: Optional[int] = None, stop: Optional[int] = None,
+ self, where=None, start: Optional[int] = None, stop: Optional[int] = None
):
# delete all rows (and return the nrows)
@@ -4388,7 +4442,7 @@ class AppendableFrameTable(AppendableTable):
pandas_kind = "frame_table"
table_type = "appendable_frame"
ndim = 2
- obj_type: Type[Union[DataFrame, Series]] = DataFrame
+ obj_type: Type[FrameOrSeriesUnion] = DataFrame
@property
def is_transposed(self) -> bool:
@@ -4708,7 +4762,7 @@ def _set_tz(
if tz is not None:
name = getattr(values, "name", None)
values = values.ravel()
- tz = timezones.get_timezone(_ensure_decoded(tz))
+ tz = _ensure_decoded(tz)
values = DatetimeIndex(values, name=name)
values = values.tz_localize("UTC").tz_convert(tz)
elif coerce:
@@ -4751,7 +4805,7 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index
if inferred_type == "date":
converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
return IndexCol(
- name, converted, "date", _tables().Time32Col(), index_name=index_name,
+ name, converted, "date", _tables().Time32Col(), index_name=index_name
)
elif inferred_type == "string":
@@ -4767,13 +4821,13 @@ def _convert_index(name: str, index: Index, encoding: str, errors: str) -> Index
elif inferred_type in ["integer", "floating"]:
return IndexCol(
- name, values=converted, kind=kind, typ=atom, index_name=index_name,
+ name, values=converted, kind=kind, typ=atom, index_name=index_name
)
else:
assert isinstance(converted, np.ndarray) and converted.dtype == object
assert kind == "object", kind
atom = _tables().ObjectAtom()
- return IndexCol(name, converted, kind, atom, index_name=index_name,)
+ return IndexCol(name, converted, kind, atom, index_name=index_name)
def _unconvert_index(
diff --git a/pandas/io/sas/sas.pyx b/pandas/io/sas/sas.pyx
index 0038e39e2ffcc..17b41fd2b4379 100644
--- a/pandas/io/sas/sas.pyx
+++ b/pandas/io/sas/sas.pyx
@@ -1,8 +1,8 @@
# cython: profile=False
# cython: boundscheck=False, initializedcheck=False
from cython import Py_ssize_t
-
import numpy as np
+
import pandas.io.sas.sas_constants as const
ctypedef signed long long int64_t
diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py
index 3d9be7c15726b..f2ee642d8fd42 100644
--- a/pandas/io/sas/sas7bdat.py
+++ b/pandas/io/sas/sas7bdat.py
@@ -137,13 +137,17 @@ def __init__(
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0
- self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf)
+ self._path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer
if isinstance(self._path_or_buf, str):
self._path_or_buf = open(self._path_or_buf, "rb")
self.handle = self._path_or_buf
- self._get_properties()
- self._parse_metadata()
+ try:
+ self._get_properties()
+ self._parse_metadata()
+ except Exception:
+ self.close()
+ raise
def column_data_lengths(self):
"""Return a numpy int64 array of the column data lengths"""
diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py
index 7fc1bc6d3eb6c..9727ec930119b 100644
--- a/pandas/io/sas/sas_xport.py
+++ b/pandas/io/sas/sas_xport.py
@@ -253,12 +253,9 @@ def __init__(
self._chunksize = chunksize
if isinstance(filepath_or_buffer, str):
- (
- filepath_or_buffer,
- encoding,
- compression,
- should_close,
- ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding)
+ filepath_or_buffer = get_filepath_or_buffer(
+ filepath_or_buffer, encoding=encoding
+ ).filepath_or_buffer
if isinstance(filepath_or_buffer, (str, bytes)):
self.filepath_or_buffer = open(filepath_or_buffer, "rb")
@@ -267,7 +264,11 @@ def __init__(
# should already be opened in binary mode in Python 3.
self.filepath_or_buffer = filepath_or_buffer
- self._read_header()
+ try:
+ self._read_header()
+ except Exception:
+ self.close()
+ raise
def close(self):
self.filepath_or_buffer.close()
diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py
index 291c9d1ee7f0c..31d1a6ad471ea 100644
--- a/pandas/io/sas/sasreader.py
+++ b/pandas/io/sas/sasreader.py
@@ -6,7 +6,7 @@
from pandas._typing import FilePathOrBuffer, Label
-from pandas.io.common import stringify_path
+from pandas.io.common import get_filepath_or_buffer, stringify_path
if TYPE_CHECKING:
from pandas import DataFrame # noqa: F401
@@ -109,18 +109,26 @@ def read_sas(
else:
raise ValueError("unable to infer format of SAS file")
+ ioargs = get_filepath_or_buffer(filepath_or_buffer, encoding)
+
reader: ReaderBase
if format.lower() == "xport":
from pandas.io.sas.sas_xport import XportReader
reader = XportReader(
- filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
+ ioargs.filepath_or_buffer,
+ index=index,
+ encoding=ioargs.encoding,
+ chunksize=chunksize,
)
elif format.lower() == "sas7bdat":
from pandas.io.sas.sas7bdat import SAS7BDATReader
reader = SAS7BDATReader(
- filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize
+ ioargs.filepath_or_buffer,
+ index=index,
+ encoding=ioargs.encoding,
+ chunksize=chunksize,
)
else:
raise ValueError("unknown SAS format")
@@ -128,6 +136,8 @@ def read_sas(
if iterator or chunksize:
return reader
- data = reader.read()
- reader.close()
- return data
+ try:
+ return reader.read()
+ finally:
+ if ioargs.should_close:
+ reader.close()
diff --git a/pandas/io/spss.py b/pandas/io/spss.py
index 9605faeb36590..79cdfbf15392a 100644
--- a/pandas/io/spss.py
+++ b/pandas/io/spss.py
@@ -7,6 +7,8 @@
from pandas.core.api import DataFrame
+from pandas.io.common import stringify_path
+
def read_spss(
path: Union[str, Path],
@@ -40,6 +42,6 @@ def read_spss(
usecols = list(usecols) # pyreadstat requires a list
df, _ = pyreadstat.read_sav(
- path, usecols=usecols, apply_value_formats=convert_categoricals
+ stringify_path(path), usecols=usecols, apply_value_formats=convert_categoricals
)
return df
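With ``stringify_path`` applied to ``path``, path-like objects now work as the annotated signature promises; a small sketch assuming an existing ``survey.sav``:

>>> from pathlib import Path
>>> import pandas as pd
>>> df = pd.read_spss(Path("survey.sav"))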
diff --git a/pandas/io/sql.py b/pandas/io/sql.py
index b137608475b3d..51888e5021d80 100644
--- a/pandas/io/sql.py
+++ b/pandas/io/sql.py
@@ -439,7 +439,8 @@ def read_sql(
con : SQLAlchemy connectable, str, or sqlite3 connection
Using SQLAlchemy makes it possible to use any DB supported by that
library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
- for engine disposal and connection closure for the SQLAlchemy connectable. See
+ for engine disposal and connection closure for the SQLAlchemy connectable; str
+ connections are closed automatically. See
`here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
index_col : str or list of str, optional, default: None
Column(s) to set as index(MultiIndex).
@@ -937,7 +938,7 @@ def _get_column_names_and_types(self, dtype_mapper):
return column_names_and_types
def _create_table_setup(self):
- from sqlalchemy import Table, Column, PrimaryKeyConstraint
+ from sqlalchemy import Column, PrimaryKeyConstraint, Table
column_names_and_types = self._get_column_names_and_types(self._sqlalchemy_type)
@@ -1026,15 +1027,15 @@ def _sqlalchemy_type(self, col):
col_type = lib.infer_dtype(col, skipna=True)
from sqlalchemy.types import (
+ TIMESTAMP,
BigInteger,
- Integer,
- Float,
- Text,
Boolean,
- DateTime,
Date,
+ DateTime,
+ Float,
+ Integer,
+ Text,
Time,
- TIMESTAMP,
)
if col_type == "datetime64" or col_type == "datetime":
@@ -1079,7 +1080,7 @@ def _sqlalchemy_type(self, col):
return Text
def _get_dtype(self, sqltype):
- from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP
+ from sqlalchemy.types import TIMESTAMP, Boolean, Date, DateTime, Float, Integer
if isinstance(sqltype, Float):
return float
@@ -1374,7 +1375,7 @@ def to_sql(
dtype = {col_name: dtype for col_name in frame}
if dtype is not None:
- from sqlalchemy.types import to_instance, TypeEngine
+ from sqlalchemy.types import TypeEngine, to_instance
for col, my_type in dtype.items():
if not isinstance(to_instance(my_type), TypeEngine):
@@ -1391,7 +1392,20 @@ def to_sql(
dtype=dtype,
)
table.create()
- table.insert(chunksize, method=method)
+
+ from sqlalchemy import exc
+
+ try:
+ table.insert(chunksize, method=method)
+ except exc.SQLAlchemyError as err:
+ # GH34431
+ msg = "(1054, \"Unknown column 'inf' in 'field list'\")"
+ err_text = str(err.orig)
+ if re.search(msg, err_text):
+ raise ValueError("inf cannot be used with MySQL") from err
+ else:
+ raise err
+
if not name.isdigit() and not name.islower():
# check for potentially case sensitivity issues (GH7815)
# Only check when name is not a number and name is not lower case
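A sketch of the new guard (GH34431); the connection string is illustrative and assumes a MySQL backend. Writing a frame containing ``inf`` previously surfaced a cryptic "Unknown column 'inf'" error from the server and now raises a clear ``ValueError``:

>>> import numpy as np
>>> import pandas as pd
>>> from sqlalchemy import create_engine
>>> engine = create_engine("mysql+pymysql://user:pw@localhost/db")  # illustrative
>>> pd.DataFrame({"x": [1.0, np.inf]}).to_sql("t", engine)
Traceback (most recent call last):
    ...
ValueError: inf cannot be used with MySQL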
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 7677d8a94d521..a8af84e42918d 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -11,23 +11,12 @@
"""
from collections import abc
import datetime
-from io import BytesIO, IOBase
+from io import BytesIO
import os
from pathlib import Path
import struct
import sys
-from typing import (
- Any,
- AnyStr,
- BinaryIO,
- Dict,
- List,
- Mapping,
- Optional,
- Sequence,
- Tuple,
- Union,
-)
+from typing import Any, AnyStr, BinaryIO, Dict, List, Optional, Sequence, Tuple, Union
import warnings
from dateutil.relativedelta import relativedelta
@@ -35,7 +24,7 @@
from pandas._libs.lib import infer_dtype
from pandas._libs.writers import max_len_string_array
-from pandas._typing import FilePathOrBuffer, Label
+from pandas._typing import CompressionOptions, FilePathOrBuffer, Label, StorageOptions
from pandas.util._decorators import Appender
from pandas.core.dtypes.common import (
@@ -58,13 +47,7 @@
from pandas.core.indexes.base import Index
from pandas.core.series import Series
-from pandas.io.common import (
- get_compression_method,
- get_filepath_or_buffer,
- get_handle,
- infer_compression,
- stringify_path,
-)
+from pandas.io.common import get_filepath_or_buffer, get_handle, stringify_path
_version_error = (
"Version of given Stata file is {version}. pandas supports importing "
@@ -181,8 +164,6 @@
path_or_buf : path (string), buffer or path object
string, path object (pathlib.Path or py._path.local.LocalPath) or object
implementing a binary read() functions.
-
- .. versionadded:: 0.23.0 support for pathlib, py.path.
{_statafile_processing_params1}
{_statafile_processing_params2}
{_chunksize_params}
@@ -1035,6 +1016,7 @@ def __init__(
columns: Optional[Sequence[str]] = None,
order_categoricals: bool = True,
chunksize: Optional[int] = None,
+ storage_options: StorageOptions = None,
):
super().__init__()
self.col_sizes: List[int] = []
@@ -1068,13 +1050,16 @@ def __init__(
self._native_byteorder = _set_endianness(sys.byteorder)
path_or_buf = stringify_path(path_or_buf)
if isinstance(path_or_buf, str):
- path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf)
+ path_or_buf = get_filepath_or_buffer(
+ path_or_buf, storage_options=storage_options
+ ).filepath_or_buffer
if isinstance(path_or_buf, (str, bytes)):
self.path_or_buf = open(path_or_buf, "rb")
- elif isinstance(path_or_buf, IOBase):
+ elif hasattr(path_or_buf, "read"):
# Copy to BytesIO, and ensure no encoding
- contents = path_or_buf.read()
+ pb: Any = path_or_buf
+ contents = pb.read()
self.path_or_buf = BytesIO(contents)
self._read_header()
@@ -1092,7 +1077,7 @@ def close(self) -> None:
""" close the handle if its open """
try:
self.path_or_buf.close()
- except IOError:
+ except OSError:
pass
def _set_encoding(self) -> None:
@@ -1643,8 +1628,7 @@ def read(
data = self._insert_strls(data)
- cols_ = np.where(self.dtyplist)[0]
-
+ cols_ = np.where([dtyp is not None for dtyp in self.dtyplist])[0]
# Convert columns (if needed) to match input type
ix = data.index
requires_type_conversion = False
@@ -1907,6 +1891,7 @@ def read_stata(
order_categoricals: bool = True,
chunksize: Optional[int] = None,
iterator: bool = False,
+ storage_options: StorageOptions = None,
) -> Union[DataFrame, StataReader]:
reader = StataReader(
@@ -1919,6 +1904,7 @@ def read_stata(
columns=columns,
order_categoricals=order_categoricals,
chunksize=chunksize,
+ storage_options=storage_options,
)
if iterator or chunksize:
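A hedged sketch of the new ``storage_options`` pass-through (bucket and key are illustrative), assuming ``fsspec``/``s3fs`` are installed:

>>> import pandas as pd
>>> df = pd.read_stata(
...     "s3://my-bucket/data.dta",
...     storage_options={"anon": True},
... )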
@@ -1932,8 +1918,10 @@ def read_stata(
def _open_file_binary_write(
- fname: FilePathOrBuffer, compression: Union[str, Mapping[str, str], None],
-) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]:
+ fname: FilePathOrBuffer,
+ compression: CompressionOptions,
+ storage_options: StorageOptions = None,
+) -> Tuple[BinaryIO, bool, CompressionOptions]:
"""
Open a binary file or no-op if file-like.
@@ -1944,6 +1932,16 @@ def _open_file_binary_write(
compression : {str, dict, None}
The compression method to use.
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2.0
+
Returns
-------
file : file-like object
@@ -1953,21 +1951,22 @@ def _open_file_binary_write(
"""
if hasattr(fname, "write"):
# See https://github.com/python/mypy/issues/1424 for hasattr challenges
- return fname, False, None # type: ignore
+ # error: Incompatible return value type (got "Tuple[Union[str, Path,
+ # IO[Any]], bool, None]", expected "Tuple[BinaryIO, bool, Union[str,
+ # Mapping[str, str], None]]")
+ return fname, False, None # type: ignore[return-value]
elif isinstance(fname, (str, Path)):
# Extract compression mode as given, if dict
- compression_typ, compression_args = get_compression_method(compression)
- compression_typ = infer_compression(fname, compression_typ)
- path_or_buf, _, compression_typ, _ = get_filepath_or_buffer(
- fname, compression=compression_typ
+ ioargs = get_filepath_or_buffer(
+ fname, mode="wb", compression=compression, storage_options=storage_options
)
- if compression_typ is not None:
- compression = compression_args
- compression["method"] = compression_typ
- else:
- compression = None
- f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False)
- return f, True, compression
+ f, _ = get_handle(
+ ioargs.filepath_or_buffer,
+ "wb",
+ compression=ioargs.compression,
+ is_text=False,
+ )
+ return f, True, ioargs.compression
else:
raise TypeError("fname must be a binary file, buffer or path-like.")
@@ -2121,9 +2120,6 @@ class StataWriter(StataParser):
object implementing a binary write() functions. If using a buffer
then the buffer will not be automatically closed after the file
is written.
-
- .. versionadded:: 0.23.0 support for pathlib, py.path.
-
data : DataFrame
Input to save
convert_dates : dict
@@ -2156,6 +2152,16 @@ class StataWriter(StataParser):
.. versionadded:: 1.1.0
+ storage_options : dict, optional
+ Extra options that make sense for a particular storage connection, e.g.
+ host, port, username, password, etc., if using a URL that will
+ be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+ will be raised if providing this argument with a local path or
+ a file-like buffer. See the fsspec and backend storage implementation
+ docs for the set of allowed keys and values.
+
+ .. versionadded:: 1.2.0
+
Returns
-------
writer : StataWriter instance
@@ -2204,7 +2210,8 @@ def __init__(
time_stamp: Optional[datetime.datetime] = None,
data_label: Optional[str] = None,
variable_labels: Optional[Dict[Label, str]] = None,
- compression: Union[str, Mapping[str, str], None] = "infer",
+ compression: CompressionOptions = "infer",
+ storage_options: StorageOptions = None,
):
super().__init__()
self._convert_dates = {} if convert_dates is None else convert_dates
@@ -2217,6 +2224,7 @@ def __init__(
self._output_file: Optional[BinaryIO] = None
# attach nobs, nvars, data, varlist, typlist
self._prepare_pandas(data)
+ self.storage_options = storage_options
if byteorder is None:
byteorder = sys.byteorder
@@ -2503,7 +2511,7 @@ def _encode_strings(self) -> None:
def write_file(self) -> None:
self._file, self._own_file, compression = _open_file_binary_write(
- self._fname, self._compression
+ self._fname, self._compression, storage_options=self.storage_options
)
if compression is not None:
self._output_file = self._file
@@ -2987,8 +2995,6 @@ class StataWriter117(StataWriter):
"""
A class for writing Stata binary dta files in Stata 13 format (117)
- .. versionadded:: 0.23.0
-
Parameters
----------
fname : path (string), buffer or path object
@@ -3085,7 +3091,8 @@ def __init__(
data_label: Optional[str] = None,
variable_labels: Optional[Dict[Label, str]] = None,
convert_strl: Optional[Sequence[Label]] = None,
- compression: Union[str, Mapping[str, str], None] = "infer",
+ compression: CompressionOptions = "infer",
+ storage_options: StorageOptions = None,
):
# Copy to new list since convert_strl might be modified later
self._convert_strl: List[Label] = []
@@ -3102,6 +3109,7 @@ def __init__(
data_label=data_label,
variable_labels=variable_labels,
compression=compression,
+ storage_options=storage_options,
)
self._map: Dict[str, int] = {}
self._strl_blob = b""
@@ -3488,7 +3496,8 @@ def __init__(
variable_labels: Optional[Dict[Label, str]] = None,
convert_strl: Optional[Sequence[Label]] = None,
version: Optional[int] = None,
- compression: Union[str, Mapping[str, str], None] = "infer",
+ compression: CompressionOptions = "infer",
+ storage_options: StorageOptions = None,
):
if version is None:
version = 118 if data.shape[1] <= 32767 else 119
@@ -3511,6 +3520,7 @@ def __init__(
variable_labels=variable_labels,
convert_strl=convert_strl,
compression=compression,
+ storage_options=storage_options,
)
# Override version set in StataWriter117 init
self._dta_version = version
diff --git a/pandas/plotting/_core.py b/pandas/plotting/_core.py
index 3a8cc5c299640..d02f12a8e1029 100644
--- a/pandas/plotting/_core.py
+++ b/pandas/plotting/_core.py
@@ -542,12 +542,8 @@ def boxplot_frame_groupby(
The layout of the plot: (rows, columns).
sharex : bool, default False
Whether x-axes will be shared among subplots.
-
- .. versionadded:: 0.23.1
sharey : bool, default True
Whether y-axes will be shared among subplots.
-
- .. versionadded:: 0.23.1
backend : str, default None
Backend to use instead of the backend specified in the option
``plotting.backend``. For instance, 'matplotlib'. Alternatively, to
@@ -567,17 +563,25 @@ def boxplot_frame_groupby(
Examples
--------
- >>> import itertools
- >>> tuples = [t for t in itertools.product(range(1000), range(4))]
- >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
- >>> data = np.random.randn(len(index),4)
- >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
- >>>
- >>> grouped = df.groupby(level='lvl1')
- >>> boxplot_frame_groupby(grouped)
- >>>
- >>> grouped = df.unstack(level='lvl1').groupby(level=0, axis=1)
- >>> boxplot_frame_groupby(grouped, subplots=False)
+ You can create boxplots for grouped data and show them as separate subplots:
+
+ .. plot::
+ :context: close-figs
+
+ >>> import itertools
+ >>> tuples = [t for t in itertools.product(range(1000), range(4))]
+ >>> index = pd.MultiIndex.from_tuples(tuples, names=['lvl0', 'lvl1'])
+ >>> data = np.random.randn(len(index), 4)
+ >>> df = pd.DataFrame(data, columns=list('ABCD'), index=index)
+ >>> grouped = df.groupby(level='lvl1')
+ >>> grouped.boxplot(rot=45, fontsize=12, figsize=(8,10))
+
+ The ``subplots=False`` option shows the boxplots in a single figure.
+
+ .. plot::
+ :context: close-figs
+
+ >>> grouped.boxplot(subplots=False, rot=45, fontsize=12)
"""
plot_backend = _get_plot_backend(backend)
return plot_backend.boxplot_frame_groupby(
diff --git a/pandas/plotting/_matplotlib/boxplot.py b/pandas/plotting/_matplotlib/boxplot.py
index 4b79bef41d025..8ceba22b1f7a4 100644
--- a/pandas/plotting/_matplotlib/boxplot.py
+++ b/pandas/plotting/_matplotlib/boxplot.py
@@ -1,4 +1,5 @@
from collections import namedtuple
+from typing import TYPE_CHECKING
import warnings
from matplotlib.artist import setp
@@ -11,8 +12,11 @@
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.core import LinePlot, MPLPlot
-from pandas.plotting._matplotlib.style import _get_standard_colors
-from pandas.plotting._matplotlib.tools import _flatten, _subplots
+from pandas.plotting._matplotlib.style import get_standard_colors
+from pandas.plotting._matplotlib.tools import create_subplots, flatten_axes
+
+if TYPE_CHECKING:
+ from matplotlib.axes import Axes
class BoxPlot(LinePlot):
@@ -80,7 +84,7 @@ def _validate_color_args(self):
self.color = None
# get standard colors for default
- colors = _get_standard_colors(num_colors=3, colormap=self.colormap, color=None)
+ colors = get_standard_colors(num_colors=3, colormap=self.colormap, color=None)
# use 2 colors by default, for box/whisker and median
# flier colors isn't needed here
# because it can be specified by ``sym`` kw
@@ -150,7 +154,7 @@ def _make_plot(self):
labels = [pprint_thing(key) for key in range(len(labels))]
self._set_ticklabels(ax, labels)
- def _set_ticklabels(self, ax, labels):
+ def _set_ticklabels(self, ax: "Axes", labels):
if self.orientation == "vertical":
ax.set_xticklabels(labels)
else:
@@ -196,11 +200,11 @@ def _grouped_plot_by_column(
by = [by]
columns = data._get_numeric_data().columns.difference(by)
naxes = len(columns)
- fig, axes = _subplots(
+ fig, axes = create_subplots(
naxes=naxes, sharex=True, sharey=True, figsize=figsize, ax=ax, layout=layout
)
- _axes = _flatten(axes)
+ _axes = flatten_axes(axes)
ax_values = []
@@ -255,7 +259,7 @@ def _get_colors():
# num_colors=3 is required as method maybe_color_bp takes the colors
# in positions 0 and 2.
# if colors not provided, use same defaults as DataFrame.plot.box
- result = _get_standard_colors(num_colors=3)
+ result = get_standard_colors(num_colors=3)
result = np.take(result, [0, 0, 2])
result = np.append(result, "k")
@@ -292,13 +296,18 @@ def maybe_color_bp(bp, **kwds):
if not kwds.get("capprops"):
setp(bp["caps"], color=colors[3], alpha=1)
- def plot_group(keys, values, ax):
+ def plot_group(keys, values, ax: "Axes"):
keys = [pprint_thing(x) for x in keys]
- values = [np.asarray(remove_na_arraylike(v)) for v in values]
+ values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values]
bp = ax.boxplot(values, **kwds)
if fontsize is not None:
ax.tick_params(axis="both", labelsize=fontsize)
if kwds.get("vert", 1):
+ ticks = ax.get_xticks()
+ if len(ticks) != len(keys):
+ i, remainder = divmod(len(ticks), len(keys))
+ assert remainder == 0, remainder
+ keys *= i
ax.set_xticklabels(keys, rotation=rot)
else:
ax.set_yticklabels(keys, rotation=rot)
@@ -405,7 +414,7 @@ def boxplot_frame_groupby(
):
if subplots is True:
naxes = len(grouped)
- fig, axes = _subplots(
+ fig, axes = create_subplots(
naxes=naxes,
squeeze=False,
ax=ax,
@@ -414,7 +423,7 @@ def boxplot_frame_groupby(
figsize=figsize,
layout=layout,
)
- axes = _flatten(axes)
+ axes = flatten_axes(axes)
ret = pd.Series(dtype=object)
diff --git a/pandas/plotting/_matplotlib/compat.py b/pandas/plotting/_matplotlib/compat.py
index f2c5032112bc9..964596d9b6319 100644
--- a/pandas/plotting/_matplotlib/compat.py
+++ b/pandas/plotting/_matplotlib/compat.py
@@ -17,7 +17,8 @@ def inner():
return inner
-_mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge)
-_mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge)
-_mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge)
-_mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge)
+mpl_ge_2_2_3 = _mpl_version("2.2.3", operator.ge)
+mpl_ge_3_0_0 = _mpl_version("3.0.0", operator.ge)
+mpl_ge_3_1_0 = _mpl_version("3.1.0", operator.ge)
+mpl_ge_3_2_0 = _mpl_version("3.2.0", operator.ge)
+mpl_ge_3_3_0 = _mpl_version("3.3.0", operator.ge)
diff --git a/pandas/plotting/_matplotlib/converter.py b/pandas/plotting/_matplotlib/converter.py
index 05377e0c240b9..3db7c38eced65 100644
--- a/pandas/plotting/_matplotlib/converter.py
+++ b/pandas/plotting/_matplotlib/converter.py
@@ -1,7 +1,8 @@
import contextlib
import datetime as pydt
-from datetime import datetime, timedelta
+from datetime import datetime, timedelta, tzinfo
import functools
+from typing import Any, List, Optional, Tuple
from dateutil.relativedelta import relativedelta
import matplotlib.dates as dates
@@ -16,7 +17,6 @@
from pandas._libs.tslibs.offsets import BaseOffset
from pandas.core.dtypes.common import (
- is_datetime64_ns_dtype,
is_float,
is_float_dtype,
is_integer,
@@ -144,7 +144,7 @@ def convert(value, unit, axis):
return value
@staticmethod
- def axisinfo(unit, axis):
+ def axisinfo(unit, axis) -> Optional[units.AxisInfo]:
if unit != "time":
return None
@@ -153,7 +153,7 @@ def axisinfo(unit, axis):
return units.AxisInfo(majloc=majloc, majfmt=majfmt, label="time")
@staticmethod
- def default_units(x, axis):
+ def default_units(x, axis) -> str:
return "time"
@@ -246,19 +246,6 @@ def get_datevalue(date, freq):
raise ValueError(f"Unrecognizable date '{date}'")
-def _dt_to_float_ordinal(dt):
- """
- Convert :mod:`datetime` to the Gregorian date as UTC float days,
- preserving hours, minutes, seconds and microseconds. Return value
- is a :func:`float`.
- """
- if isinstance(dt, (np.ndarray, Index, Series)) and is_datetime64_ns_dtype(dt):
- base = dates.epoch2num(dt.asi8 / 1.0e9)
- else:
- base = dates.date2num(dt)
- return base
-
-
# Datetime Conversion
class DatetimeConverter(dates.DateConverter):
@staticmethod
@@ -274,15 +261,11 @@ def convert(values, unit, axis):
def _convert_1d(values, unit, axis):
def try_parse(values):
try:
- return _dt_to_float_ordinal(tools.to_datetime(values))
+ return dates.date2num(tools.to_datetime(values))
except Exception:
return values
- if isinstance(values, (datetime, pydt.date)):
- return _dt_to_float_ordinal(values)
- elif isinstance(values, np.datetime64):
- return _dt_to_float_ordinal(Timestamp(values))
- elif isinstance(values, pydt.time):
+ if isinstance(values, (datetime, pydt.date, np.datetime64, pydt.time)):
return dates.date2num(values)
elif is_integer(values) or is_float(values):
return values
@@ -303,17 +286,15 @@ def try_parse(values):
try:
values = tools.to_datetime(values)
- if isinstance(values, Index):
- values = _dt_to_float_ordinal(values)
- else:
- values = [_dt_to_float_ordinal(x) for x in values]
except Exception:
- values = _dt_to_float_ordinal(values)
+ pass
+
+ values = dates.date2num(values)
return values
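The removed ``_dt_to_float_ordinal`` helper is replaced by matplotlib's own converter; a quick sketch of the equivalence the collapsed branches rely on, namely that ``dates.date2num`` handles ``datetime`` and ``np.datetime64`` inputs uniformly:

>>> import datetime
>>> import numpy as np
>>> from matplotlib import dates
>>> d = datetime.datetime(2020, 1, 1)
>>> dates.date2num(d) == dates.date2num(np.datetime64("2020-01-01"))
True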
@staticmethod
- def axisinfo(unit, axis):
+ def axisinfo(unit: Optional[tzinfo], axis) -> units.AxisInfo:
"""
Return the :class:`~matplotlib.units.AxisInfo` for *unit*.
@@ -411,8 +392,8 @@ def __call__(self):
interval = self._get_interval()
freq = f"{interval}L"
tz = self.tz.tzname(None)
- st = _from_ordinal(dates.date2num(dmin)) # strip tz
- ed = _from_ordinal(dates.date2num(dmax))
+ st = dmin.replace(tzinfo=None)
+ ed = dmax.replace(tzinfo=None)
all_dates = date_range(start=st, end=ed, freq=freq, tz=tz).astype(object)
try:
@@ -441,7 +422,7 @@ def autoscale(self):
return self.nonsingular(vmin, vmax)
-def _from_ordinal(x, tz=None):
+def _from_ordinal(x, tz: Optional[tzinfo] = None) -> datetime:
ix = int(x)
dt = datetime.fromordinal(ix)
remainder = float(x) - ix
@@ -470,7 +451,7 @@ def _from_ordinal(x, tz=None):
# -------------------------------------------------------------------------
-def _get_default_annual_spacing(nyears):
+def _get_default_annual_spacing(nyears) -> Tuple[int, int]:
"""
Returns a default spacing between consecutive ticks for annual data.
"""
@@ -492,7 +473,7 @@ def _get_default_annual_spacing(nyears):
return (min_spacing, maj_spacing)
-def period_break(dates, period):
+def period_break(dates: PeriodIndex, period: str) -> np.ndarray:
"""
Returns the indices where the given period changes.
@@ -508,7 +489,7 @@ def period_break(dates, period):
return np.nonzero(current - previous)[0]
-def has_level_label(label_flags, vmin):
+def has_level_label(label_flags: np.ndarray, vmin: float) -> bool:
"""
Returns true if the ``label_flags`` indicate there is at least one label
for this level.
@@ -1003,18 +984,24 @@ class TimeSeries_DateFormatter(Formatter):
----------
freq : {int, string}
Valid frequency specifier.
- minor_locator : {False, True}
+ minor_locator : bool, default False
Whether the current formatter should apply to minor ticks (True) or
major ticks (False).
- dynamic_mode : {True, False}
+ dynamic_mode : bool, default True
Whether the formatter works in dynamic mode or not.
"""
- def __init__(self, freq, minor_locator=False, dynamic_mode=True, plot_obj=None):
+ def __init__(
+ self,
+ freq,
+ minor_locator: bool = False,
+ dynamic_mode: bool = True,
+ plot_obj=None,
+ ):
freq = to_offset(freq)
self.format = None
self.freq = freq
- self.locs = []
+ self.locs: List[Any] = [] # unused, for matplotlib compat
self.formatdict = None
self.isminor = minor_locator
self.isdynamic = dynamic_mode
diff --git a/pandas/plotting/_matplotlib/core.py b/pandas/plotting/_matplotlib/core.py
index e510f7140519a..0c64ea824996f 100644
--- a/pandas/plotting/_matplotlib/core.py
+++ b/pandas/plotting/_matplotlib/core.py
@@ -1,5 +1,4 @@
-import re
-from typing import List, Optional
+from typing import TYPE_CHECKING, List, Optional, Tuple
import warnings
from matplotlib.artist import Artist
@@ -30,19 +29,39 @@
import pandas.core.common as com
from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0
+from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0
from pandas.plotting._matplotlib.converter import register_pandas_matplotlib_converters
-from pandas.plotting._matplotlib.style import _get_standard_colors
+from pandas.plotting._matplotlib.style import get_standard_colors
+from pandas.plotting._matplotlib.timeseries import (
+ decorate_axes,
+ format_dateaxis,
+ maybe_convert_index,
+ maybe_resample,
+ use_dynamic_x,
+)
from pandas.plotting._matplotlib.tools import (
- _flatten,
- _get_all_lines,
- _get_xlim,
- _handle_shared_axes,
- _subplots,
+ create_subplots,
+ flatten_axes,
format_date_labels,
+ get_all_lines,
+ get_xlim,
+ handle_shared_axes,
table,
)
+if TYPE_CHECKING:
+ from matplotlib.axes import Axes
+ from matplotlib.axis import Axis
+
+
+def _color_in_style(style: str) -> bool:
+ """
+ Check if there is a color letter in the style string.
+ """
+ from matplotlib.colors import BASE_COLORS
+
+ return not set(BASE_COLORS).isdisjoint(style)
+
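A quick sketch of what the new helper accepts (imported here from its private home, ``pandas.plotting._matplotlib.core``): any of matplotlib's single-letter ``BASE_COLORS`` appearing anywhere in a style string counts as a color.

>>> from pandas.plotting._matplotlib.core import _color_in_style
>>> _color_in_style("r--")  # 'r' is a BASE_COLORS key
True
>>> _color_in_style("--")
False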
class MPLPlot:
"""
@@ -62,22 +81,6 @@ def _kind(self):
_layout_type = "vertical"
_default_rot = 0
orientation: Optional[str] = None
- _pop_attributes = [
- "label",
- "style",
- "logy",
- "logx",
- "loglog",
- "mark_right",
- "stacked",
- ]
- _attr_defaults = {
- "logy": False,
- "logx": False,
- "loglog": False,
- "mark_right": True,
- "stacked": False,
- }
def __init__(
self,
@@ -164,9 +167,13 @@ def __init__(
self.legend_handles: List[Artist] = []
self.legend_labels: List[Label] = []
- for attr in self._pop_attributes:
- value = kwds.pop(attr, self._attr_defaults.get(attr, None))
- setattr(self, attr, value)
+ self.logx = kwds.pop("logx", False)
+ self.logy = kwds.pop("logy", False)
+ self.loglog = kwds.pop("loglog", False)
+ self.label = kwds.pop("label", None)
+ self.style = kwds.pop("style", None)
+ self.mark_right = kwds.pop("mark_right", True)
+ self.stacked = kwds.pop("stacked", False)
self.ax = ax
self.fig = fig
@@ -201,8 +208,6 @@ def __init__(
self._validate_color_args()
def _validate_color_args(self):
- import matplotlib.colors
-
if (
"color" in self.kwds
and self.nseries == 1
@@ -234,13 +239,12 @@ def _validate_color_args(self):
styles = [self.style]
# need only a single match
for s in styles:
- for char in s:
- if char in matplotlib.colors.BASE_COLORS:
- raise ValueError(
- "Cannot pass 'style' string with a color symbol and "
- "'color' keyword argument. Please use one or the other or "
- "pass 'style' without a color symbol"
- )
+ if _color_in_style(s):
+ raise ValueError(
+ "Cannot pass 'style' string with a color symbol and "
+ "'color' keyword argument. Please use one or the "
+ "other or pass 'style' without a color symbol"
+ )
def _iter_data(self, data=None, keep_index=False, fillna=None):
if data is None:
@@ -280,11 +284,11 @@ def generate(self):
def _args_adjust(self):
pass
- def _has_plotted_object(self, ax):
+ def _has_plotted_object(self, ax: "Axes") -> bool:
"""check whether ax has data"""
return len(ax.lines) != 0 or len(ax.artists) != 0 or len(ax.containers) != 0
- def _maybe_right_yaxis(self, ax, axes_num):
+ def _maybe_right_yaxis(self, ax: "Axes", axes_num):
if not self.on_right(axes_num):
# secondary axes may be passed via ax kw
return self._get_ax_layer(ax)
@@ -314,7 +318,7 @@ def _maybe_right_yaxis(self, ax, axes_num):
def _setup_subplots(self):
if self.subplots:
- fig, axes = _subplots(
+ fig, axes = create_subplots(
naxes=self.nseries,
sharex=self.sharex,
sharey=self.sharey,
@@ -333,12 +337,12 @@ def _setup_subplots(self):
fig.set_size_inches(self.figsize)
axes = self.ax
- axes = _flatten(axes)
+ axes = flatten_axes(axes)
valid_log = {False, True, "sym", None}
input_log = {self.logx, self.logy, self.loglog}
if input_log - valid_log:
- invalid_log = next(iter((input_log - valid_log)))
+ invalid_log = next(iter(input_log - valid_log))
raise ValueError(
f"Boolean, None and 'sym' are valid options, '{invalid_log}' is given."
)
@@ -465,7 +469,7 @@ def _adorn_subplots(self):
if len(self.axes) > 0:
all_axes = self._get_subplots()
nrows, ncols = self._get_axes_layout()
- _handle_shared_axes(
+ handle_shared_axes(
axarr=all_axes,
nplots=len(all_axes),
naxes=nrows * ncols,
@@ -520,7 +524,7 @@ def _adorn_subplots(self):
raise ValueError(msg)
self.axes[0].set_title(self.title)
- def _apply_axis_properties(self, axis, rot=None, fontsize=None):
+ def _apply_axis_properties(self, axis: "Axis", rot=None, fontsize=None):
"""
Tick creation within matplotlib is reasonably expensive and is
internally deferred until accessed as Ticks are created/destroyed
@@ -537,7 +541,7 @@ def _apply_axis_properties(self, axis, rot=None, fontsize=None):
label.set_fontsize(fontsize)
@property
- def legend_title(self):
+ def legend_title(self) -> Optional[str]:
if not isinstance(self.data.columns, ABCMultiIndex):
name = self.data.columns.name
if name is not None:
@@ -588,7 +592,7 @@ def _make_legend(self):
if ax.get_visible():
ax.legend(loc="best")
- def _get_ax_legend_handle(self, ax):
+ def _get_ax_legend_handle(self, ax: "Axes"):
"""
Take in axes and return ax, legend and handle under different scenarios
"""
@@ -613,7 +617,7 @@ def plt(self):
_need_to_set_index = False
- def _get_xticks(self, convert_period=False):
+ def _get_xticks(self, convert_period: bool = False):
index = self.data.index
is_datetype = index.inferred_type in ("datetime", "date", "datetime64", "time")
@@ -643,7 +647,7 @@ def _get_xticks(self, convert_period=False):
@classmethod
@register_pandas_matplotlib_converters
- def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds):
+ def _plot(cls, ax: "Axes", x, y, style=None, is_errorbar: bool = False, **kwds):
mask = isna(y)
if mask.any():
y = np.ma.array(y)
@@ -664,10 +668,10 @@ def _plot(cls, ax, x, y, style=None, is_errorbar=False, **kwds):
if style is not None:
args = (x, y, style)
else:
- args = (x, y)
+ args = (x, y) # type: ignore[assignment]
return ax.plot(*args, **kwds)
- def _get_index_name(self):
+ def _get_index_name(self) -> Optional[str]:
if isinstance(self.data.index, ABCMultiIndex):
name = self.data.index.names
if com.any_not_none(*name):
@@ -740,7 +744,7 @@ def _apply_style_colors(self, colors, kwds, col_num, label):
style = self.style
has_color = "color" in kwds or self.colormap is not None
- nocolor_style = style is None or re.match("[a-z]+", style) is None
+ nocolor_style = style is None or not _color_in_style(style)
if (has_color or self.subplots) and nocolor_style:
if isinstance(colors, dict):
kwds["color"] = colors[label]
@@ -752,7 +756,7 @@ def _get_colors(self, num_colors=None, color_kwds="color"):
if num_colors is None:
num_colors = self.nseries
- return _get_standard_colors(
+ return get_standard_colors(
num_colors=num_colors,
colormap=self.colormap,
color=self.kwds.get(color_kwds),
@@ -770,6 +774,12 @@ def _parse_errorbars(self, label, err):
DataFrame/dict: error values are paired with keys matching the
key in the plotted DataFrame
str: the name of the column within the plotted DataFrame
+
+ Asymmetrical error bars are also supported, however raw error values
+ must be provided in this case. For a ``N`` length :class:`Series`, a
+ ``2xN`` array should be provided indicating lower and upper (or left
+ and right) errors. For a ``MxN`` :class:`DataFrame`, asymmetrical errors
+ should be in a ``Mx2xN`` array.
"""
if err is None:
return None
@@ -810,7 +820,15 @@ def match_labels(data, e):
err_shape = err.shape
# asymmetrical error bars
- if err.ndim == 3:
+ if isinstance(self.data, ABCSeries) and err_shape[0] == 2:
+ err = np.expand_dims(err, 0)
+ err_shape = err.shape
+ if err_shape[2] != len(self.data):
+ raise ValueError(
+ "Asymmetrical error bars should be provided "
+ f"with the shape (2, {len(self.data)})"
+ )
+ elif isinstance(self.data, ABCDataFrame) and err.ndim == 3:
if (
(err_shape[0] != self.nseries)
or (err_shape[1] != 2)
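A sketch of the documented shapes: for a length-3 Series, a ``(2, 3)`` array gives the lower and upper errors, and the expansion to ``(1, 2, N)`` now happens internally. The data here is illustrative.

>>> import numpy as np
>>> import pandas as pd
>>> s = pd.Series([1.0, 2.0, 3.0])
>>> yerr = np.array([[0.1, 0.2, 0.3],   # lower errors
...                  [0.2, 0.4, 0.6]])  # upper errors
>>> ax = s.plot(yerr=yerr)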
@@ -860,7 +878,7 @@ def _get_subplots(self):
ax for ax in self.axes[0].get_figure().get_axes() if isinstance(ax, Subplot)
]
- def _get_axes_layout(self):
+ def _get_axes_layout(self) -> Tuple[int, int]:
axes = self._get_subplots()
x_set = set()
y_set = set()
@@ -899,15 +917,15 @@ def __init__(self, data, x, y, **kwargs):
self.y = y
@property
- def nseries(self):
+ def nseries(self) -> int:
return 1
- def _post_plot_logic(self, ax, data):
+ def _post_plot_logic(self, ax: "Axes", data):
x, y = self.x, self.y
ax.set_ylabel(pprint_thing(y))
ax.set_xlabel(pprint_thing(x))
- def _plot_colorbar(self, ax, **kwds):
+ def _plot_colorbar(self, ax: "Axes", **kwds):
# Addresses issues #10611 and #10678:
# When plotting scatterplots and hexbinplots in IPython
# inline backend the colorbar axis height tends not to
@@ -926,7 +944,7 @@ def _plot_colorbar(self, ax, **kwds):
img = ax.collections[-1]
cbar = self.fig.colorbar(img, ax=ax, **kwds)
- if _mpl_ge_3_0_0():
+ if mpl_ge_3_0_0():
# The workaround below is no longer necessary.
return
@@ -1063,20 +1081,16 @@ def __init__(self, data, **kwargs):
if "x_compat" in self.kwds:
self.x_compat = bool(self.kwds.pop("x_compat"))
- def _is_ts_plot(self):
+ def _is_ts_plot(self) -> bool:
# this is slightly deceptive
return not self.x_compat and self.use_index and self._use_dynamic_x()
def _use_dynamic_x(self):
- from pandas.plotting._matplotlib.timeseries import _use_dynamic_x
-
- return _use_dynamic_x(self._get_ax(0), self.data)
+ return use_dynamic_x(self._get_ax(0), self.data)
def _make_plot(self):
if self._is_ts_plot():
- from pandas.plotting._matplotlib.timeseries import _maybe_convert_index
-
- data = _maybe_convert_index(self._get_ax(0), self.data)
+ data = maybe_convert_index(self._get_ax(0), self.data)
x = data.index # dummy, not used
plotf = self._ts_plot
@@ -1117,12 +1131,14 @@ def _make_plot(self):
# reset of xlim should be used for ts data
# TODO: GH28021, should find a way to change view limit on xaxis
- lines = _get_all_lines(ax)
- left, right = _get_xlim(lines)
+ lines = get_all_lines(ax)
+ left, right = get_xlim(lines)
ax.set_xlim(left, right)
@classmethod
- def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds):
+ def _plot(
+ cls, ax: "Axes", x, y, style=None, column_num=None, stacking_id=None, **kwds
+ ):
# column_num is used to get the target column from plotf in line and
# area plots
if column_num == 0:
@@ -1133,25 +1149,19 @@ def _plot(cls, ax, x, y, style=None, column_num=None, stacking_id=None, **kwds):
return lines
@classmethod
- def _ts_plot(cls, ax, x, data, style=None, **kwds):
- from pandas.plotting._matplotlib.timeseries import (
- _maybe_resample,
- _decorate_axes,
- format_dateaxis,
- )
-
+ def _ts_plot(cls, ax: "Axes", x, data, style=None, **kwds):
# accept x to be consistent with normal plot func,
# x is not passed to tsplot as it uses data.index as x coordinate
# column_num must be in kwds for stacking purpose
- freq, data = _maybe_resample(data, ax, kwds)
+ freq, data = maybe_resample(data, ax, kwds)
# Set ax with freq info
- _decorate_axes(ax, freq, kwds)
+ decorate_axes(ax, freq, kwds)
# digging deeper
if hasattr(ax, "left_ax"):
- _decorate_axes(ax.left_ax, freq, kwds)
+ decorate_axes(ax.left_ax, freq, kwds)
if hasattr(ax, "right_ax"):
- _decorate_axes(ax.right_ax, freq, kwds)
+ decorate_axes(ax.right_ax, freq, kwds)
ax._plot_data.append((data, cls._kind, kwds))
lines = cls._plot(ax, data.index, data.values, style=style, **kwds)
@@ -1166,7 +1176,7 @@ def _get_stacking_id(self):
return None
@classmethod
- def _initialize_stacker(cls, ax, stacking_id, n):
+ def _initialize_stacker(cls, ax: "Axes", stacking_id, n: int):
if stacking_id is None:
return
if not hasattr(ax, "_stacker_pos_prior"):
@@ -1177,7 +1187,7 @@ def _initialize_stacker(cls, ax, stacking_id, n):
ax._stacker_neg_prior[stacking_id] = np.zeros(n)
@classmethod
- def _get_stacked_values(cls, ax, stacking_id, values, label):
+ def _get_stacked_values(cls, ax: "Axes", stacking_id, values, label):
if stacking_id is None:
return values
if not hasattr(ax, "_stacker_pos_prior"):
@@ -1196,7 +1206,7 @@ def _get_stacked_values(cls, ax, stacking_id, values, label):
)
@classmethod
- def _update_stacker(cls, ax, stacking_id, values):
+ def _update_stacker(cls, ax: "Axes", stacking_id, values):
if stacking_id is None:
return
if (values >= 0).all():
@@ -1204,7 +1214,7 @@ def _update_stacker(cls, ax, stacking_id, values):
elif (values <= 0).all():
ax._stacker_neg_prior[stacking_id] += values
- def _post_plot_logic(self, ax, data):
+ def _post_plot_logic(self, ax: "Axes", data):
from matplotlib.ticker import FixedLocator
def get_label(i):
@@ -1218,14 +1228,18 @@ def get_label(i):
if self._need_to_set_index:
xticks = ax.get_xticks()
xticklabels = [get_label(x) for x in xticks]
- ax.set_xticklabels(xticklabels)
ax.xaxis.set_major_locator(FixedLocator(xticks))
+ ax.set_xticklabels(xticklabels)
+ # If the index is an irregular time series, then by default
+ # we rotate the tick labels. The exception is if there are
+ # subplots which don't share their x-axes, in which case
+ # we don't rotate the ticklabels as by default the subplots
+ # would be too close together.
condition = (
not self._use_dynamic_x()
- and data.index.is_all_dates
- and not self.subplots
- or (self.subplots and self.sharex)
+ and (data.index.is_all_dates and self.use_index)
+ and (not self.subplots or (self.subplots and self.sharex))
)
index_name = self._get_index_name()
@@ -1259,7 +1273,7 @@ def __init__(self, data, **kwargs):
@classmethod
def _plot(
cls,
- ax,
+ ax: "Axes",
x,
y,
style=None,
@@ -1301,7 +1315,7 @@ def _plot(
res = [rect]
return res
- def _post_plot_logic(self, ax, data):
+ def _post_plot_logic(self, ax: "Axes", data):
LinePlot._post_plot_logic(self, ax, data)
if self.ylim is None:
@@ -1355,7 +1369,7 @@ def _args_adjust(self):
self.left = np.array(self.left)
@classmethod
- def _plot(cls, ax, x, y, w, start=0, log=False, **kwds):
+ def _plot(cls, ax: "Axes", x, y, w, start=0, log=False, **kwds):
return ax.bar(x, y, w, bottom=start, log=log, **kwds)
@property
@@ -1437,7 +1451,7 @@ def _make_plot(self):
)
self._add_legend_handle(rect, label, index=i)
- def _post_plot_logic(self, ax, data):
+ def _post_plot_logic(self, ax: "Axes", data):
if self.use_index:
str_index = [pprint_thing(key) for key in data.index]
else:
@@ -1449,7 +1463,7 @@ def _post_plot_logic(self, ax, data):
self._decorate_ticks(ax, name, str_index, s_edge, e_edge)
- def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge):
+ def _decorate_ticks(self, ax: "Axes", name, ticklabels, start_edge, end_edge):
ax.set_xlim((start_edge, end_edge))
if self.xticks is not None:
@@ -1472,10 +1486,10 @@ def _start_base(self):
return self.left
@classmethod
- def _plot(cls, ax, x, y, w, start=0, log=False, **kwds):
+ def _plot(cls, ax: "Axes", x, y, w, start=0, log=False, **kwds):
return ax.barh(x, y, w, left=start, log=log, **kwds)
- def _decorate_ticks(self, ax, name, ticklabels, start_edge, end_edge):
+ def _decorate_ticks(self, ax: "Axes", name, ticklabels, start_edge, end_edge):
# horizontal bars
ax.set_ylim((start_edge, end_edge))
ax.set_yticks(self.tick_pos)
diff --git a/pandas/plotting/_matplotlib/hist.py b/pandas/plotting/_matplotlib/hist.py
index ee41479b3c7c9..89035552d4309 100644
--- a/pandas/plotting/_matplotlib/hist.py
+++ b/pandas/plotting/_matplotlib/hist.py
@@ -1,3 +1,5 @@
+from typing import TYPE_CHECKING
+
import numpy as np
from pandas.core.dtypes.common import is_integer, is_list_like
@@ -6,7 +8,14 @@
from pandas.io.formats.printing import pprint_thing
from pandas.plotting._matplotlib.core import LinePlot, MPLPlot
-from pandas.plotting._matplotlib.tools import _flatten, _set_ticks_props, _subplots
+from pandas.plotting._matplotlib.tools import (
+ create_subplots,
+ flatten_axes,
+ set_ticks_props,
+)
+
+if TYPE_CHECKING:
+ from matplotlib.axes import Axes
class HistPlot(LinePlot):
@@ -90,7 +99,7 @@ def _make_plot_keywords(self, kwds, y):
kwds["bins"] = self.bins
return kwds
- def _post_plot_logic(self, ax, data):
+ def _post_plot_logic(self, ax: "Axes", data):
if self.orientation == "horizontal":
ax.set_xlabel("Frequency")
else:
@@ -193,11 +202,11 @@ def _grouped_plot(
grouped = grouped[column]
naxes = len(grouped)
- fig, axes = _subplots(
+ fig, axes = create_subplots(
naxes=naxes, figsize=figsize, sharex=sharex, sharey=sharey, ax=ax, layout=layout
)
- _axes = _flatten(axes)
+ _axes = flatten_axes(axes)
for i, (key, group) in enumerate(grouped):
ax = _axes[i]
@@ -281,7 +290,7 @@ def plot_group(group, ax):
rot=rot,
)
- _set_ticks_props(
+ set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)
@@ -332,7 +341,7 @@ def hist_series(
ax.grid(grid)
axes = np.array([ax])
- _set_ticks_props(
+ set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)
@@ -414,7 +423,7 @@ def hist_frame(
if naxes == 0:
raise ValueError("hist method requires numerical columns, nothing to plot.")
- fig, axes = _subplots(
+ fig, axes = create_subplots(
naxes=naxes,
ax=ax,
squeeze=False,
@@ -423,7 +432,7 @@ def hist_frame(
figsize=figsize,
layout=layout,
)
- _axes = _flatten(axes)
+ _axes = flatten_axes(axes)
can_set_label = "label" not in kwds
@@ -437,7 +446,7 @@ def hist_frame(
if legend:
ax.legend()
- _set_ticks_props(
+ set_ticks_props(
axes, xlabelsize=xlabelsize, xrot=xrot, ylabelsize=ylabelsize, yrot=yrot
)
fig.subplots_adjust(wspace=0.3, hspace=0.3)
diff --git a/pandas/plotting/_matplotlib/misc.py b/pandas/plotting/_matplotlib/misc.py
index bb6530b0f6412..a1c62f9fce23c 100644
--- a/pandas/plotting/_matplotlib/misc.py
+++ b/pandas/plotting/_matplotlib/misc.py
@@ -1,18 +1,27 @@
import random
+from typing import TYPE_CHECKING, Dict, List, Optional, Set
import matplotlib.lines as mlines
import matplotlib.patches as patches
import numpy as np
+from pandas._typing import Label
+
from pandas.core.dtypes.missing import notna
from pandas.io.formats.printing import pprint_thing
-from pandas.plotting._matplotlib.style import _get_standard_colors
-from pandas.plotting._matplotlib.tools import _set_ticks_props, _subplots
+from pandas.plotting._matplotlib.style import get_standard_colors
+from pandas.plotting._matplotlib.tools import create_subplots, set_ticks_props
+
+if TYPE_CHECKING:
+ from matplotlib.axes import Axes
+ from matplotlib.figure import Figure
+
+ from pandas import DataFrame, Series
def scatter_matrix(
- frame,
+ frame: "DataFrame",
alpha=0.5,
figsize=None,
ax=None,
@@ -27,7 +36,7 @@ def scatter_matrix(
df = frame._get_numeric_data()
n = df.columns.size
naxes = n * n
- fig, axes = _subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)
+ fig, axes = create_subplots(naxes=naxes, figsize=figsize, ax=ax, squeeze=False)
# no gaps between subplots
fig.subplots_adjust(wspace=0, hspace=0)
@@ -103,7 +112,7 @@ def scatter_matrix(
locs = locs.astype(int)
axes[0][0].yaxis.set_ticklabels(locs)
- _set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
+ set_ticks_props(axes, xlabelsize=8, xrot=90, ylabelsize=8, yrot=0)
return axes
@@ -114,7 +123,14 @@ def _get_marker_compat(marker):
return marker
-def radviz(frame, class_column, ax=None, color=None, colormap=None, **kwds):
+def radviz(
+ frame: "DataFrame",
+ class_column,
+ ax: Optional["Axes"] = None,
+ color=None,
+ colormap=None,
+ **kwds,
+) -> "Axes":
import matplotlib.pyplot as plt
def normalize(series):
@@ -130,8 +146,8 @@ def normalize(series):
if ax is None:
ax = plt.gca(xlim=[-1, 1], ylim=[-1, 1])
- to_plot = {}
- colors = _get_standard_colors(
+ to_plot: Dict[Label, List[List]] = {}
+ colors = get_standard_colors(
num_colors=len(classes), colormap=colormap, color_type="random", color=color
)
@@ -197,8 +213,14 @@ def normalize(series):
def andrews_curves(
- frame, class_column, ax=None, samples=200, color=None, colormap=None, **kwds
-):
+ frame: "DataFrame",
+ class_column,
+ ax: Optional["Axes"] = None,
+ samples: int = 200,
+ color=None,
+ colormap=None,
+ **kwds,
+) -> "Axes":
import matplotlib.pyplot as plt
def function(amplitudes):
@@ -231,9 +253,9 @@ def f(t):
classes = frame[class_column].drop_duplicates()
df = frame.drop(class_column, axis=1)
t = np.linspace(-np.pi, np.pi, samples)
- used_legends = set()
+ used_legends: Set[str] = set()
- color_values = _get_standard_colors(
+ color_values = get_standard_colors(
num_colors=len(classes), colormap=colormap, color_type="random", color=color
)
colors = dict(zip(classes, color_values))
@@ -256,7 +278,13 @@ def f(t):
return ax
-def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
+def bootstrap_plot(
+ series: "Series",
+ fig: Optional["Figure"] = None,
+ size: int = 50,
+ samples: int = 500,
+ **kwds,
+) -> "Figure":
import matplotlib.pyplot as plt
@@ -306,19 +334,19 @@ def bootstrap_plot(series, fig=None, size=50, samples=500, **kwds):
def parallel_coordinates(
- frame,
+ frame: "DataFrame",
class_column,
cols=None,
- ax=None,
+ ax: Optional["Axes"] = None,
color=None,
use_columns=False,
xticks=None,
colormap=None,
- axvlines=True,
+ axvlines: bool = True,
axvlines_kwds=None,
- sort_labels=False,
+ sort_labels: bool = False,
**kwds,
-):
+) -> "Axes":
import matplotlib.pyplot as plt
if axvlines_kwds is None:
@@ -333,7 +361,7 @@ def parallel_coordinates(
else:
df = frame[cols]
- used_legends = set()
+ used_legends: Set[str] = set()
ncols = len(df.columns)
@@ -354,7 +382,7 @@ def parallel_coordinates(
if ax is None:
ax = plt.gca()
- color_values = _get_standard_colors(
+ color_values = get_standard_colors(
num_colors=len(classes), colormap=colormap, color_type="random", color=color
)
@@ -385,7 +413,9 @@ def parallel_coordinates(
return ax
-def lag_plot(series, lag=1, ax=None, **kwds):
+def lag_plot(
+ series: "Series", lag: int = 1, ax: Optional["Axes"] = None, **kwds
+) -> "Axes":
# workaround because `c='b'` is hardcoded in matplotlib's scatter method
import matplotlib.pyplot as plt
@@ -402,7 +432,9 @@ def lag_plot(series, lag=1, ax=None, **kwds):
return ax
-def autocorrelation_plot(series, ax=None, **kwds):
+def autocorrelation_plot(
+ series: "Series", ax: Optional["Axes"] = None, **kwds
+) -> "Axes":
import matplotlib.pyplot as plt
n = len(series)
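
# Illustrative sketch (not part of the patch): the renames above touch only
# private plotting helpers, so the public entry points behave as before.
# Assumes matplotlib is installed.
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix

df = pd.DataFrame(np.random.randn(100, 3), columns=["a", "b", "c"])
axes = scatter_matrix(df, alpha=0.5)  # n*n grid of Axes, no gaps between subplots
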
diff --git a/pandas/plotting/_matplotlib/style.py b/pandas/plotting/_matplotlib/style.py
index 7990bff4f517c..3e0954ef3d74d 100644
--- a/pandas/plotting/_matplotlib/style.py
+++ b/pandas/plotting/_matplotlib/style.py
@@ -10,8 +10,8 @@
import pandas.core.common as com
-def _get_standard_colors(
- num_colors=None, colormap=None, color_type="default", color=None
+def get_standard_colors(
+ num_colors: int, colormap=None, color_type: str = "default", color=None
):
import matplotlib.pyplot as plt
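
# Illustrative sketch (not part of the patch): with the underscore dropped,
# the style helper is importable under its new name; num_colors is now a
# required positional argument per the hunk above.
from pandas.plotting._matplotlib.style import get_standard_colors

colors = get_standard_colors(num_colors=3, color_type="default")
# -> list of three color specs taken from the active matplotlib prop cycle
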
diff --git a/pandas/plotting/_matplotlib/timeseries.py b/pandas/plotting/_matplotlib/timeseries.py
index 8f3571cf13cbc..f8faac6a6a026 100644
--- a/pandas/plotting/_matplotlib/timeseries.py
+++ b/pandas/plotting/_matplotlib/timeseries.py
@@ -24,14 +24,15 @@
from pandas.tseries.frequencies import get_period_alias, is_subperiod, is_superperiod
if TYPE_CHECKING:
- from pandas import Series, Index # noqa:F401
+ from matplotlib.axes import Axes
+ from pandas import Index, Series # noqa:F401
# ---------------------------------------------------------------------
# Plotting functions and monkey patches
-def _maybe_resample(series: "Series", ax, kwargs):
+def maybe_resample(series: "Series", ax: "Axes", kwargs):
# resample against axes freq if necessary
freq, ax_freq = _get_freq(ax, series)
@@ -45,7 +46,10 @@ def _maybe_resample(series: "Series", ax, kwargs):
if ax_freq is not None and freq != ax_freq:
if is_superperiod(freq, ax_freq): # upsample input
series = series.copy()
- series.index = series.index.asfreq(ax_freq, how="s") # type: ignore
+ # error: "Index" has no attribute "asfreq"
+ series.index = series.index.asfreq( # type: ignore[attr-defined]
+ ax_freq, how="s"
+ )
freq = ax_freq
elif _is_sup(freq, ax_freq): # one is weekly
how = kwargs.pop("how", "last")
@@ -59,19 +63,19 @@ def _maybe_resample(series: "Series", ax, kwargs):
return freq, series
-def _is_sub(f1, f2):
+def _is_sub(f1: str, f2: str) -> bool:
return (f1.startswith("W") and is_subperiod("D", f2)) or (
f2.startswith("W") and is_subperiod(f1, "D")
)
-def _is_sup(f1, f2):
+def _is_sup(f1: str, f2: str) -> bool:
return (f1.startswith("W") and is_superperiod("D", f2)) or (
f2.startswith("W") and is_superperiod(f1, "D")
)
-def _upsample_others(ax, freq, kwargs):
+def _upsample_others(ax: "Axes", freq, kwargs):
legend = ax.get_legend()
lines, labels = _replot_ax(ax, freq, kwargs)
_replot_ax(ax, freq, kwargs)
@@ -94,14 +98,14 @@ def _upsample_others(ax, freq, kwargs):
ax.legend(lines, labels, loc="best", title=title)
-def _replot_ax(ax, freq, kwargs):
+def _replot_ax(ax: "Axes", freq, kwargs):
data = getattr(ax, "_plot_data", None)
# clear current axes and data
ax._plot_data = []
ax.clear()
- _decorate_axes(ax, freq, kwargs)
+ decorate_axes(ax, freq, kwargs)
lines = []
labels = []
@@ -124,7 +128,7 @@ def _replot_ax(ax, freq, kwargs):
return lines, labels
-def _decorate_axes(ax, freq, kwargs):
+def decorate_axes(ax: "Axes", freq, kwargs):
"""Initialize axes for time-series plotting"""
if not hasattr(ax, "_plot_data"):
ax._plot_data = []
@@ -140,7 +144,7 @@ def _decorate_axes(ax, freq, kwargs):
ax.date_axis_info = None
-def _get_ax_freq(ax):
+def _get_ax_freq(ax: "Axes"):
"""
Get the freq attribute of the ax object if set.
Also checks shared axes (eg when using secondary yaxis, sharex=True
@@ -171,7 +175,7 @@ def _get_period_alias(freq) -> Optional[str]:
return freq
-def _get_freq(ax, series: "Series"):
+def _get_freq(ax: "Axes", series: "Series"):
# get frequency from data
freq = getattr(series.index, "freq", None)
if freq is None:
@@ -189,7 +193,7 @@ def _get_freq(ax, series: "Series"):
return freq, ax_freq
-def _use_dynamic_x(ax, data: "FrameOrSeriesUnion") -> bool:
+def use_dynamic_x(ax: "Axes", data: FrameOrSeriesUnion) -> bool:
freq = _get_index_freq(data.index)
ax_freq = _get_ax_freq(ax)
@@ -222,7 +226,8 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]:
if freq is None:
freq = getattr(index, "inferred_freq", None)
if freq == "B":
- weekdays = np.unique(index.dayofweek) # type: ignore
+ # error: "Index" has no attribute "dayofweek"
+ weekdays = np.unique(index.dayofweek) # type: ignore[attr-defined]
if (5 in weekdays) or (6 in weekdays):
freq = None
@@ -230,7 +235,7 @@ def _get_index_freq(index: "Index") -> Optional[BaseOffset]:
return freq
-def _maybe_convert_index(ax, data):
+def maybe_convert_index(ax: "Axes", data):
# tsplot converts automatically, but don't want to convert index
# over and over for DataFrames
if isinstance(data.index, (ABCDatetimeIndex, ABCPeriodIndex)):
@@ -260,7 +265,7 @@ def _maybe_convert_index(ax, data):
# Do we need the rest for convenience?
-def _format_coord(freq, t, y):
+def _format_coord(freq, t, y) -> str:
time_period = Period(ordinal=int(t), freq=freq)
return f"t = {time_period} y = {y:8f}"
diff --git a/pandas/plotting/_matplotlib/tools.py b/pandas/plotting/_matplotlib/tools.py
index caf2f27de9276..c5b44f37150bb 100644
--- a/pandas/plotting/_matplotlib/tools.py
+++ b/pandas/plotting/_matplotlib/tools.py
@@ -1,18 +1,27 @@
# being a bit too dynamic
from math import ceil
+from typing import TYPE_CHECKING, Iterable, List, Sequence, Tuple, Union
import warnings
import matplotlib.table
import matplotlib.ticker as ticker
import numpy as np
+from pandas._typing import FrameOrSeries
+
from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
from pandas.plotting._matplotlib import compat
+if TYPE_CHECKING:
+ from matplotlib.axes import Axes
+ from matplotlib.axis import Axis
+ from matplotlib.lines import Line2D # noqa:F401
+ from matplotlib.table import Table
+
-def format_date_labels(ax, rot):
+def format_date_labels(ax: "Axes", rot):
# mini version of autofmt_xdate
for label in ax.get_xticklabels():
label.set_ha("right")
@@ -21,7 +30,7 @@ def format_date_labels(ax, rot):
fig.subplots_adjust(bottom=0.2)
-def table(ax, data, rowLabels=None, colLabels=None, **kwargs):
+def table(ax, data: FrameOrSeries, rowLabels=None, colLabels=None, **kwargs) -> "Table":
if isinstance(data, ABCSeries):
data = data.to_frame()
elif isinstance(data, ABCDataFrame):
@@ -43,7 +52,7 @@ def table(ax, data, rowLabels=None, colLabels=None, **kwargs):
return table
-def _get_layout(nplots, layout=None, layout_type="box"):
+def _get_layout(nplots: int, layout=None, layout_type: str = "box") -> Tuple[int, int]:
if layout is not None:
if not isinstance(layout, (tuple, list)) or len(layout) != 2:
raise ValueError("Layout must be a tuple of (rows, columns)")
@@ -91,15 +100,15 @@ def _get_layout(nplots, layout=None, layout_type="box"):
# copied from matplotlib/pyplot.py and modified for pandas.plotting
-def _subplots(
- naxes=None,
- sharex=False,
- sharey=False,
- squeeze=True,
+def create_subplots(
+ naxes: int,
+ sharex: bool = False,
+ sharey: bool = False,
+ squeeze: bool = True,
subplot_kw=None,
ax=None,
layout=None,
- layout_type="box",
+ layout_type: str = "box",
**fig_kw,
):
"""
@@ -185,7 +194,7 @@ def _subplots(
fig = plt.figure(**fig_kw)
else:
if is_list_like(ax):
- ax = _flatten(ax)
+ ax = flatten_axes(ax)
if layout is not None:
warnings.warn(
"When passing multiple axes, layout keyword is ignored", UserWarning
@@ -212,7 +221,7 @@ def _subplots(
if squeeze:
return fig, ax
else:
- return fig, _flatten(ax)
+ return fig, flatten_axes(ax)
else:
warnings.warn(
"To output multiple subplots, the figure containing "
@@ -255,7 +264,7 @@ def _subplots(
for ax in axarr[naxes:]:
ax.set_visible(False)
- _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)
+ handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey)
if squeeze:
# Reshape the array to have the final desired dimension (nrow,ncol),
@@ -272,7 +281,7 @@ def _subplots(
return fig, axes
-def _remove_labels_from_axis(axis):
+def _remove_labels_from_axis(axis: "Axis"):
for t in axis.get_majorticklabels():
t.set_visible(False)
@@ -288,9 +297,17 @@ def _remove_labels_from_axis(axis):
axis.get_label().set_visible(False)
-def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey):
+def handle_shared_axes(
+ axarr: Iterable["Axes"],
+ nplots: int,
+ naxes: int,
+ nrows: int,
+ ncols: int,
+ sharex: bool,
+ sharey: bool,
+):
if nplots > 1:
- if compat._mpl_ge_3_2_0():
+ if compat.mpl_ge_3_2_0():
row_num = lambda x: x.get_subplotspec().rowspan.start
col_num = lambda x: x.get_subplotspec().colspan.start
else:
@@ -334,7 +351,7 @@ def _handle_shared_axes(axarr, nplots, naxes, nrows, ncols, sharex, sharey):
_remove_labels_from_axis(ax.yaxis)
-def _flatten(axes):
+def flatten_axes(axes: Union["Axes", Sequence["Axes"]]) -> np.ndarray:
if not is_list_like(axes):
return np.array([axes])
elif isinstance(axes, (np.ndarray, ABCIndexClass)):
@@ -342,10 +359,16 @@ def _flatten(axes):
return np.array(axes)
-def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=None):
+def set_ticks_props(
+ axes: Union["Axes", Sequence["Axes"]],
+ xlabelsize=None,
+ xrot=None,
+ ylabelsize=None,
+ yrot=None,
+):
import matplotlib.pyplot as plt
- for ax in _flatten(axes):
+ for ax in flatten_axes(axes):
if xlabelsize is not None:
plt.setp(ax.get_xticklabels(), fontsize=xlabelsize)
if xrot is not None:
@@ -357,7 +380,7 @@ def _set_ticks_props(axes, xlabelsize=None, xrot=None, ylabelsize=None, yrot=Non
return axes
-def _get_all_lines(ax):
+def get_all_lines(ax: "Axes") -> List["Line2D"]:
lines = ax.get_lines()
if hasattr(ax, "right_ax"):
@@ -369,7 +392,7 @@ def _get_all_lines(ax):
return lines
-def _get_xlim(lines):
+def get_xlim(lines: Iterable["Line2D"]) -> Tuple[float, float]:
left, right = np.inf, -np.inf
for l in lines:
x = l.get_xdata(orig=False)
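
# Illustrative sketch (not part of the patch): the renamed tools compose as
# before; create_subplots hides surplus axes beyond naxes, and flatten_axes
# always hands back a flat ndarray of Axes regardless of the input shape.
import numpy as np
from pandas.plotting._matplotlib.tools import create_subplots, flatten_axes

fig, axes = create_subplots(naxes=3, layout=(2, 2), squeeze=False)
for ax in flatten_axes(axes):
    ax.plot(np.arange(5))
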
diff --git a/pandas/tests/api/test_api.py b/pandas/tests/api/test_api.py
index ecd20796b6f21..54da13c3c620b 100644
--- a/pandas/tests/api/test_api.py
+++ b/pandas/tests/api/test_api.py
@@ -5,7 +5,7 @@
import pytest
import pandas as pd
-from pandas import api, compat
+from pandas import api
import pandas._testing as tm
@@ -61,6 +61,7 @@ class TestPDApi(Base):
"ExcelFile",
"ExcelWriter",
"Float64Index",
+ "Flags",
"Grouper",
"HDFStore",
"Index",
@@ -100,11 +101,6 @@ class TestPDApi(Base):
# these should be deprecated in the future
deprecated_classes_in_future: List[str] = ["SparseArray"]
- if not compat.PY37:
- classes.extend(["Panel", "SparseSeries", "SparseDataFrame"])
- # deprecated_modules.extend(["np", "datetime"])
- # deprecated_classes_in_future.extend(["SparseArray"])
-
# external modules exposed in pandas namespace
modules: List[str] = []
@@ -193,7 +189,6 @@ class TestPDApi(Base):
"_hashtable",
"_lib",
"_libs",
- "_np_version_under1p16",
"_np_version_under1p17",
"_np_version_under1p18",
"_is_numpy_dev",
@@ -217,14 +212,6 @@ def test_api(self):
+ self.funcs_to
+ self.private_modules
)
- if not compat.PY37:
- checkthese.extend(
- self.deprecated_modules
- + self.deprecated_classes
- + self.deprecated_classes_in_future
- + self.deprecated_funcs_in_future
- + self.deprecated_funcs
- )
self.check(pd, checkthese, self.ignored)
def test_depr(self):
@@ -237,14 +224,7 @@ def test_depr(self):
)
for depr in deprecated_list:
with tm.assert_produces_warning(FutureWarning):
- deprecated = getattr(pd, depr)
- if not compat.PY37:
- if depr == "datetime":
- deprecated.__getattr__(dir(pd.datetime.datetime)[-1])
- elif depr == "SparseArray":
- deprecated([])
- else:
- deprecated.__getattr__(dir(deprecated)[-1])
+ _ = getattr(pd, depr)
def test_datetime():
@@ -267,9 +247,10 @@ def test_sparsearray():
def test_np():
- import numpy as np
import warnings
+ import numpy as np
+
with warnings.catch_warnings():
warnings.simplefilter("ignore", FutureWarning)
assert (pd.np.arange(0, 10) == np.arange(0, 10)).all()
diff --git a/pandas/tests/arithmetic/test_datetime64.py b/pandas/tests/arithmetic/test_datetime64.py
index 5dfaea7c77420..0dd389ed516c7 100644
--- a/pandas/tests/arithmetic/test_datetime64.py
+++ b/pandas/tests/arithmetic/test_datetime64.py
@@ -749,6 +749,7 @@ class TestDatetime64Arithmetic:
# -------------------------------------------------------------
# Addition/Subtraction of timedelta-like
+ @pytest.mark.arm_slow
def test_dt64arr_add_timedeltalike_scalar(
self, tz_naive_fixture, two_hours, box_with_array
):
diff --git a/pandas/tests/arithmetic/test_interval.py b/pandas/tests/arithmetic/test_interval.py
index 50b5fe8e6f6b9..72ef7ea6bf8ca 100644
--- a/pandas/tests/arithmetic/test_interval.py
+++ b/pandas/tests/arithmetic/test_interval.py
@@ -156,9 +156,7 @@ def test_compare_scalar_other(self, op, array, other):
expected = self.elementwise_comparison(op, array, other)
tm.assert_numpy_array_equal(result, expected)
- def test_compare_list_like_interval(
- self, op, array, interval_constructor,
- ):
+ def test_compare_list_like_interval(self, op, array, interval_constructor):
# same endpoints
other = interval_constructor(array.left, array.right)
result = op(array, other)
diff --git a/pandas/tests/arithmetic/test_numeric.py b/pandas/tests/arithmetic/test_numeric.py
index 2155846b271fc..139401bdf5806 100644
--- a/pandas/tests/arithmetic/test_numeric.py
+++ b/pandas/tests/arithmetic/test_numeric.py
@@ -89,6 +89,26 @@ def test_compare_invalid(self):
b.name = pd.Timestamp("2000-01-01")
tm.assert_series_equal(a / b, 1 / (b / a))
+ def test_numeric_cmp_string_numexpr_path(self, box):
+ # GH#36377, GH#35700
+ xbox = box if box is not pd.Index else np.ndarray
+
+ obj = pd.Series(np.random.randn(10 ** 5))
+ obj = tm.box_expected(obj, box, transpose=False)
+
+ result = obj == "a"
+
+ expected = pd.Series(np.zeros(10 ** 5, dtype=bool))
+ expected = tm.box_expected(expected, xbox, transpose=False)
+ tm.assert_equal(result, expected)
+
+ result = obj != "a"
+ tm.assert_equal(result, ~expected)
+
+ msg = "Invalid comparison between dtype=float64 and str"
+ with pytest.raises(TypeError, match=msg):
+ obj < "a"
+
# ------------------------------------------------------------------
# Numeric dtypes Arithmetic with Datetime/Timedelta Scalar
@@ -99,7 +119,7 @@ class TestNumericArraylikeArithmeticWithDatetimeLike:
# TODO: also check name retention
@pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
@pytest.mark.parametrize(
- "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype),
+ "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
)
def test_mul_td64arr(self, left, box_cls):
# GH#22390
@@ -119,7 +139,7 @@ def test_mul_td64arr(self, left, box_cls):
# TODO: also check name retention
@pytest.mark.parametrize("box_cls", [np.array, pd.Index, pd.Series])
@pytest.mark.parametrize(
- "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype),
+ "left", lefts, ids=lambda x: type(x).__name__ + str(x.dtype)
)
def test_div_td64arr(self, left, box_cls):
# GH#22390
@@ -548,20 +568,6 @@ class TestMultiplicationDivision:
# __mul__, __rmul__, __div__, __rdiv__, __floordiv__, __rfloordiv__
# for non-timestamp/timedelta/period dtypes
- @pytest.mark.parametrize(
- "box",
- [
- pytest.param(
- pd.Index,
- marks=pytest.mark.xfail(
- reason="Index.__div__ always raises", raises=TypeError
- ),
- ),
- pd.Series,
- pd.DataFrame,
- ],
- ids=lambda x: x.__name__,
- )
def test_divide_decimal(self, box):
# resolves issue GH#9787
ser = Series([Decimal(10)])
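
# Illustrative sketch (not part of the patch) of the behavior pinned by
# test_numeric_cmp_string_numexpr_path above: arrays large enough to take
# the numexpr path must match the python path, so ==/!= against a string
# are elementwise False/True while ordered comparisons raise.
import numpy as np
import pandas as pd

ser = pd.Series(np.random.randn(10 ** 5))
assert not (ser == "a").any()
assert (ser != "a").all()
try:
    ser < "a"
except TypeError:
    pass  # "Invalid comparison between dtype=float64 and str"
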
diff --git a/pandas/tests/arithmetic/test_timedelta64.py b/pandas/tests/arithmetic/test_timedelta64.py
index f94408d657ae5..64d3d5b6d684d 100644
--- a/pandas/tests/arithmetic/test_timedelta64.py
+++ b/pandas/tests/arithmetic/test_timedelta64.py
@@ -1733,6 +1733,23 @@ def test_tdarr_div_length_mismatch(self, box_with_array):
# ------------------------------------------------------------------
# __floordiv__, __rfloordiv__
+ def test_td64arr_floordiv_td64arr_with_nat(self, box_with_array):
+ # GH#35529
+ box = box_with_array
+
+ left = pd.Series([1000, 222330, 30], dtype="timedelta64[ns]")
+ right = pd.Series([1000, 222330, None], dtype="timedelta64[ns]")
+
+ left = tm.box_expected(left, box)
+ right = tm.box_expected(right, box)
+
+ expected = np.array([1.0, 1.0, np.nan], dtype=np.float64)
+ expected = tm.box_expected(expected, box)
+
+ result = left // right
+
+ tm.assert_equal(result, expected)
+
def test_td64arr_floordiv_tdscalar(self, box_with_array, scalar_td):
# GH#18831
td1 = Series([timedelta(minutes=5, seconds=3)] * 3)
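
# Illustrative sketch (not part of the patch) of GH#35529 as tested above:
# floor division of timedelta64 arrays produces floats, with NaT -> NaN.
import numpy as np
import pandas as pd

left = pd.Series([1000, 222330, 30], dtype="timedelta64[ns]")
right = pd.Series([1000, 222330, None], dtype="timedelta64[ns]")
result = left // right
assert result.iloc[0] == 1.0 and np.isnan(result.iloc[2])
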
diff --git a/pandas/tests/arrays/boolean/test_construction.py b/pandas/tests/arrays/boolean/test_construction.py
index f7354a089df3b..2f5c61304d415 100644
--- a/pandas/tests/arrays/boolean/test_construction.py
+++ b/pandas/tests/arrays/boolean/test_construction.py
@@ -247,10 +247,11 @@ def test_coerce_to_numpy_array():
def test_to_boolean_array_from_strings():
result = BooleanArray._from_sequence_of_strings(
- np.array(["True", "False", np.nan], dtype=object)
+ np.array(["True", "False", "1", "1.0", "0", "0.0", np.nan], dtype=object)
)
expected = BooleanArray(
- np.array([True, False, False]), np.array([False, False, True])
+ np.array([True, False, True, True, False, False, False]),
+ np.array([False, False, False, False, False, False, True]),
)
tm.assert_extension_array_equal(result, expected)
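
# Illustrative sketch (not part of the patch): _from_sequence_of_strings is
# the hook read_csv uses for extension dtypes, so the widened parsing above
# lets "1"/"0"/"1.0"/"0.0" load directly as nullable booleans.
from io import StringIO
import pandas as pd

df = pd.read_csv(StringIO("a\nTrue\n0\n1.0\n"), dtype={"a": "boolean"})
assert df["a"].tolist() == [True, False, True]
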
diff --git a/pandas/tests/arrays/boolean/test_logical.py b/pandas/tests/arrays/boolean/test_logical.py
index e79262e1b7934..8ed1c27087b02 100644
--- a/pandas/tests/arrays/boolean/test_logical.py
+++ b/pandas/tests/arrays/boolean/test_logical.py
@@ -205,9 +205,7 @@ def test_kleene_xor_scalar(self, other, expected):
a, pd.array([True, False, None], dtype="boolean")
)
- @pytest.mark.parametrize(
- "other", [True, False, pd.NA, [True, False, None] * 3],
- )
+ @pytest.mark.parametrize("other", [True, False, pd.NA, [True, False, None] * 3])
def test_no_masked_assumptions(self, other, all_logical_operators):
# The logical operations should not assume that masked values are False!
a = pd.arrays.BooleanArray(
diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py
index ca942c9288898..e200f13652a84 100644
--- a/pandas/tests/arrays/categorical/test_constructors.py
+++ b/pandas/tests/arrays/categorical/test_constructors.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p16
-
from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype
from pandas.core.dtypes.dtypes import CategoricalDtype
@@ -279,7 +277,7 @@ def test_constructor_with_generator(self):
# returned a scalar for a generator
exp = Categorical([0, 1, 2])
- cat = Categorical((x for x in [0, 1, 2]))
+ cat = Categorical(x for x in [0, 1, 2])
tm.assert_categorical_equal(cat, exp)
cat = Categorical(range(3))
tm.assert_categorical_equal(cat, exp)
@@ -637,7 +635,6 @@ def test_constructor_imaginary(self):
tm.assert_index_equal(c1.categories, Index(values))
tm.assert_numpy_array_equal(np.array(c1), np.array(values))
- @pytest.mark.skipif(_np_version_under1p16, reason="Skipping for NumPy <1.16")
def test_constructor_string_and_tuples(self):
# GH 21416
c = pd.Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
diff --git a/pandas/tests/arrays/categorical/test_indexing.py b/pandas/tests/arrays/categorical/test_indexing.py
index abfae189bb4d7..ab8606ef9258d 100644
--- a/pandas/tests/arrays/categorical/test_indexing.py
+++ b/pandas/tests/arrays/categorical/test_indexing.py
@@ -183,7 +183,7 @@ def test_get_indexer_non_unique(self, idx_values, key_values, key_class):
# GH 21448
key = key_class(key_values, categories=range(1, 5))
# Test for flat index and CategoricalIndex with same/different cats:
- for dtype in None, "category", key.dtype:
+ for dtype in [None, "category", key.dtype]:
idx = Index(idx_values, dtype=dtype)
expected, exp_miss = idx.get_indexer_non_unique(key_values)
result, res_miss = idx.get_indexer_non_unique(key)
diff --git a/pandas/tests/arrays/categorical/test_operators.py b/pandas/tests/arrays/categorical/test_operators.py
index 6ea003c122eea..9d118f1ed8753 100644
--- a/pandas/tests/arrays/categorical/test_operators.py
+++ b/pandas/tests/arrays/categorical/test_operators.py
@@ -79,10 +79,6 @@ def test_comparisons(self):
cat_rev_base2 = Categorical(["b", "b", "b"], categories=["c", "b", "a", "d"])
- msg = (
- "Categoricals can only be compared if 'categories' are the same. "
- "Categories are different lengths"
- )
with pytest.raises(TypeError, match=msg):
cat_rev > cat_rev_base2
@@ -90,7 +86,6 @@ def test_comparisons(self):
cat_unordered = cat.set_ordered(False)
assert not (cat > cat).any()
- msg = "Categoricals can only be compared if 'ordered' is the same"
with pytest.raises(TypeError, match=msg):
cat > cat_unordered
@@ -171,17 +166,14 @@ def test_comparison_with_unknown_scalars(self):
# for unequal comps, but not for equal/not equal
cat = Categorical([1, 2, 3], ordered=True)
- msg = (
- "Cannot compare a Categorical for op __{}__ with a scalar, "
- "which is not a category"
- )
- with pytest.raises(TypeError, match=msg.format("lt")):
+ msg = "Invalid comparison between dtype=category and int"
+ with pytest.raises(TypeError, match=msg):
cat < 4
- with pytest.raises(TypeError, match=msg.format("gt")):
+ with pytest.raises(TypeError, match=msg):
cat > 4
- with pytest.raises(TypeError, match=msg.format("gt")):
+ with pytest.raises(TypeError, match=msg):
4 < cat
- with pytest.raises(TypeError, match=msg.format("lt")):
+ with pytest.raises(TypeError, match=msg):
4 > cat
tm.assert_numpy_array_equal(cat == 4, np.array([False, False, False]))
@@ -324,7 +316,7 @@ def test_compare_different_lengths(self):
c1 = Categorical([], categories=["a", "b"])
c2 = Categorical([], categories=["a"])
- msg = "Categories are different lengths"
+ msg = "Categoricals can only be compared if 'categories' are the same."
with pytest.raises(TypeError, match=msg):
c1 == c2
diff --git a/pandas/tests/arrays/categorical/test_replace.py b/pandas/tests/arrays/categorical/test_replace.py
index b9ac3ce9a37ae..8b784fde1d3c5 100644
--- a/pandas/tests/arrays/categorical/test_replace.py
+++ b/pandas/tests/arrays/categorical/test_replace.py
@@ -43,9 +43,5 @@ def test_replace(to_replace, value, expected, flip_categories):
# the replace call loses categorical dtype
expected = pd.Series(np.asarray(expected))
- tm.assert_series_equal(
- expected, result, check_category_order=False,
- )
- tm.assert_series_equal(
- expected, s, check_category_order=False,
- )
+ tm.assert_series_equal(expected, result, check_category_order=False)
+ tm.assert_series_equal(expected, s, check_category_order=False)
diff --git a/pandas/tests/arrays/categorical/test_sorting.py b/pandas/tests/arrays/categorical/test_sorting.py
index 2a0ef043bf9a9..9589216557cd5 100644
--- a/pandas/tests/arrays/categorical/test_sorting.py
+++ b/pandas/tests/arrays/categorical/test_sorting.py
@@ -66,7 +66,9 @@ def test_sort_values(self):
# sort (inplace order)
cat1 = cat.copy()
+ orig_codes = cat1._codes
cat1.sort_values(inplace=True)
+ assert cat1._codes is orig_codes
exp = np.array(["a", "b", "c", "d"], dtype=object)
tm.assert_numpy_array_equal(cat1.__array__(), exp)
tm.assert_index_equal(res.categories, cat.categories)
diff --git a/pandas/tests/arrays/integer/test_arithmetic.py b/pandas/tests/arrays/integer/test_arithmetic.py
index d309f6423e0c1..cf382dd5e37e0 100644
--- a/pandas/tests/arrays/integer/test_arithmetic.py
+++ b/pandas/tests/arrays/integer/test_arithmetic.py
@@ -261,3 +261,39 @@ def test_reduce_to_float(op):
index=pd.Index(["a", "b"], name="A"),
)
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "source, target",
+ [
+ ([1, 2, 3], [-1, -2, -3]),
+ ([1, 2, None], [-1, -2, None]),
+ ([-1, 0, 1], [1, 0, -1]),
+ ],
+)
+def test_unary_minus_nullable_int(any_signed_nullable_int_dtype, source, target):
+ dtype = any_signed_nullable_int_dtype
+ arr = pd.array(source, dtype=dtype)
+ result = -arr
+ expected = pd.array(target, dtype=dtype)
+ tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize("source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]])
+def test_unary_plus_nullable_int(any_signed_nullable_int_dtype, source):
+ dtype = any_signed_nullable_int_dtype
+ expected = pd.array(source, dtype=dtype)
+ result = +expected
+ tm.assert_extension_array_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "source, target",
+ [([1, 2, 3], [1, 2, 3]), ([1, -2, None], [1, 2, None]), ([-1, 0, 1], [1, 0, 1])],
+)
+def test_abs_nullable_int(any_signed_nullable_int_dtype, source, target):
+ dtype = any_signed_nullable_int_dtype
+ s = pd.array(source, dtype=dtype)
+ result = abs(s)
+ expected = pd.array(target, dtype=dtype)
+ tm.assert_extension_array_equal(result, expected)
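
# Illustrative sketch (not part of the patch) of the unary ops covered by
# the three tests above: -, unary + and abs() on nullable integer arrays
# keep the dtype and propagate pd.NA.
import pandas as pd

arr = pd.array([1, -2, None], dtype="Int64")
neg = -arr
assert neg.dtype == "Int64"
assert neg[0] == -1 and neg[2] is pd.NA
assert abs(arr)[1] == 2
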
diff --git a/pandas/tests/arrays/integer/test_concat.py b/pandas/tests/arrays/integer/test_concat.py
index 3ace35700bd3e..fc24709deb82c 100644
--- a/pandas/tests/arrays/integer/test_concat.py
+++ b/pandas/tests/arrays/integer/test_concat.py
@@ -1,3 +1,4 @@
+import numpy as np
import pytest
import pandas as pd
@@ -15,12 +16,52 @@
(["Int32", "UInt32"], "Int64"),
# this still gives object (awaiting float extension dtype)
(["Int64", "UInt64"], "object"),
+ (["Int64", "boolean"], "Int64"),
+ (["UInt8", "boolean"], "UInt8"),
],
)
def test_concat_series(to_concat_dtypes, result_dtype):
- result = pd.concat([pd.Series([1, 2, pd.NA], dtype=t) for t in to_concat_dtypes])
- expected = pd.concat([pd.Series([1, 2, pd.NA], dtype=object)] * 2).astype(
+ result = pd.concat([pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes])
+ expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
result_dtype
)
tm.assert_series_equal(result, expected)
+
+ # order doesn't matter for result
+ result = pd.concat(
+ [pd.Series([0, 1, pd.NA], dtype=t) for t in to_concat_dtypes[::-1]]
+ )
+ expected = pd.concat([pd.Series([0, 1, pd.NA], dtype=object)] * 2).astype(
+ result_dtype
+ )
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "to_concat_dtypes, result_dtype",
+ [
+ (["Int64", "int64"], "Int64"),
+ (["UInt64", "uint64"], "UInt64"),
+ (["Int8", "int8"], "Int8"),
+ (["Int8", "int16"], "Int16"),
+ (["UInt8", "int8"], "Int16"),
+ (["Int32", "uint32"], "Int64"),
+ # this still gives object (awaiting float extension dtype)
+ (["Int64", "uint64"], "object"),
+ (["Int64", "bool"], "Int64"),
+ (["UInt8", "bool"], "UInt8"),
+ ],
+)
+def test_concat_series_with_numpy(to_concat_dtypes, result_dtype):
+
+ s1 = pd.Series([0, 1, pd.NA], dtype=to_concat_dtypes[0])
+ s2 = pd.Series(np.array([0, 1], dtype=to_concat_dtypes[1]))
+ result = pd.concat([s1, s2], ignore_index=True)
+ expected = pd.Series([0, 1, pd.NA, 0, 1], dtype=object).astype(result_dtype)
+ tm.assert_series_equal(result, expected)
+
+ # order doesn't matter for result
+ result = pd.concat([s2, s1], ignore_index=True)
+ expected = pd.Series([0, 1, 0, 1, pd.NA], dtype=object).astype(result_dtype)
+ tm.assert_series_equal(result, expected)
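
# Illustrative sketch (not part of the patch) of the promotion table above:
# concatenating a masked integer Series with a plain numpy integer Series
# keeps the masked dtype, widening where necessary.
import numpy as np
import pandas as pd

s1 = pd.Series([0, 1, pd.NA], dtype="Int8")
s2 = pd.Series(np.array([0, 1], dtype="int16"))
assert pd.concat([s1, s2], ignore_index=True).dtype == "Int16"
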
diff --git a/pandas/tests/arrays/integer/test_construction.py b/pandas/tests/arrays/integer/test_construction.py
index 1893c4554bfbf..e0a4877da6c7e 100644
--- a/pandas/tests/arrays/integer/test_construction.py
+++ b/pandas/tests/arrays/integer/test_construction.py
@@ -29,7 +29,7 @@ def test_from_dtype_from_float(data):
# from int / array
expected = pd.Series(data).dropna().reset_index(drop=True)
- dropped = np.array(data.dropna()).astype(np.dtype((dtype.type)))
+ dropped = np.array(data.dropna()).astype(np.dtype(dtype.type))
result = pd.Series(dropped, dtype=str(dtype))
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arrays/integer/test_dtypes.py b/pandas/tests/arrays/integer/test_dtypes.py
index cafe9e47a18f4..67efa4cb2ce4a 100644
--- a/pandas/tests/arrays/integer/test_dtypes.py
+++ b/pandas/tests/arrays/integer/test_dtypes.py
@@ -144,6 +144,44 @@ def test_astype(all_data):
tm.assert_series_equal(result, expected)
+def test_astype_copy():
+ arr = pd.array([1, 2, 3, None], dtype="Int64")
+ orig = pd.array([1, 2, 3, None], dtype="Int64")
+
+ # copy=True -> ensure both data and mask are actual copies
+ result = arr.astype("Int64", copy=True)
+ assert result is not arr
+ assert not np.shares_memory(result._data, arr._data)
+ assert not np.shares_memory(result._mask, arr._mask)
+ result[0] = 10
+ tm.assert_extension_array_equal(arr, orig)
+ result[0] = pd.NA
+ tm.assert_extension_array_equal(arr, orig)
+
+ # copy=False
+ result = arr.astype("Int64", copy=False)
+ assert result is arr
+ assert np.shares_memory(result._data, arr._data)
+ assert np.shares_memory(result._mask, arr._mask)
+ result[0] = 10
+ assert arr[0] == 10
+ result[0] = pd.NA
+ assert arr[0] is pd.NA
+
+ # astype to different dtype -> always needs a copy -> even with copy=False
+ # we need to ensure that also the mask is actually copied
+ arr = pd.array([1, 2, 3, None], dtype="Int64")
+ orig = pd.array([1, 2, 3, None], dtype="Int64")
+
+ result = arr.astype("Int32", copy=False)
+ assert not np.shares_memory(result._data, arr._data)
+ assert not np.shares_memory(result._mask, arr._mask)
+ result[0] = 10
+ tm.assert_extension_array_equal(arr, orig)
+ result[0] = pd.NA
+ tm.assert_extension_array_equal(arr, orig)
+
+
def test_astype_to_larger_numpy():
a = pd.array([1, 2], dtype="Int32")
result = a.astype("int64")
diff --git a/pandas/tests/arrays/integer/test_function.py b/pandas/tests/arrays/integer/test_function.py
index 44c3077228e80..a81434339fdae 100644
--- a/pandas/tests/arrays/integer/test_function.py
+++ b/pandas/tests/arrays/integer/test_function.py
@@ -133,6 +133,15 @@ def test_integer_array_numpy_sum(values, expected):
assert result == expected
+@pytest.mark.parametrize("op", ["sum", "prod", "min", "max"])
+def test_dataframe_reductions(op):
+ # https://github.com/pandas-dev/pandas/pull/32867
+ # ensure the integers are not cast to float during reductions
+ df = pd.DataFrame({"a": pd.array([1, 2], dtype="Int64")})
+ result = df.max()
+ assert isinstance(result["a"], np.int64)
+
+
# TODO(jreback) - these need testing / are broken
# shift
diff --git a/pandas/tests/arrays/interval/test_interval.py b/pandas/tests/arrays/interval/test_interval.py
index d517eaaec68d2..e5ccb51ce36f5 100644
--- a/pandas/tests/arrays/interval/test_interval.py
+++ b/pandas/tests/arrays/interval/test_interval.py
@@ -105,6 +105,10 @@ def test_set_na(self, left_right_dtypes):
left, right = left_right_dtypes
result = IntervalArray.from_arrays(left, right)
+ if result.dtype.subtype.kind not in ["m", "M"]:
+ msg = "'value' should be an interval type, got <.*NaTType'> instead."
+ with pytest.raises(TypeError, match=msg):
+ result[0] = pd.NaT
if result.dtype.subtype.kind in ["i", "u"]:
msg = "Cannot set float NaN to integer-backed IntervalArray"
with pytest.raises(ValueError, match=msg):
@@ -142,6 +146,7 @@ def test_repr():
@pyarrow_skip
def test_arrow_extension_type():
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowIntervalType
p1 = ArrowIntervalType(pa.int64(), "left")
@@ -158,6 +163,7 @@ def test_arrow_extension_type():
@pyarrow_skip
def test_arrow_array():
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowIntervalType
intervals = pd.interval_range(1, 5, freq=1).array
@@ -187,6 +193,7 @@ def test_arrow_array():
@pyarrow_skip
def test_arrow_array_missing():
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowIntervalType
arr = IntervalArray.from_breaks([0.0, 1.0, 2.0, 3.0])
@@ -221,6 +228,7 @@ def test_arrow_array_missing():
)
def test_arrow_table_roundtrip(breaks):
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowIntervalType
arr = IntervalArray.from_breaks(breaks)
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index d0cdec712f39d..ece9367cea7fe 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -194,8 +194,7 @@ def test_constructor_inferred_fill_value(self, data, fill_value):
@pytest.mark.parametrize("format", ["coo", "csc", "csr"])
@pytest.mark.parametrize(
- "size",
- [pytest.param(0, marks=td.skip_if_np_lt("1.16", reason="NumPy-11383")), 10],
+ "size", [0, 10],
)
@td.skip_if_no_scipy
def test_from_spmatrix(self, size, format):
@@ -281,6 +280,11 @@ def test_take(self):
exp = SparseArray(np.take(self.arr_data, [0, 1, 2]))
tm.assert_sp_array_equal(self.arr.take([0, 1, 2]), exp)
+ def test_take_all_empty(self):
+ a = pd.array([0, 0], dtype=pd.SparseDtype("int64"))
+ result = a.take([0, 1], allow_fill=True, fill_value=np.nan)
+ tm.assert_sp_array_equal(a, result)
+
def test_take_fill_value(self):
data = np.array([1, np.nan, 0, 3, 0])
sparse = SparseArray(data, fill_value=0)
@@ -899,7 +903,6 @@ def test_all(self, data, pos, neg):
([1.0, 2.0, 1.0], 1.0, 0.0),
],
)
- @td.skip_if_np_lt("1.15") # prior didn't dispatch
def test_numpy_all(self, data, pos, neg):
# GH 17570
out = np.all(SparseArray(data))
@@ -951,7 +954,6 @@ def test_any(self, data, pos, neg):
([0.0, 2.0, 0.0], 2.0, 0.0),
],
)
- @td.skip_if_np_lt("1.15") # prior didn't dispatch
def test_numpy_any(self, data, pos, neg):
# GH 17570
out = np.any(SparseArray(data))
diff --git a/pandas/tests/arrays/sparse/test_libsparse.py b/pandas/tests/arrays/sparse/test_libsparse.py
index a2f861d378e67..2d6e657debdb2 100644
--- a/pandas/tests/arrays/sparse/test_libsparse.py
+++ b/pandas/tests/arrays/sparse/test_libsparse.py
@@ -8,7 +8,7 @@
from pandas import Series
import pandas._testing as tm
-from pandas.core.arrays.sparse import BlockIndex, IntIndex, _make_index
+from pandas.core.arrays.sparse import BlockIndex, IntIndex, make_sparse_index
TEST_LENGTH = 20
@@ -273,41 +273,43 @@ def test_intersect_identical(self):
class TestSparseIndexCommon:
def test_int_internal(self):
- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
- idx = _make_index(4, np.array([], dtype=np.int32), kind="integer")
+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer")
+ idx = make_sparse_index(
+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
+ )
assert isinstance(idx, IntIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
def test_block_internal(self):
- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
- idx = _make_index(4, np.array([], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 3
tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
@@ -315,7 +317,7 @@ def test_block_internal(self):
def test_lookup(self):
for kind in ["integer", "block"]:
- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == -1
assert idx.lookup(1) == -1
@@ -323,12 +325,14 @@ def test_lookup(self):
assert idx.lookup(3) == 1
assert idx.lookup(4) == -1
- idx = _make_index(4, np.array([], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
for i in range(-1, 5):
assert idx.lookup(i) == -1
- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(
+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind
+ )
assert idx.lookup(-1) == -1
assert idx.lookup(0) == 0
assert idx.lookup(1) == 1
@@ -336,7 +340,7 @@ def test_lookup(self):
assert idx.lookup(3) == 3
assert idx.lookup(4) == -1
- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
assert idx.lookup(-1) == -1
assert idx.lookup(0) == 0
assert idx.lookup(1) == -1
@@ -346,7 +350,7 @@ def test_lookup(self):
def test_lookup_array(self):
for kind in ["integer", "block"]:
- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
exp = np.array([-1, -1, 0], dtype=np.int32)
@@ -356,11 +360,13 @@ def test_lookup_array(self):
exp = np.array([-1, 0, -1, 1], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
- idx = _make_index(4, np.array([], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([-1, 0, 2, 4], dtype=np.int32))
exp = np.array([-1, -1, -1, -1], dtype=np.int32)
- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(
+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind=kind
+ )
res = idx.lookup_array(np.array([-1, 0, 2], dtype=np.int32))
exp = np.array([-1, 0, 2], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
@@ -369,7 +375,7 @@ def test_lookup_array(self):
exp = np.array([-1, 2, 1, 3], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind=kind)
res = idx.lookup_array(np.array([2, 1, 3, 0], dtype=np.int32))
exp = np.array([1, -1, 2, 0], dtype=np.int32)
tm.assert_numpy_array_equal(res, exp)
@@ -402,25 +408,25 @@ def _check(index):
class TestBlockIndex:
def test_block_internal(self):
- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.blocs, np.array([2], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([2], dtype=np.int32))
- idx = _make_index(4, np.array([], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.blocs, np.array([], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([], dtype=np.int32))
- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.blocs, np.array([0], dtype=np.int32))
tm.assert_numpy_array_equal(idx.blengths, np.array([4], dtype=np.int32))
- idx = _make_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
+ idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
assert isinstance(idx, BlockIndex)
assert idx.npoints == 3
tm.assert_numpy_array_equal(idx.blocs, np.array([0, 2], dtype=np.int32))
@@ -428,7 +434,7 @@ def test_block_internal(self):
def test_make_block_boundary(self):
for i in [5, 10, 100, 101]:
- idx = _make_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")
+ idx = make_sparse_index(i, np.arange(0, i, 2, dtype=np.int32), kind="block")
exp = np.arange(0, i, 2, dtype=np.int32)
tm.assert_numpy_array_equal(idx.blocs, exp)
@@ -514,17 +520,19 @@ def test_check_integrity(self):
IntIndex(length=5, indices=[1, 3, 3])
def test_int_internal(self):
- idx = _make_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
+ idx = make_sparse_index(4, np.array([2, 3], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 2
tm.assert_numpy_array_equal(idx.indices, np.array([2, 3], dtype=np.int32))
- idx = _make_index(4, np.array([], dtype=np.int32), kind="integer")
+ idx = make_sparse_index(4, np.array([], dtype=np.int32), kind="integer")
assert isinstance(idx, IntIndex)
assert idx.npoints == 0
tm.assert_numpy_array_equal(idx.indices, np.array([], dtype=np.int32))
- idx = _make_index(4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer")
+ idx = make_sparse_index(
+ 4, np.array([0, 1, 2, 3], dtype=np.int32), kind="integer"
+ )
assert isinstance(idx, IntIndex)
assert idx.npoints == 4
tm.assert_numpy_array_equal(idx.indices, np.array([0, 1, 2, 3], dtype=np.int32))
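
# Illustrative sketch (not part of the patch): the two kinds produced by
# the renamed make_sparse_index, as exercised above. "integer" stores the
# explicit positions; "block" compresses them into (start, length) runs.
import numpy as np
from pandas.core.arrays.sparse import make_sparse_index

idx = make_sparse_index(4, np.array([0, 2, 3], dtype=np.int32), kind="block")
# idx.blocs -> array([0, 2]); idx.blengths -> array([1, 2])
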
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 6f9a1a5be4c43..56a8e21edd004 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -206,12 +206,16 @@ def test_constructor_raises():
@pytest.mark.parametrize("copy", [True, False])
def test_from_sequence_no_mutate(copy):
- a = np.array(["a", np.nan], dtype=object)
- original = a.copy()
- result = pd.arrays.StringArray._from_sequence(a, copy=copy)
- expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
+ nan_arr = np.array(["a", np.nan], dtype=object)
+ na_arr = np.array(["a", pd.NA], dtype=object)
+
+ result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
+ expected = pd.arrays.StringArray(na_arr)
+
tm.assert_extension_array_equal(result, expected)
- tm.assert_numpy_array_equal(a, original)
+
+ expected = nan_arr if copy else na_arr
+ tm.assert_numpy_array_equal(nan_arr, expected)
def test_astype_int():
@@ -332,3 +336,12 @@ def test_memory_usage():
series = pd.Series(["a", "b", "c"], dtype="string")
assert 0 < series.nbytes <= series.memory_usage() < series.memory_usage(deep=True)
+
+
+@pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+def test_astype_from_float_dtype(dtype):
+ # https://github.com/pandas-dev/pandas/issues/36451
+ s = pd.Series([0.1], dtype=dtype)
+ result = s.astype("string")
+ expected = pd.Series(["0.1"], dtype="string")
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py
index ad6e6e4a98057..304e1c80a3f77 100644
--- a/pandas/tests/arrays/test_array.py
+++ b/pandas/tests/arrays/test_array.py
@@ -5,7 +5,7 @@
import pytest
import pytz
-from pandas.core.dtypes.dtypes import registry
+from pandas.core.dtypes.base import registry
import pandas as pd
import pandas._testing as tm
@@ -35,7 +35,7 @@
np.dtype("float32"),
PandasArray(np.array([1.0, 2.0], dtype=np.dtype("float32"))),
),
- (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2]),),
+ (np.array([1, 2], dtype="int64"), None, IntegerArray._from_sequence([1, 2])),
# String alias passes through to NumPy
([1, 2], "float32", PandasArray(np.array([1, 2], dtype="float32"))),
# Period alias
@@ -120,10 +120,10 @@
(pd.Series([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# String
(["a", None], "string", StringArray._from_sequence(["a", None])),
- (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None]),),
+ (["a", None], pd.StringDtype(), StringArray._from_sequence(["a", None])),
# Boolean
([True, None], "boolean", BooleanArray._from_sequence([True, None])),
- ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None]),),
+ ([True, None], pd.BooleanDtype(), BooleanArray._from_sequence([True, None])),
# Index
(pd.Index([1, 2]), None, PandasArray(np.array([1, 2], dtype=np.int64))),
# Series[EA] returns the EA
@@ -174,7 +174,7 @@ def test_array_copy():
period_array(["2000", "2001"], freq="D"),
),
# interval
- ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2]),),
+ ([pd.Interval(0, 1), pd.Interval(1, 2)], IntervalArray.from_breaks([0, 1, 2])),
# datetime
(
[pd.Timestamp("2000"), pd.Timestamp("2001")],
diff --git a/pandas/tests/arrays/test_datetimelike.py b/pandas/tests/arrays/test_datetimelike.py
index b1ab700427c28..f512b168d2795 100644
--- a/pandas/tests/arrays/test_datetimelike.py
+++ b/pandas/tests/arrays/test_datetimelike.py
@@ -2,9 +2,10 @@
import numpy as np
import pytest
+import pytz
from pandas._libs import OutOfBoundsDatetime
-from pandas.compat.numpy import _np_version_under1p18
+from pandas.compat.numpy import np_version_under1p18
import pandas as pd
import pandas._testing as tm
@@ -241,10 +242,56 @@ def test_searchsorted(self):
expected = np.array([2, 3], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
- # Following numpy convention, NaT goes at the beginning
- # (unlike NaN which goes at the end)
+ # GH#29884 match numpy convention on whether NaT goes
+ # at the end or the beginning
result = arr.searchsorted(pd.NaT)
- assert result == 0
+ if np_version_under1p18:
+ # Following numpy convention, NaT goes at the beginning
+ # (unlike NaN which goes at the end)
+ assert result == 0
+ else:
+ assert result == 10
+
+ @pytest.mark.parametrize("box", [None, "index", "series"])
+ def test_searchsorted_castable_strings(self, arr1d, box):
+ if isinstance(arr1d, DatetimeArray):
+ tz = arr1d.tz
+ if (
+ tz is not None
+ and tz is not pytz.UTC
+ and not isinstance(tz, pytz._FixedOffset)
+ ):
+ # If we have e.g. tzutc(), when we cast to string and parse
+ # back we get pytz.UTC, and then consider them different timezones
+ # so incorrectly raise.
+ pytest.xfail(reason="timezone comparisons inconsistent")
+
+ arr = arr1d
+ if box is None:
+ pass
+ elif box == "index":
+ # Test the equivalent Index.searchsorted method while we're here
+ arr = self.index_cls(arr)
+ else:
+ # Test the equivalent Series.searchsorted method while we're here
+ arr = pd.Series(arr)
+
+ # scalar
+ result = arr.searchsorted(str(arr[1]))
+ assert result == 1
+
+ result = arr.searchsorted(str(arr[2]), side="right")
+ assert result == 3
+
+ result = arr.searchsorted([str(x) for x in arr[1:3]])
+ expected = np.array([1, 2], dtype=np.intp)
+ tm.assert_numpy_array_equal(result, expected)
+
+ with pytest.raises(TypeError):
+ arr.searchsorted("foo")
+
+ with pytest.raises(TypeError):
+ arr.searchsorted([str(arr[1]), "baz"])
def test_getitem_2d(self, arr1d):
# 2d slicing on a 1D array
@@ -277,15 +324,35 @@ def test_setitem(self):
expected[:2] = expected[-2:]
tm.assert_numpy_array_equal(arr.asi8, expected)
- def test_setitem_str_array(self, arr1d):
- if isinstance(arr1d, DatetimeArray) and arr1d.tz is not None:
- pytest.xfail(reason="timezone comparisons inconsistent")
+ def test_setitem_strs(self, arr1d):
+ # Check that we parse strs in both scalar and listlike
+ if isinstance(arr1d, DatetimeArray):
+ tz = arr1d.tz
+ if (
+ tz is not None
+ and tz is not pytz.UTC
+ and not isinstance(tz, pytz._FixedOffset)
+ ):
+ # If we have e.g. tzutc(), when we cast to string and parse
+ # back we get pytz.UTC, and then consider them different timezones
+ # so incorrectly raise.
+ pytest.xfail(reason="timezone comparisons inconsistent")
+
+ # Setting list-like of strs
expected = arr1d.copy()
expected[[0, 1]] = arr1d[-2:]
- arr1d[:2] = [str(x) for x in arr1d[-2:]]
+ result = arr1d.copy()
+ result[:2] = [str(x) for x in arr1d[-2:]]
+ tm.assert_equal(result, expected)
- tm.assert_equal(arr1d, expected)
+ # Same thing but now for just a scalar str
+ expected = arr1d.copy()
+ expected[0] = arr1d[-1]
+
+ result = arr1d.copy()
+ result[0] = str(arr1d[-1])
+ tm.assert_equal(result, expected)
@pytest.mark.parametrize("as_index", [True, False])
def test_setitem_categorical(self, arr1d, as_index):
@@ -312,6 +379,16 @@ def test_setitem_raises(self):
with pytest.raises(TypeError, match="'value' should be a.* 'object'"):
arr[0] = object()
+ msg = "cannot set using a list-like indexer with a different length"
+ with pytest.raises(ValueError, match=msg):
+ # GH#36339
+ arr[[]] = [arr[1]]
+
+ msg = "cannot set using a slice indexer with a different length than"
+ with pytest.raises(ValueError, match=msg):
+ # GH#36339
+ arr[1:1] = arr[:3]
+
@pytest.mark.parametrize("box", [list, np.array, pd.Index, pd.Series])
def test_setitem_numeric_raises(self, arr1d, box):
# We don't cast e.g. int64 to our own dtype for setitem
@@ -955,7 +1032,7 @@ def test_invalid_nat_setitem_array(array, non_casting_nats):
],
)
def test_to_numpy_extra(array):
- if _np_version_under1p18:
+ if np_version_under1p18:
# np.isnan(NaT) raises, so use pandas'
isnan = pd.isna
else:
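
# Illustrative sketch (not part of the patch) of the NumPy convention change
# tracked above (GH#29884): from NumPy 1.18 on, NaT sorts to the end like
# NaN, so searchsorted(NaT) points past the valid values instead of to 0.
import numpy as np

arr = np.array(["2020-01-02", "NaT", "2020-01-01"], dtype="M8[ns]")
np.sort(arr)  # NaT last on NumPy >= 1.18, first on older versions
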
diff --git a/pandas/tests/arrays/test_datetimes.py b/pandas/tests/arrays/test_datetimes.py
index 804654451a6d9..53f26de09f94e 100644
--- a/pandas/tests/arrays/test_datetimes.py
+++ b/pandas/tests/arrays/test_datetimes.py
@@ -197,6 +197,29 @@ def test_tz_setter_raises(self):
with pytest.raises(AttributeError, match="tz_localize"):
arr.tz = "UTC"
+ def test_setitem_str_impute_tz(self, tz_naive_fixture):
+ # Like for getitem, if we are passed a naive-like string, we impute
+ # our own timezone.
+ tz = tz_naive_fixture
+
+ data = np.array([1, 2, 3], dtype="M8[ns]")
+ dtype = data.dtype if tz is None else DatetimeTZDtype(tz=tz)
+ arr = DatetimeArray(data, dtype=dtype)
+ expected = arr.copy()
+
+ ts = pd.Timestamp("2020-09-08 16:50").tz_localize(tz)
+ setter = str(ts.tz_localize(None))
+
+ # Setting a scalar tznaive string
+ expected[0] = ts
+ arr[0] = setter
+ tm.assert_equal(arr, expected)
+
+ # Setting a listlike of tznaive strings
+ expected[1] = ts
+ arr[:2] = [setter, setter]
+ tm.assert_equal(arr, expected)
+
def test_setitem_different_tz_raises(self):
data = np.array([1, 2, 3], dtype="M8[ns]")
arr = DatetimeArray(data, copy=False, dtype=DatetimeTZDtype(tz="US/Central"))
diff --git a/pandas/tests/arrays/test_period.py b/pandas/tests/arrays/test_period.py
index 27e6334788284..0d81e8e733842 100644
--- a/pandas/tests/arrays/test_period.py
+++ b/pandas/tests/arrays/test_period.py
@@ -5,7 +5,8 @@
from pandas._libs.tslibs.period import IncompatibleFrequency
import pandas.util._test_decorators as td
-from pandas.core.dtypes.dtypes import PeriodDtype, registry
+from pandas.core.dtypes.base import registry
+from pandas.core.dtypes.dtypes import PeriodDtype
import pandas as pd
import pandas._testing as tm
@@ -358,6 +359,7 @@ def test_arrow_extension_type():
)
def test_arrow_array(data, freq):
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowPeriodType
periods = period_array(data, freq=freq)
@@ -383,6 +385,7 @@ def test_arrow_array(data, freq):
@pyarrow_skip
def test_arrow_array_missing():
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowPeriodType
arr = PeriodArray([1, 2, 3], freq="D")
@@ -398,6 +401,7 @@ def test_arrow_array_missing():
@pyarrow_skip
def test_arrow_table_roundtrip():
import pyarrow as pa
+
from pandas.core.arrays._arrow_utils import ArrowPeriodType
arr = PeriodArray([1, 2, 3], freq="D")
diff --git a/pandas/tests/arrays/test_timedeltas.py b/pandas/tests/arrays/test_timedeltas.py
index c86b4f71ee592..a32529cb58ba3 100644
--- a/pandas/tests/arrays/test_timedeltas.py
+++ b/pandas/tests/arrays/test_timedeltas.py
@@ -46,7 +46,7 @@ def test_incorrect_dtype_raises(self):
TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype="category")
with pytest.raises(
- ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]",
+ ValueError, match=r"dtype int64 cannot be converted to timedelta64\[ns\]"
):
TimedeltaArray(np.array([1, 2, 3], dtype="i8"), dtype=np.dtype("int64"))
diff --git a/pandas/tests/base/test_conversion.py b/pandas/tests/base/test_conversion.py
index b688a048cbe8e..b5595ba220a15 100644
--- a/pandas/tests/base/test_conversion.py
+++ b/pandas/tests/base/test_conversion.py
@@ -183,7 +183,7 @@ def test_iter_box(self):
PeriodArray,
pd.core.dtypes.dtypes.PeriodDtype("A-DEC"),
),
- (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval",),
+ (pd.IntervalIndex.from_breaks([0, 1, 2]), IntervalArray, "interval"),
# This test is currently failing for datetime64[ns] and timedelta64[ns].
# The NumPy type system is sufficient for representing these types, so
# we just use NumPy for Series / DataFrame columns of these types (so
@@ -285,10 +285,7 @@ def test_array_multiindex_raises():
pd.core.arrays.period_array(["2000", "2001"], freq="D"),
np.array([pd.Period("2000", freq="D"), pd.Period("2001", freq="D")]),
),
- (
- pd.core.arrays.integer_array([0, np.nan]),
- np.array([0, pd.NA], dtype=object),
- ),
+ (pd.core.arrays.integer_array([0, np.nan]), np.array([0, pd.NA], dtype=object)),
(
IntervalArray.from_breaks([0, 1, 2]),
np.array([pd.Interval(0, 1), pd.Interval(1, 2)], dtype=object),
diff --git a/pandas/tests/base/test_factorize.py b/pandas/tests/base/test_factorize.py
index 415a8b7e4362f..f8cbadb987d29 100644
--- a/pandas/tests/base/test_factorize.py
+++ b/pandas/tests/base/test_factorize.py
@@ -26,3 +26,16 @@ def test_factorize(index_or_series_obj, sort):
tm.assert_numpy_array_equal(result_codes, expected_codes)
tm.assert_index_equal(result_uniques, expected_uniques)
+
+
+def test_series_factorize_na_sentinel_none():
+ # GH35667
+ values = np.array([1, 2, 1, np.nan])
+ ser = pd.Series(values)
+ codes, uniques = ser.factorize(na_sentinel=None)
+
+ expected_codes = np.array([0, 1, 0, 2], dtype=np.intp)
+ expected_uniques = pd.Index([1.0, 2.0, np.nan])
+
+ tm.assert_numpy_array_equal(codes, expected_codes)
+ tm.assert_index_equal(uniques, expected_uniques)
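
# Illustrative sketch (not part of the patch) of GH#35667 as tested above:
# with na_sentinel=None, missing values receive a real code and appear in
# the uniques instead of being encoded as -1.
import numpy as np
import pandas as pd

codes, uniques = pd.Series([1, 2, 1, np.nan]).factorize(na_sentinel=None)
# codes -> [0, 1, 0, 2]; uniques -> Float64Index([1.0, 2.0, nan])
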
diff --git a/pandas/tests/base/test_misc.py b/pandas/tests/base/test_misc.py
index 78a830c7f43d8..b8468a5acf277 100644
--- a/pandas/tests/base/test_misc.py
+++ b/pandas/tests/base/test_misc.py
@@ -99,7 +99,7 @@ def test_ndarray_compat_properties(index_or_series_obj):
assert getattr(obj, p, None) is not None
# deprecated properties
- for p in ["flags", "strides", "itemsize", "base", "data"]:
+ for p in ["strides", "itemsize", "base", "data"]:
assert not hasattr(obj, p)
msg = "can only convert an array of size 1 to a Python scalar"
@@ -116,6 +116,7 @@ def test_ndarray_compat_properties(index_or_series_obj):
@pytest.mark.skipif(PYPY, reason="not relevant for PyPy")
def test_memory_usage(index_or_series_obj):
obj = index_or_series_obj
+
res = obj.memory_usage()
res_deep = obj.memory_usage(deep=True)
@@ -200,4 +201,4 @@ def test_get_indexer_non_unique_dtype_mismatch():
# GH 25459
indexes, missing = pd.Index(["A", "B"]).get_indexer_non_unique(pd.Index([0]))
tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes)
- tm.assert_numpy_array_equal(np.array([0], dtype=np.int64), missing)
+ tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing)
diff --git a/pandas/tests/base/test_value_counts.py b/pandas/tests/base/test_value_counts.py
index de04c30432e6f..2b8a918505255 100644
--- a/pandas/tests/base/test_value_counts.py
+++ b/pandas/tests/base/test_value_counts.py
@@ -208,7 +208,7 @@ def test_value_counts_datetime64(index_or_series):
)
f = StringIO(txt)
df = pd.read_fwf(
- f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"],
+ f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]
)
s = klass(df["dt"].copy())
diff --git a/pandas/tests/computation/test_compat.py b/pandas/tests/computation/test_compat.py
index b3fbd8c17d8bf..9fc3ed4800d09 100644
--- a/pandas/tests/computation/test_compat.py
+++ b/pandas/tests/computation/test_compat.py
@@ -5,29 +5,29 @@
from pandas.compat._optional import VERSIONS
import pandas as pd
-from pandas.core.computation.engines import _engines
+from pandas.core.computation.engines import ENGINES
import pandas.core.computation.expr as expr
def test_compat():
# test we have compat with our version of numexpr
- from pandas.core.computation.check import _NUMEXPR_INSTALLED
+ from pandas.core.computation.check import NUMEXPR_INSTALLED
try:
import numexpr as ne
ver = ne.__version__
if LooseVersion(ver) < LooseVersion(VERSIONS["numexpr"]):
- assert not _NUMEXPR_INSTALLED
+ assert not NUMEXPR_INSTALLED
else:
- assert _NUMEXPR_INSTALLED
+ assert NUMEXPR_INSTALLED
except ImportError:
pytest.skip("not testing numexpr version compat")
-@pytest.mark.parametrize("engine", _engines)
-@pytest.mark.parametrize("parser", expr._parsers)
+@pytest.mark.parametrize("engine", ENGINES)
+@pytest.mark.parametrize("parser", expr.PARSERS)
def test_invalid_numexpr_version(engine, parser):
def testit():
a, b = 1, 2 # noqa
diff --git a/pandas/tests/computation/test_eval.py b/pandas/tests/computation/test_eval.py
index 08d8d5ca342b7..cca64a6bf487c 100644
--- a/pandas/tests/computation/test_eval.py
+++ b/pandas/tests/computation/test_eval.py
@@ -18,20 +18,20 @@
from pandas import DataFrame, Series, compat, date_range
import pandas._testing as tm
from pandas.core.computation import pytables
-from pandas.core.computation.check import _NUMEXPR_VERSION
-from pandas.core.computation.engines import NumExprClobberingError, _engines
+from pandas.core.computation.check import NUMEXPR_VERSION
+from pandas.core.computation.engines import ENGINES, NumExprClobberingError
import pandas.core.computation.expr as expr
from pandas.core.computation.expr import (
BaseExprVisitor,
PandasExprVisitor,
PythonExprVisitor,
)
-from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR
+from pandas.core.computation.expressions import NUMEXPR_INSTALLED, USE_NUMEXPR
from pandas.core.computation.ops import (
- _arith_ops_syms,
+ ARITH_OPS_SYMS,
+ SPECIAL_CASE_ARITH_OPS_SYMS,
_binary_math_ops,
_binary_ops_dict,
- _special_case_arith_ops_syms,
_unary_math_ops,
)
@@ -41,34 +41,34 @@
pytest.param(
engine,
marks=pytest.mark.skipif(
- engine == "numexpr" and not _USE_NUMEXPR,
- reason=f"numexpr enabled->{_USE_NUMEXPR}, "
- f"installed->{_NUMEXPR_INSTALLED}",
+ engine == "numexpr" and not USE_NUMEXPR,
+ reason=f"numexpr enabled->{USE_NUMEXPR}, "
+ f"installed->{NUMEXPR_INSTALLED}",
),
)
- for engine in _engines
+ for engine in ENGINES
)
) # noqa
def engine(request):
return request.param
-@pytest.fixture(params=expr._parsers)
+@pytest.fixture(params=expr.PARSERS)
def parser(request):
return request.param
@pytest.fixture
def ne_lt_2_6_9():
- if _NUMEXPR_INSTALLED and _NUMEXPR_VERSION >= LooseVersion("2.6.9"):
+ if NUMEXPR_INSTALLED and NUMEXPR_VERSION >= LooseVersion("2.6.9"):
pytest.skip("numexpr is >= 2.6.9")
return "numexpr"
@pytest.fixture
def unary_fns_for_ne():
- if _NUMEXPR_INSTALLED:
- if _NUMEXPR_VERSION >= LooseVersion("2.6.9"):
+ if NUMEXPR_INSTALLED:
+ if NUMEXPR_VERSION >= LooseVersion("2.6.9"):
return _unary_math_ops
else:
return tuple(x for x in _unary_math_ops if x not in ("floor", "ceil"))
@@ -77,7 +77,7 @@ def unary_fns_for_ne():
def engine_has_neg_frac(engine):
- return _engines[engine].has_neg_frac
+ return ENGINES[engine].has_neg_frac
def _eval_single_bin(lhs, cmp1, rhs, engine):
@@ -114,7 +114,7 @@ def _is_py3_complex_incompat(result, expected):
return isinstance(expected, (complex, np.complexfloating)) and np.isnan(result)
-_good_arith_ops = set(_arith_ops_syms).difference(_special_case_arith_ops_syms)
+_good_arith_ops = set(ARITH_OPS_SYMS).difference(SPECIAL_CASE_ARITH_OPS_SYMS)
@td.skip_if_no_ne
@@ -158,17 +158,17 @@ def setup_data(self):
self.rhses = self.pandas_rhses + self.scalar_rhses
def setup_ops(self):
- self.cmp_ops = expr._cmp_ops_syms
+ self.cmp_ops = expr.CMP_OPS_SYMS
self.cmp2_ops = self.cmp_ops[::-1]
- self.bin_ops = expr._bool_ops_syms
- self.special_case_ops = _special_case_arith_ops_syms
+ self.bin_ops = expr.BOOL_OPS_SYMS
+ self.special_case_ops = SPECIAL_CASE_ARITH_OPS_SYMS
self.arith_ops = _good_arith_ops
self.unary_ops = "-", "~", "not "
def setup_method(self, method):
self.setup_ops()
self.setup_data()
- self.current_engines = filter(lambda x: x != self.engine, _engines)
+ self.current_engines = (engine for engine in ENGINES if engine != self.engine)
def teardown_method(self, method):
del self.lhses, self.rhses, self.scalar_rhses, self.scalar_lhses
@@ -774,12 +774,10 @@ def setup_class(cls):
cls.parser = "python"
def setup_ops(self):
- self.cmp_ops = list(
- filter(lambda x: x not in ("in", "not in"), expr._cmp_ops_syms)
- )
+ self.cmp_ops = [op for op in expr.CMP_OPS_SYMS if op not in ("in", "not in")]
self.cmp2_ops = self.cmp_ops[::-1]
- self.bin_ops = [s for s in expr._bool_ops_syms if s not in ("and", "or")]
- self.special_case_ops = _special_case_arith_ops_syms
+ self.bin_ops = [op for op in expr.BOOL_OPS_SYMS if op not in ("and", "or")]
+ self.special_case_ops = SPECIAL_CASE_ARITH_OPS_SYMS
self.arith_ops = _good_arith_ops
self.unary_ops = "+", "-", "~"
@@ -1137,7 +1135,7 @@ class TestOperationsNumExprPandas:
def setup_class(cls):
cls.engine = "numexpr"
cls.parser = "pandas"
- cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+ cls.arith_ops = expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS
@classmethod
def teardown_class(cls):
@@ -1150,9 +1148,9 @@ def eval(self, *args, **kwargs):
return pd.eval(*args, **kwargs)
def test_simple_arith_ops(self):
- ops = self.arith_ops
+ ops = (op for op in self.arith_ops if op != "//")
- for op in filter(lambda x: x != "//", ops):
+ for op in ops:
ex = f"1 {op} 1"
ex2 = f"x {op} 1"
ex3 = f"1 {op} (x + 1)"
@@ -1179,7 +1177,7 @@ def test_simple_arith_ops(self):
assert y == expec
def test_simple_bool_ops(self):
- for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)):
+ for op, lhs, rhs in product(expr.BOOL_OPS_SYMS, (True, False), (True, False)):
ex = f"{lhs} {op} {rhs}"
res = self.eval(ex)
exp = eval(ex)
@@ -1187,7 +1185,7 @@ def test_simple_bool_ops(self):
def test_bool_ops_with_constants(self):
for op, lhs, rhs in product(
- expr._bool_ops_syms, ("True", "False"), ("True", "False")
+ expr.BOOL_OPS_SYMS, ("True", "False"), ("True", "False")
):
ex = f"{lhs} {op} {rhs}"
res = self.eval(ex)
@@ -1637,8 +1635,11 @@ def setup_class(cls):
super().setup_class()
cls.engine = "numexpr"
cls.parser = "python"
- cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
- cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops)
+ cls.arith_ops = [
+ op
+ for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS
+ if op not in ("in", "not in")
+ ]
def test_check_many_exprs(self):
a = 1 # noqa
@@ -1696,7 +1697,7 @@ def test_fails_pipe(self):
def test_bool_ops_with_constants(self):
for op, lhs, rhs in product(
- expr._bool_ops_syms, ("True", "False"), ("True", "False")
+ expr.BOOL_OPS_SYMS, ("True", "False"), ("True", "False")
):
ex = f"{lhs} {op} {rhs}"
if op in ("and", "or"):
@@ -1709,7 +1710,7 @@ def test_bool_ops_with_constants(self):
assert res == exp
def test_simple_bool_ops(self):
- for op, lhs, rhs in product(expr._bool_ops_syms, (True, False), (True, False)):
+ for op, lhs, rhs in product(expr.BOOL_OPS_SYMS, (True, False), (True, False)):
ex = f"lhs {op} rhs"
if op in ("and", "or"):
msg = "'BoolOp' nodes are not implemented"
@@ -1726,8 +1727,11 @@ class TestOperationsPythonPython(TestOperationsNumExprPython):
def setup_class(cls):
super().setup_class()
cls.engine = cls.parser = "python"
- cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
- cls.arith_ops = filter(lambda x: x not in ("in", "not in"), cls.arith_ops)
+ cls.arith_ops = [
+ op
+ for op in expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS
+ if op not in ("in", "not in")
+ ]
class TestOperationsPythonPandas(TestOperationsNumExprPandas):
@@ -1736,7 +1740,7 @@ def setup_class(cls):
super().setup_class()
cls.engine = "python"
cls.parser = "pandas"
- cls.arith_ops = expr._arith_ops_syms + expr._cmp_ops_syms
+ cls.arith_ops = expr.ARITH_OPS_SYMS + expr.CMP_OPS_SYMS
@td.skip_if_no_ne
@@ -1917,7 +1921,7 @@ def test_invalid_parser():
}
-@pytest.mark.parametrize("engine", _engines)
+@pytest.mark.parametrize("engine", ENGINES)
@pytest.mark.parametrize("parser", _parsers)
def test_disallowed_nodes(engine, parser):
VisitorClass = _parsers[parser]
@@ -2016,7 +2020,7 @@ def test_equals_various(other):
df = DataFrame({"A": ["a", "b", "c"]})
result = df.eval(f"A == {other}")
expected = Series([False, False, False], name="A")
- if _USE_NUMEXPR:
+ if USE_NUMEXPR:
# https://github.com/pandas-dev/pandas/issues/10239
# lose name with numexpr engine. Remove when that's fixed.
expected.name = None
diff --git a/pandas/tests/dtypes/cast/test_infer_dtype.py b/pandas/tests/dtypes/cast/test_infer_dtype.py
index 70d38aad951cc..157adacbdfdf7 100644
--- a/pandas/tests/dtypes/cast/test_infer_dtype.py
+++ b/pandas/tests/dtypes/cast/test_infer_dtype.py
@@ -84,13 +84,11 @@ def test_infer_dtype_from_period(freq, pandas_dtype):
if pandas_dtype:
exp_dtype = f"period[{freq}]"
- exp_val = p.ordinal
else:
exp_dtype = np.object_
- exp_val = p
assert dtype == exp_dtype
- assert val == exp_val
+ assert val == p
@pytest.mark.parametrize(
diff --git a/pandas/tests/dtypes/test_common.py b/pandas/tests/dtypes/test_common.py
index ce12718e48d0d..2db9a9a403e1c 100644
--- a/pandas/tests/dtypes/test_common.py
+++ b/pandas/tests/dtypes/test_common.py
@@ -649,8 +649,8 @@ def test_is_complex_dtype():
(IntervalDtype(), IntervalDtype()),
],
)
-def test__get_dtype(input_param, result):
- assert com._get_dtype(input_param) == result
+def test_get_dtype(input_param, result):
+ assert com.get_dtype(input_param) == result
@pytest.mark.parametrize(
@@ -664,12 +664,12 @@ def test__get_dtype(input_param, result):
(pd.DataFrame([1, 2]), "data type not understood"),
],
)
-def test__get_dtype_fails(input_param, expected_error_message):
+def test_get_dtype_fails(input_param, expected_error_message):
# python objects
# 2020-02-02 npdev changed error message
expected_error_message += f"|Cannot interpret '{input_param}' as a data type"
with pytest.raises(TypeError, match=expected_error_message):
- com._get_dtype(input_param)
+ com.get_dtype(input_param)
@pytest.mark.parametrize(
@@ -746,3 +746,13 @@ def test_astype_object_preserves_datetime_na(from_type):
result = astype_nansafe(arr, dtype="object")
assert isna(result)[0]
+
+
+def test_validate_allhashable():
+ assert com.validate_all_hashable(1, "a") is None
+
+ with pytest.raises(TypeError, match="All elements must be hashable"):
+ com.validate_all_hashable([])
+
+ with pytest.raises(TypeError, match="list must be a hashable type"):
+ com.validate_all_hashable([], error_name="list")
diff --git a/pandas/tests/dtypes/test_concat.py b/pandas/tests/dtypes/test_concat.py
index 1fbbd3356ae13..53d53e35c6eb5 100644
--- a/pandas/tests/dtypes/test_concat.py
+++ b/pandas/tests/dtypes/test_concat.py
@@ -44,7 +44,7 @@
)
def test_get_dtype_kinds(index_or_series, to_concat, expected):
to_concat_klass = [index_or_series(c) for c in to_concat]
- result = _concat.get_dtype_kinds(to_concat_klass)
+ result = _concat._get_dtype_kinds(to_concat_klass)
assert result == set(expected)
@@ -76,7 +76,7 @@ def test_get_dtype_kinds(index_or_series, to_concat, expected):
],
)
def test_get_dtype_kinds_period(to_concat, expected):
- result = _concat.get_dtype_kinds(to_concat)
+ result = _concat._get_dtype_kinds(to_concat)
assert result == set(expected)
@@ -88,3 +88,14 @@ def test_concat_mismatched_categoricals_with_empty():
result = _concat.concat_compat([ser1._values, ser2._values])
expected = pd.concat([ser1, ser2])._values
tm.assert_categorical_equal(result, expected)
+
+
+@pytest.mark.parametrize("copy", [True, False])
+def test_concat_single_dataframe_tz_aware(copy):
+ # https://github.com/pandas-dev/pandas/issues/25257
+ df = pd.DataFrame(
+ {"timestamp": [pd.Timestamp("2020-04-08 09:00:00.709949+0000", tz="UTC")]}
+ )
+ expected = df.copy()
+ result = pd.concat([df], copy=copy)
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
index b1fe673e9e2f1..a58dc5e5ec74a 100644
--- a/pandas/tests/dtypes/test_dtypes.py
+++ b/pandas/tests/dtypes/test_dtypes.py
@@ -4,6 +4,7 @@
import pytest
import pytz
+from pandas.core.dtypes.base import registry
from pandas.core.dtypes.common import (
is_bool_dtype,
is_categorical,
@@ -22,7 +23,6 @@
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
- registry,
)
import pandas as pd
diff --git a/pandas/tests/dtypes/test_missing.py b/pandas/tests/dtypes/test_missing.py
index f9a854c5778a2..046b82ef3131a 100644
--- a/pandas/tests/dtypes/test_missing.py
+++ b/pandas/tests/dtypes/test_missing.py
@@ -1,3 +1,4 @@
+from contextlib import nullcontext
from datetime import datetime
from decimal import Decimal
@@ -300,48 +301,78 @@ def test_period(self):
tm.assert_series_equal(notna(s), ~exp)
-def test_array_equivalent():
- assert array_equivalent(np.array([np.nan, np.nan]), np.array([np.nan, np.nan]))
+@pytest.mark.parametrize("dtype_equal", [True, False])
+def test_array_equivalent(dtype_equal):
assert array_equivalent(
- np.array([np.nan, 1, np.nan]), np.array([np.nan, 1, np.nan])
+ np.array([np.nan, np.nan]), np.array([np.nan, np.nan]), dtype_equal=dtype_equal
+ )
+ assert array_equivalent(
+ np.array([np.nan, 1, np.nan]),
+ np.array([np.nan, 1, np.nan]),
+ dtype_equal=dtype_equal,
)
assert array_equivalent(
np.array([np.nan, None], dtype="object"),
np.array([np.nan, None], dtype="object"),
+ dtype_equal=dtype_equal,
)
# Check the handling of nested arrays in array_equivalent_object
assert array_equivalent(
np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"),
np.array([np.array([np.nan, None], dtype="object"), None], dtype="object"),
+ dtype_equal=dtype_equal,
)
assert array_equivalent(
np.array([np.nan, 1 + 1j], dtype="complex"),
np.array([np.nan, 1 + 1j], dtype="complex"),
+ dtype_equal=dtype_equal,
)
assert not array_equivalent(
np.array([np.nan, 1 + 1j], dtype="complex"),
np.array([np.nan, 1 + 2j], dtype="complex"),
+ dtype_equal=dtype_equal,
+ )
+ assert not array_equivalent(
+ np.array([np.nan, 1, np.nan]),
+ np.array([np.nan, 2, np.nan]),
+ dtype_equal=dtype_equal,
)
assert not array_equivalent(
- np.array([np.nan, 1, np.nan]), np.array([np.nan, 2, np.nan])
+ np.array(["a", "b", "c", "d"]), np.array(["e", "e"]), dtype_equal=dtype_equal
+ )
+ assert array_equivalent(
+ Float64Index([0, np.nan]), Float64Index([0, np.nan]), dtype_equal=dtype_equal
)
- assert not array_equivalent(np.array(["a", "b", "c", "d"]), np.array(["e", "e"]))
- assert array_equivalent(Float64Index([0, np.nan]), Float64Index([0, np.nan]))
- assert not array_equivalent(Float64Index([0, np.nan]), Float64Index([1, np.nan]))
- assert array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]))
- assert not array_equivalent(DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]))
- assert array_equivalent(TimedeltaIndex([0, np.nan]), TimedeltaIndex([0, np.nan]))
assert not array_equivalent(
- TimedeltaIndex([0, np.nan]), TimedeltaIndex([1, np.nan])
+ Float64Index([0, np.nan]), Float64Index([1, np.nan]), dtype_equal=dtype_equal
+ )
+ assert array_equivalent(
+ DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan]), dtype_equal=dtype_equal
+ )
+ assert not array_equivalent(
+ DatetimeIndex([0, np.nan]), DatetimeIndex([1, np.nan]), dtype_equal=dtype_equal
+ )
+ assert array_equivalent(
+ TimedeltaIndex([0, np.nan]),
+ TimedeltaIndex([0, np.nan]),
+ dtype_equal=dtype_equal,
+ )
+ assert not array_equivalent(
+ TimedeltaIndex([0, np.nan]),
+ TimedeltaIndex([1, np.nan]),
+ dtype_equal=dtype_equal,
)
assert array_equivalent(
DatetimeIndex([0, np.nan], tz="US/Eastern"),
DatetimeIndex([0, np.nan], tz="US/Eastern"),
+ dtype_equal=dtype_equal,
)
assert not array_equivalent(
DatetimeIndex([0, np.nan], tz="US/Eastern"),
DatetimeIndex([1, np.nan], tz="US/Eastern"),
+ dtype_equal=dtype_equal,
)
+ # The rest are not dtype_equal
assert not array_equivalent(
DatetimeIndex([0, np.nan]), DatetimeIndex([0, np.nan], tz="US/Eastern")
)
@@ -353,6 +384,25 @@ def test_array_equivalent():
assert not array_equivalent(DatetimeIndex([0, np.nan]), TimedeltaIndex([0, np.nan]))
+@pytest.mark.parametrize(
+ "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None]
+)
+def test_array_equivalent_series(val):
+ arr = np.array([1, 2])
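+ # only the str case needs a warning context: numpy emits a FutureWarning
+ # for elementwise comparison between an array and a string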
+ cm = (
+ tm.assert_produces_warning(FutureWarning, check_stacklevel=False)
+ if isinstance(val, str)
+ else nullcontext()
+ )
+ with cm:
+ assert not array_equivalent(Series([arr, arr]), Series([arr, val]))
+
+
+def test_array_equivalent_different_dtype_but_equal():
+ # Unclear if this is exposed anywhere in the public-facing API
+ assert array_equivalent(np.array([1, 2]), np.array([1.0, 2.0]))
+
+
@pytest.mark.parametrize(
"lvalue, rvalue",
[
diff --git a/pandas/tests/extension/arrow/arrays.py b/pandas/tests/extension/arrow/arrays.py
index 29cfe1e0fe606..8a18f505058bc 100644
--- a/pandas/tests/extension/arrow/arrays.py
+++ b/pandas/tests/extension/arrow/arrays.py
@@ -162,14 +162,14 @@ def _concat_same_type(cls, to_concat):
def __invert__(self):
return type(self).from_scalars(~self._data.to_pandas())
- def _reduce(self, method, skipna=True, **kwargs):
+ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if skipna:
arr = self[~self.isna()]
else:
arr = self
try:
- op = getattr(arr, method)
+ op = getattr(arr, name)
except AttributeError as err:
raise TypeError from err
return op(**kwargs)
diff --git a/pandas/tests/extension/arrow/test_bool.py b/pandas/tests/extension/arrow/test_bool.py
index 7841360e568ed..12426a0c92c55 100644
--- a/pandas/tests/extension/arrow/test_bool.py
+++ b/pandas/tests/extension/arrow/test_bool.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest
-from pandas.compat import PY37
-
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension import base
@@ -62,13 +60,11 @@ def test_from_dtype(self, data):
def test_from_sequence_from_cls(self, data):
super().test_from_sequence_from_cls(data)
- @pytest.mark.skipif(not PY37, reason="timeout on Linux py36_locale")
@pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
def test_series_constructor_no_data_with_index(self, dtype, na_value):
# pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
super().test_series_constructor_no_data_with_index(dtype, na_value)
- @pytest.mark.skipif(not PY37, reason="timeout on Linux py36_locale")
@pytest.mark.xfail(reason="pa.NULL is not recognised as scalar, GH-33899")
def test_series_constructor_scalar_na_with_index(self, dtype, na_value):
# pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
index 5d0ea69007e27..251376798efc3 100644
--- a/pandas/tests/extension/base/getitem.py
+++ b/pandas/tests/extension/base/getitem.py
@@ -399,31 +399,3 @@ def test_item(self, data):
with pytest.raises(ValueError, match=msg):
s.item()
-
- def test_boolean_mask_frame_fill_value(self, data):
- # https://github.com/pandas-dev/pandas/issues/27781
- df = pd.DataFrame({"A": data})
-
- mask = np.random.choice([True, False], df.shape[0])
- result = pd.isna(df.iloc[mask]["A"])
- expected = pd.isna(df["A"].iloc[mask])
- self.assert_series_equal(result, expected)
-
- mask = pd.Series(mask, index=df.index)
- result = pd.isna(df.loc[mask]["A"])
- expected = pd.isna(df["A"].loc[mask])
- self.assert_series_equal(result, expected)
-
- def test_fancy_index_frame_fill_value(self, data):
- # https://github.com/pandas-dev/pandas/issues/29563
- df = pd.DataFrame({"A": data})
-
- mask = np.random.choice(df.shape[0], df.shape[0])
- result = pd.isna(df.iloc[mask]["A"])
- expected = pd.isna(df["A"].iloc[mask])
- self.assert_series_equal(result, expected)
-
- mask = pd.Series(mask, index=df.index)
- result = pd.isna(df.loc[mask]["A"])
- expected = pd.isna(df["A"].loc[mask])
- self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
index 874a8dfd4253f..23e20a2c0903a 100644
--- a/pandas/tests/extension/base/methods.py
+++ b/pandas/tests/extension/base/methods.py
@@ -75,6 +75,38 @@ def test_argsort_missing(self, data_missing_for_sorting):
expected = pd.Series(np.array([1, -1, 0], dtype=np.int64))
self.assert_series_equal(result, expected)
+ def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting, na_value):
+ # GH 24382
+
+ # data_for_sorting -> [B, C, A] with A < B < C
+ assert data_for_sorting.argmax() == 1
+ assert data_for_sorting.argmin() == 2
+
+ # with repeated values -> first occurrence
+ data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
+ assert data.argmax() == 3
+ assert data.argmin() == 0
+
+ # with missing values
+ # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
+ assert data_missing_for_sorting.argmax() == 0
+ assert data_missing_for_sorting.argmin() == 2
+
+ @pytest.mark.parametrize("method", ["argmax", "argmin"])
+ def test_argmin_argmax_empty_array(self, method, data):
+ # GH 24382
+ err_msg = "attempt to get"
+ with pytest.raises(ValueError, match=err_msg):
+ getattr(data[:0], method)()
+
+ @pytest.mark.parametrize("method", ["argmax", "argmin"])
+ def test_argmin_argmax_all_na(self, method, data, na_value):
+ # all missing with skipna=True is the same as empty
+ err_msg = "attempt to get"
+ data_na = type(data)._from_sequence([na_value, na_value], dtype=data.dtype)
+ with pytest.raises(ValueError, match=err_msg):
+ getattr(data_na, method)()
+
@pytest.mark.parametrize(
"na_position, expected",
[
diff --git a/pandas/tests/extension/base/ops.py b/pandas/tests/extension/base/ops.py
index 359acf230ce14..c93603398977e 100644
--- a/pandas/tests/extension/base/ops.py
+++ b/pandas/tests/extension/base/ops.py
@@ -114,10 +114,13 @@ def test_error(self, data, all_arithmetic_operators):
with pytest.raises(AttributeError):
getattr(data, op_name)
- def test_direct_arith_with_series_returns_not_implemented(self, data):
- # EAs should return NotImplemented for ops with Series.
+ @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
+ def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
+ # EAs should return NotImplemented for ops with Series/DataFrame
# Pandas takes care of unboxing the series and calling the EA's op.
other = pd.Series(data)
+ if box is pd.DataFrame:
+ other = other.to_frame()
if hasattr(data, "__add__"):
result = data.__add__(other)
assert result is NotImplemented
@@ -156,10 +159,14 @@ def test_compare_array(self, data, all_compare_operators):
other = pd.Series([data[0]] * len(data))
self._compare_other(s, data, op_name, other)
- def test_direct_arith_with_series_returns_not_implemented(self, data):
- # EAs should return NotImplemented for ops with Series.
+ @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
+ def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
+ # EAs should return NotImplemented for ops with Series/DataFrame
# Pandas takes care of unboxing the series and calling the EA's op.
other = pd.Series(data)
+ if box is pd.DataFrame:
+ other = other.to_frame()
+
if hasattr(data, "__eq__"):
result = data.__eq__(other)
assert result is NotImplemented
diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
index cd932e842e00c..3774e018a8e51 100644
--- a/pandas/tests/extension/base/reshaping.py
+++ b/pandas/tests/extension/base/reshaping.py
@@ -107,6 +107,19 @@ def test_concat_extension_arrays_copy_false(self, data, na_value):
result = pd.concat([df1, df2], axis=1, copy=False)
self.assert_frame_equal(result, expected)
+ def test_concat_with_reindex(self, data):
+ # GH-33027
+ a = pd.DataFrame({"a": data[:5]})
+ b = pd.DataFrame({"b": data[:5]})
+ result = pd.concat([a, b], ignore_index=True)
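+ # the reindexed halves are padded with NA; take with -1 indices and
+ # allow_fill=True reproduces that padding for the expected frame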
+ expected = pd.DataFrame(
+ {
+ "a": data.take(list(range(5)) + ([-1] * 5), allow_fill=True),
+ "b": data.take(([-1] * 5) + list(range(5)), allow_fill=True),
+ }
+ )
+ self.assert_frame_equal(result, expected)
+
def test_align(self, data, na_value):
a = data[:3]
b = data[2:5]
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
index bfa53ad02525b..a4e6fc0f78cbb 100644
--- a/pandas/tests/extension/base/setitem.py
+++ b/pandas/tests/extension/base/setitem.py
@@ -244,7 +244,10 @@ def test_setitem_expand_with_extension(self, data):
def test_setitem_frame_invalid_length(self, data):
df = pd.DataFrame({"A": [1] * len(data)})
- xpr = "Length of values does not match length of index"
+ xpr = (
+ rf"Length of values \({len(data[:5])}\) "
+ rf"does not match length of index \({len(df)}\)"
+ )
with pytest.raises(ValueError, match=xpr):
df["B"] = data[:5]
diff --git a/pandas/tests/extension/decimal/array.py b/pandas/tests/extension/decimal/array.py
index 4d5be75ff8200..9147360e71c73 100644
--- a/pandas/tests/extension/decimal/array.py
+++ b/pandas/tests/extension/decimal/array.py
@@ -167,14 +167,14 @@ def _na_value(self):
def _formatter(self, boxed=False):
if boxed:
- return "Decimal: {0}".format
+ return "Decimal: {}".format
return repr
@classmethod
def _concat_same_type(cls, to_concat):
return cls(np.concatenate([x._data for x in to_concat]))
- def _reduce(self, name, skipna=True, **kwargs):
+ def _reduce(self, name: str, skipna: bool = True, **kwargs):
if skipna:
# If we don't have any NAs, we can ignore skipna
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 447a6108fc3c7..e3cdeb9c1951f 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -189,7 +189,7 @@ def _concat_same_type(cls, to_concat):
def _values_for_factorize(self):
frozen = self._values_for_argsort()
if len(frozen) == 0:
- # _factorize_array expects 1-d array, this is a len-0 2-d array.
+ # factorize_array expects 1-d array, this is a len-0 2-d array.
frozen = frozen.ravel()
return frozen, ()
diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py
index 725067951eeef..8acbeaf0b8170 100644
--- a/pandas/tests/extension/test_boolean.py
+++ b/pandas/tests/extension/test_boolean.py
@@ -235,6 +235,23 @@ def test_searchsorted(self, data_for_sorting, as_series):
def test_value_counts(self, all_data, dropna):
return super().test_value_counts(all_data, dropna)
+ def test_argmin_argmax(self, data_for_sorting, data_missing_for_sorting):
+ # override because there are only 2 unique values
+
+ # data_for_sorting -> [B, C, A] with A < B < C -> here True, True, False
+ assert data_for_sorting.argmax() == 0
+ assert data_for_sorting.argmin() == 2
+
+ # with repeated values -> first occurrence
+ data = data_for_sorting.take([2, 0, 0, 1, 1, 2])
+ assert data.argmax() == 1
+ assert data.argmin() == 0
+
+ # with missing values
+ # data_missing_for_sorting -> [B, NA, A] with A < B and NA missing.
+ assert data_missing_for_sorting.argmax() == 0
+ assert data_missing_for_sorting.argmin() == 2
+
class TestCasting(base.BaseCastingTests):
pass
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
index d1211e477fe3e..7d03dadb20dd9 100644
--- a/pandas/tests/extension/test_categorical.py
+++ b/pandas/tests/extension/test_categorical.py
@@ -93,7 +93,8 @@ class TestConstructors(base.BaseConstructorsTests):
class TestReshaping(base.BaseReshapingTests):
- pass
+ def test_concat_with_reindex(self, data):
+ pytest.xfail(reason="Deliberately upcast to object?")
class TestGetitem(base.BaseGetitemTests):
@@ -136,7 +137,7 @@ def test_combine_add(self, data_repeated):
s2 = pd.Series(orig_data2)
result = s1.combine(s2, lambda x1, x2: x1 + x2)
expected = pd.Series(
- ([a + b for (a, b) in zip(list(orig_data1), list(orig_data2))])
+ [a + b for (a, b) in zip(list(orig_data1), list(orig_data2))]
)
self.assert_series_equal(result, expected)
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 78000c0252375..bbfaacae1b444 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -1,8 +1,6 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p16
-
import pandas as pd
import pandas._testing as tm
from pandas.core.arrays.numpy_ import PandasArray, PandasDtype
@@ -46,11 +44,7 @@ def data(allow_in_pandas, dtype):
@pytest.fixture
def data_missing(allow_in_pandas, dtype):
- # For NumPy <1.16, np.array([np.nan, (1,)]) raises
- # ValueError: setting an array element with a sequence.
if dtype.numpy_dtype == "object":
- if _np_version_under1p16:
- raise pytest.skip("Skipping for NumPy <1.16")
return PandasArray(np.array([np.nan, (1,)], dtype=object))
return PandasArray(np.array([np.nan, 1.0]))
@@ -354,6 +348,12 @@ def test_fillna_frame(self, data_missing):
# Non-scalar "scalar" values.
super().test_fillna_frame(data_missing)
+ @pytest.mark.skip("Invalid test")
+ def test_fillna_fill_other(self, data):
+ # inplace update doesn't work correctly with patched extension arrays
+ # extract_array returns PandasArray, while dtype is a numpy dtype
+ super().test_fillna_fill_other(data)
+
class TestReshaping(BaseNumPyTests, base.BaseReshapingTests):
@pytest.mark.skip("Incorrect parent test")
diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py
index b1eb276bfc227..817881e00fa99 100644
--- a/pandas/tests/extension/test_period.py
+++ b/pandas/tests/extension/test_period.py
@@ -126,9 +126,13 @@ def test_add_series_with_extension_array(self, data):
def test_error(self):
pass
- def test_direct_arith_with_series_returns_not_implemented(self, data):
+ @pytest.mark.parametrize("box", [pd.Series, pd.DataFrame])
+ def test_direct_arith_with_ndframe_returns_not_implemented(self, data, box):
# Override to use __sub__ instead of __add__
other = pd.Series(data)
+ if box is pd.DataFrame:
+ other = other.to_frame()
+
result = data.__sub__(other)
assert result is NotImplemented
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
index f318934ef5e52..d11cfd219a443 100644
--- a/pandas/tests/extension/test_sparse.py
+++ b/pandas/tests/extension/test_sparse.py
@@ -41,11 +41,6 @@ def data_for_twos(request):
return SparseArray(np.ones(100) * 2)
-@pytest.fixture(params=[0, np.nan])
-def data_zeros(request):
- return SparseArray(np.zeros(100, dtype=int), fill_value=request.param)
-
-
@pytest.fixture(params=[0, np.nan])
def data_missing(request):
"""Length 2 array with [NA, Valid]"""
@@ -321,6 +316,12 @@ def test_shift_0_periods(self, data):
data._sparse_values[0] = data._sparse_values[1]
assert result._sparse_values[0] != result._sparse_values[1]
+ @pytest.mark.parametrize("method", ["argmax", "argmin"])
+ def test_argmin_argmax_all_na(self, method, data, na_value):
+ # overriding because Sparse[int64, 0] cannot handle na_value
+ self._check_unsupported(data)
+ super().test_argmin_argmax_all_na(method, data, na_value)
+
@pytest.mark.parametrize("box", [pd.array, pd.Series, pd.DataFrame])
def test_equals(self, data, na_value, as_series, box):
self._check_unsupported(data)
diff --git a/pandas/tests/frame/apply/__init__.py b/pandas/tests/frame/apply/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/frame/apply/test_apply_relabeling.py b/pandas/tests/frame/apply/test_apply_relabeling.py
new file mode 100644
index 0000000000000..965f69753bdc7
--- /dev/null
+++ b/pandas/tests/frame/apply/test_apply_relabeling.py
@@ -0,0 +1,104 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestDataFrameNamedAggregate:
+ def test_agg_relabel(self):
+ # GH 26513
+ df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
+
+ # simplest case with one column, one func
+ result = df.agg(foo=("B", "sum"))
+ expected = pd.DataFrame({"B": [10]}, index=pd.Index(["foo"]))
+ tm.assert_frame_equal(result, expected)
+
+ # test on same column with different methods
+ result = df.agg(foo=("B", "sum"), bar=("B", "min"))
+ expected = pd.DataFrame({"B": [10, 1]}, index=pd.Index(["foo", "bar"]))
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_agg_relabel_multi_columns_multi_methods(self):
+ # GH 26513, test on multiple columns with multiple methods
+ df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
+ result = df.agg(
+ foo=("A", "sum"),
+ bar=("B", "mean"),
+ cat=("A", "min"),
+ dat=("B", "max"),
+ f=("A", "max"),
+ g=("C", "min"),
+ )
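+ # each named aggregation fills only its own column; all other cells are NaN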
+ expected = pd.DataFrame(
+ {
+ "A": [6.0, np.nan, 1.0, np.nan, 2.0, np.nan],
+ "B": [np.nan, 2.5, np.nan, 4.0, np.nan, np.nan],
+ "C": [np.nan, np.nan, np.nan, np.nan, np.nan, 3.0],
+ },
+ index=pd.Index(["foo", "bar", "cat", "dat", "f", "g"]),
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_agg_relabel_partial_functions(self):
+ # GH 26513, test on partial, functools or more complex cases
+ df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4], "C": [3, 4, 5, 6]})
+ result = df.agg(foo=("A", np.mean), bar=("A", "mean"), cat=("A", min))
+ expected = pd.DataFrame(
+ {"A": [1.5, 1.5, 1.0]}, index=pd.Index(["foo", "bar", "cat"])
+ )
+ tm.assert_frame_equal(result, expected)
+
+ result = df.agg(
+ foo=("A", min),
+ bar=("A", np.min),
+ cat=("B", max),
+ dat=("C", "min"),
+ f=("B", np.sum),
+ kk=("B", lambda x: min(x)),
+ )
+ expected = pd.DataFrame(
+ {
+ "A": [1.0, 1.0, np.nan, np.nan, np.nan, np.nan],
+ "B": [np.nan, np.nan, 4.0, np.nan, 10.0, 1.0],
+ "C": [np.nan, np.nan, np.nan, 3.0, np.nan, np.nan],
+ },
+ index=pd.Index(["foo", "bar", "cat", "dat", "f", "kk"]),
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_agg_namedtuple(self):
+ # GH 26513
+ df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
+ result = df.agg(
+ foo=pd.NamedAgg("B", "sum"),
+ bar=pd.NamedAgg("B", min),
+ cat=pd.NamedAgg(column="B", aggfunc="count"),
+ fft=pd.NamedAgg("B", aggfunc="max"),
+ )
+
+ expected = pd.DataFrame(
+ {"B": [3, 1, 2, 2]}, index=pd.Index(["foo", "bar", "cat", "fft"])
+ )
+ tm.assert_frame_equal(result, expected)
+
+ result = df.agg(
+ foo=pd.NamedAgg("A", "min"),
+ bar=pd.NamedAgg(column="B", aggfunc="max"),
+ cat=pd.NamedAgg(column="A", aggfunc="max"),
+ )
+ expected = pd.DataFrame(
+ {"A": [0.0, np.nan, 1.0], "B": [np.nan, 2.0, np.nan]},
+ index=pd.Index(["foo", "bar", "cat"]),
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_agg_raises(self):
+ # GH 26513
+ df = pd.DataFrame({"A": [0, 1], "B": [1, 2]})
+ msg = "Must provide"
+
+ with pytest.raises(TypeError, match=msg):
+ df.agg()
diff --git a/pandas/tests/frame/test_apply.py b/pandas/tests/frame/apply/test_frame_apply.py
similarity index 93%
rename from pandas/tests/frame/test_apply.py
rename to pandas/tests/frame/apply/test_frame_apply.py
index 8f0d3d9fbc734..e25b681c8c7c3 100644
--- a/pandas/tests/frame/test_apply.py
+++ b/pandas/tests/frame/apply/test_frame_apply.py
@@ -1,7 +1,6 @@
from collections import OrderedDict
from datetime import datetime
from itertools import chain
-import operator
import warnings
import numpy as np
@@ -14,6 +13,7 @@
import pandas._testing as tm
from pandas.core.apply import frame_apply
from pandas.core.base import SpecificationError
+from pandas.tests.frame.common import zip_frames
@pytest.fixture
@@ -630,6 +630,22 @@ def test_applymap(self, float_frame):
result = frame.applymap(func)
tm.assert_frame_equal(result, frame)
+ def test_applymap_na_ignore(self, float_frame):
+ # GH 23803
+ strlen_frame = float_frame.applymap(lambda x: len(str(x)))
+ float_frame_with_na = float_frame.copy()
+ mask = np.random.randint(0, 2, size=float_frame.shape, dtype=bool)
+ float_frame_with_na[mask] = pd.NA
+ strlen_frame_na_ignore = float_frame_with_na.applymap(
+ lambda x: len(str(x)), na_action="ignore"
+ )
+ strlen_frame_with_na = strlen_frame.copy()
+ strlen_frame_with_na[mask] = pd.NA
+ tm.assert_frame_equal(strlen_frame_na_ignore, strlen_frame_with_na)
+
+ with pytest.raises(ValueError, match="na_action must be .*Got 'abc'"):
+ float_frame_with_na.applymap(lambda x: len(str(x)), na_action="abc")
+
def test_applymap_box_timestamps(self):
# GH 2689, GH 2627
ser = pd.Series(date_range("1/1/2000", periods=10))
@@ -793,6 +809,18 @@ def test_apply_with_byte_string(self):
result = df.apply(lambda x: x.astype("object"))
tm.assert_frame_equal(result, expected)
+ @pytest.mark.parametrize("val", ["asd", 12, None, np.NaN])
+ def test_apply_category_equalness(self, val):
+ # Check if categorical comparisons on apply, GH 21239
+ df_values = ["asd", None, 12, "asd", "cde", np.NaN]
+ df = pd.DataFrame({"a": df_values}, dtype="category")
+
+ result = df.a.apply(lambda x: x == val)
+ expected = pd.Series(
+ [np.NaN if pd.isnull(x) else x == val for x in df_values], name="a"
+ )
+ tm.assert_series_equal(result, expected)
+
class TestInferOutputShape:
# the user has supplied an opaque UDF where
@@ -1046,25 +1074,6 @@ def test_consistency_for_boxed(self, box, int_frame_const_col):
tm.assert_frame_equal(result, expected)
-def zip_frames(frames, axis=1):
- """
- take a list of frames, zip them together under the
- assumption that these all have the first frames' index/columns.
-
- Returns
- -------
- new_frame : DataFrame
- """
- if axis == 1:
- columns = frames[0].columns
- zipped = [f.loc[:, c] for c in columns for f in frames]
- return pd.concat(zipped, axis=1)
- else:
- index = frames[0].index
- zipped = [f.loc[i, :] for i in index for f in frames]
- return pd.DataFrame(zipped)
-
-
class TestDataFrameAggregate:
def test_agg_transform(self, axis, float_frame):
other_axis = 1 if axis in {0, "index"} else 0
@@ -1075,16 +1084,10 @@ def test_agg_transform(self, axis, float_frame):
f_sqrt = np.sqrt(float_frame)
# ufunc
- result = float_frame.transform(np.sqrt, axis=axis)
expected = f_sqrt.copy()
- tm.assert_frame_equal(result, expected)
-
result = float_frame.apply(np.sqrt, axis=axis)
tm.assert_frame_equal(result, expected)
- result = float_frame.transform(np.sqrt, axis=axis)
- tm.assert_frame_equal(result, expected)
-
# list-like
result = float_frame.apply([np.sqrt], axis=axis)
expected = f_sqrt.copy()
@@ -1098,9 +1101,6 @@ def test_agg_transform(self, axis, float_frame):
)
tm.assert_frame_equal(result, expected)
- result = float_frame.transform([np.sqrt], axis=axis)
- tm.assert_frame_equal(result, expected)
-
# multiple items in list
# these are in the order as if we are applying both
# functions per series and then concatting
@@ -1116,38 +1116,19 @@ def test_agg_transform(self, axis, float_frame):
)
tm.assert_frame_equal(result, expected)
- result = float_frame.transform([np.abs, "sqrt"], axis=axis)
- tm.assert_frame_equal(result, expected)
-
def test_transform_and_agg_err(self, axis, float_frame):
# cannot both transform and agg
- msg = "transforms cannot produce aggregated results"
- with pytest.raises(ValueError, match=msg):
- float_frame.transform(["max", "min"], axis=axis)
-
msg = "cannot combine transform and aggregation operations"
with pytest.raises(ValueError, match=msg):
with np.errstate(all="ignore"):
float_frame.agg(["max", "sqrt"], axis=axis)
- with pytest.raises(ValueError, match=msg):
- with np.errstate(all="ignore"):
- float_frame.transform(["max", "sqrt"], axis=axis)
-
df = pd.DataFrame({"A": range(5), "B": 5})
def f():
with np.errstate(all="ignore"):
df.agg({"A": ["abs", "sum"], "B": ["mean", "max"]}, axis=axis)
- @pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"])
- def test_transform_method_name(self, method):
- # GH 19760
- df = pd.DataFrame({"A": [-1, 2]})
- result = df.transform(method)
- expected = operator.methodcaller(method)(df)
- tm.assert_frame_equal(result, expected)
-
def test_demo(self):
# demonstration tests
df = pd.DataFrame({"A": range(5), "B": 5})
@@ -1166,6 +1147,21 @@ def test_demo(self):
)
tm.assert_frame_equal(result.reindex_like(expected), expected)
+ def test_agg_with_name_as_column_name(self):
+ # GH 36212 - Column name is "name"
+ data = {"name": ["foo", "bar"]}
+ df = pd.DataFrame(data)
+
+ # result's name should be None
+ result = df.agg({"name": "count"})
+ expected = pd.Series({"name": 2})
+ tm.assert_series_equal(result, expected)
+
+ # Check if name is still preserved when aggregating series instead
+ result = df["name"].agg({"name": "count"})
+ expected = pd.Series({"name": 2}, name="name")
+ tm.assert_series_equal(result, expected)
+
def test_agg_multiple_mixed_no_warning(self):
# GH 20909
mdf = pd.DataFrame(
@@ -1501,3 +1497,51 @@ def test_consistency_of_aggregates_of_columns_with_missing_values(self, df, meth
tm.assert_series_equal(
none_in_first_column_result, none_in_second_column_result
)
+
+ @pytest.mark.parametrize("col", [1, 1.0, True, "a", np.nan])
+ def test_apply_dtype(self, col):
+ # GH 31466
+ df = pd.DataFrame([[1.0, col]], columns=["a", "b"])
+ result = df.apply(lambda x: x.dtype)
+ expected = df.dtypes
+
+ tm.assert_series_equal(result, expected)
+
+
+def test_apply_mutating():
+ # GH#35462 case where applied func pins a new BlockManager to a row
+ df = pd.DataFrame({"a": range(100), "b": range(100, 200)})
+
+ def func(row):
+ mgr = row._mgr
+ row.loc["a"] += 1
+ assert row._mgr is not mgr
+ return row
+
+ expected = df.copy()
+ expected["a"] += 1
+
+ result = df.apply(func, axis=1)
+
+ tm.assert_frame_equal(result, expected)
+ tm.assert_frame_equal(df, result)
+
+
+def test_apply_empty_list_reduce():
+ # GH#35683 get columns correct
+ df = pd.DataFrame([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]], columns=["a", "b"])
+
+ result = df.apply(lambda x: [], result_type="reduce")
+ expected = pd.Series({"a": [], "b": []}, dtype=object)
+ tm.assert_series_equal(result, expected)
+
+
+def test_apply_no_suffix_index():
+ # GH36189
+ pdf = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
+ result = pdf.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
+ expected = pd.DataFrame(
+ {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""]
+ )
+
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/apply/test_frame_transform.py b/pandas/tests/frame/apply/test_frame_transform.py
new file mode 100644
index 0000000000000..346e60954fc13
--- /dev/null
+++ b/pandas/tests/frame/apply/test_frame_transform.py
@@ -0,0 +1,203 @@
+import operator
+import re
+
+import numpy as np
+import pytest
+
+from pandas import DataFrame, MultiIndex
+import pandas._testing as tm
+from pandas.core.base import SpecificationError
+from pandas.core.groupby.base import transformation_kernels
+from pandas.tests.frame.common import zip_frames
+
+
+def test_transform_ufunc(axis, float_frame):
+ # GH 35964
+ with np.errstate(all="ignore"):
+ f_sqrt = np.sqrt(float_frame)
+ result = float_frame.transform(np.sqrt, axis=axis)
+ expected = f_sqrt
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", transformation_kernels)
+def test_transform_groupby_kernel(axis, float_frame, op):
+ # GH 35964
+ if op == "cumcount":
+ pytest.xfail("DataFrame.cumcount does not exist")
+ if op == "tshift":
+ pytest.xfail("Only works on time index and is deprecated")
+ if axis == 1 or axis == "columns":
+ pytest.xfail("GH 36308: groupby.transform with axis=1 is broken")
+
+ args = [0.0] if op == "fillna" else []
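+ # an all-ones grouping key puts every row (or column) in a single group,
+ # so the groupby transform should match frame.transform directly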
+ if axis == 0 or axis == "index":
+ ones = np.ones(float_frame.shape[0])
+ else:
+ ones = np.ones(float_frame.shape[1])
+ expected = float_frame.groupby(ones, axis=axis).transform(op, *args)
+ result = float_frame.transform(op, axis, *args)
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])]
+)
+def test_transform_list(axis, float_frame, ops, names):
+ # GH 35964
+ other_axis = 1 if axis in {0, "index"} else 0
+ with np.errstate(all="ignore"):
+ expected = zip_frames([op(float_frame) for op in ops], axis=other_axis)
+ if axis in {0, "index"}:
+ expected.columns = MultiIndex.from_product([float_frame.columns, names])
+ else:
+ expected.index = MultiIndex.from_product([float_frame.index, names])
+ result = float_frame.transform(ops, axis=axis)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_transform_dict(axis, float_frame):
+ # GH 35964
+ if axis == 0 or axis == "index":
+ e = float_frame.columns[0]
+ expected = float_frame[[e]].transform(np.abs)
+ else:
+ e = float_frame.index[0]
+ expected = float_frame.iloc[[0]].transform(np.abs)
+ result = float_frame.transform({e: np.abs}, axis=axis)
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("use_apply", [True, False])
+def test_transform_udf(axis, float_frame, use_apply):
+ # GH 35964
+ # transform uses UDF either via apply or passing the entire DataFrame
+ def func(x):
+ # transform is using apply iff x is not a DataFrame
+ if use_apply == isinstance(x, DataFrame):
+ # Force transform to fallback
+ raise ValueError
+ return x + 1
+
+ result = float_frame.transform(func, axis=axis)
+ expected = float_frame + 1
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("method", ["abs", "shift", "pct_change", "cumsum", "rank"])
+def test_transform_method_name(method):
+ # GH 19760
+ df = DataFrame({"A": [-1, 2]})
+ result = df.transform(method)
+ expected = operator.methodcaller(method)(df)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_transform_and_agg_err(axis, float_frame):
+ # GH 35964
+ # cannot both transform and agg
+ msg = "Function did not transform"
+ with pytest.raises(ValueError, match=msg):
+ float_frame.transform(["max", "min"], axis=axis)
+
+ msg = "Function did not transform"
+ with pytest.raises(ValueError, match=msg):
+ float_frame.transform(["max", "sqrt"], axis=axis)
+
+
+def test_agg_dict_nested_renaming_depr():
+ df = DataFrame({"A": range(5), "B": 5})
+
+ # nested renaming
+ msg = r"nested renamer is not supported"
+ with pytest.raises(SpecificationError, match=msg):
+ # mypy identifies the argument as an invalid type
+ df.transform({"A": {"foo": "min"}, "B": {"bar": "max"}})
+
+
+def test_transform_reducer_raises(all_reductions):
+ # GH 35964
+ op = all_reductions
+ df = DataFrame({"A": [1, 2, 3]})
+ msg = "Function did not transform"
+ with pytest.raises(ValueError, match=msg):
+ df.transform(op)
+ with pytest.raises(ValueError, match=msg):
+ df.transform([op])
+ with pytest.raises(ValueError, match=msg):
+ df.transform({"A": op})
+ with pytest.raises(ValueError, match=msg):
+ df.transform({"A": [op]})
+
+
+# mypy doesn't allow adding lists of different types
+# https://github.com/python/mypy/issues/5492
+@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1])
+def test_transform_bad_dtype(op):
+ # GH 35964
+ df = DataFrame({"A": 3 * [object]}) # DataFrame that will fail on most transforms
+ if op in ("backfill", "shift", "pad", "bfill", "ffill"):
+ pytest.xfail("Transform function works on any datatype")
+ msg = "Transform function failed"
+ with pytest.raises(ValueError, match=msg):
+ df.transform(op)
+ with pytest.raises(ValueError, match=msg):
+ df.transform([op])
+ with pytest.raises(ValueError, match=msg):
+ df.transform({"A": op})
+ with pytest.raises(ValueError, match=msg):
+ df.transform({"A": [op]})
+
+
+@pytest.mark.parametrize("op", transformation_kernels)
+def test_transform_partial_failure(op):
+ # GH 35964
+ wont_fail = ["ffill", "bfill", "fillna", "pad", "backfill", "shift"]
+ if op in wont_fail:
+ pytest.xfail("Transform kernel is successful on all dtypes")
+ if op == "cumcount":
+ pytest.xfail("transform('cumcount') not implemented")
+ if op == "tshift":
+ pytest.xfail("Only works on time index; deprecated")
+
+ # Using object makes most transform kernels fail
+ df = DataFrame({"A": 3 * [object], "B": [1, 2, 3]})
+
+ expected = df[["B"]].transform([op])
+ result = df.transform([op])
+ tm.assert_equal(result, expected)
+
+ expected = df[["B"]].transform({"B": op})
+ result = df.transform({"B": op})
+ tm.assert_equal(result, expected)
+
+ expected = df[["B"]].transform({"B": [op]})
+ result = df.transform({"B": [op]})
+ tm.assert_equal(result, expected)
+
+
+@pytest.mark.parametrize("use_apply", [True, False])
+def test_transform_passes_args(use_apply):
+ # GH 35964
+ # transform uses UDF either via apply or passing the entire DataFrame
+ expected_args = [1, 2]
+ expected_kwargs = {"c": 3}
+
+ def f(x, a, b, c):
+ # transform is using apply iff x is not a DataFrame
+ if use_apply == isinstance(x, DataFrame):
+ # Force transform to fallback
+ raise ValueError
+ assert [a, b] == expected_args
+ assert c == expected_kwargs["c"]
+ return x
+
+ DataFrame([1]).transform(f, 0, *expected_args, **expected_kwargs)
+
+
+def test_transform_missing_columns(axis):
+ # GH 35964
+ df = DataFrame({"A": [1, 2], "B": [3, 4]})
+ match = re.escape("Column(s) ['C'] do not exist")
+ with pytest.raises(SpecificationError, match=match):
+ df.transform({"C": "cumsum"})
diff --git a/pandas/tests/frame/common.py b/pandas/tests/frame/common.py
index 463a140972ab5..73e60ff389038 100644
--- a/pandas/tests/frame/common.py
+++ b/pandas/tests/frame/common.py
@@ -1,3 +1,8 @@
+from typing import List
+
+from pandas import DataFrame, concat
+
+
def _check_mixed_float(df, dtype=None):
# float16 are most likely to be upcasted to float32
dtypes = dict(A="float32", B="float32", C="float16", D="float64")
@@ -29,3 +34,22 @@ def _check_mixed_int(df, dtype=None):
assert df.dtypes["C"] == dtypes["C"]
if dtypes.get("D"):
assert df.dtypes["D"] == dtypes["D"]
+
+
+def zip_frames(frames: List[DataFrame], axis: int = 1) -> DataFrame:
+ """
+ take a list of frames, zip them together under the
+ assumption that these all have the first frames' index/columns.
+
+ Returns
+ -------
+ new_frame : DataFrame
+ """
+ if axis == 1:
+ columns = frames[0].columns
+ zipped = [f.loc[:, c] for c in columns for f in frames]
+ return concat(zipped, axis=1)
+ else:
+ index = frames[0].index
+ zipped = [f.loc[i, :] for i in index for f in frames]
+ return DataFrame(zipped)
diff --git a/pandas/tests/frame/indexing/test_categorical.py b/pandas/tests/frame/indexing/test_categorical.py
index d94dc8d2ffe00..314de5bdd8146 100644
--- a/pandas/tests/frame/indexing/test_categorical.py
+++ b/pandas/tests/frame/indexing/test_categorical.py
@@ -326,7 +326,10 @@ def test_assigning_ops(self):
df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
exp_fancy = exp_multi_row.copy()
- exp_fancy["cats"].cat.set_categories(["a", "b", "c"], inplace=True)
+ return_value = exp_fancy["cats"].cat.set_categories(
+ ["a", "b", "c"], inplace=True
+ )
+ assert return_value is None
df[df["cats"] == "c"] = ["b", 2]
# category c is kept in .categories
@@ -391,3 +394,14 @@ def test_loc_indexing_preserves_index_category_dtype(self):
result = df.loc[["a"]].index.levels[0]
tm.assert_index_equal(result, expected)
+
+ def test_categorical_filtering(self):
+ # GH22609 Verify filtering operations on DataFrames with categorical Series
+ df = pd.DataFrame(data=[[0, 0], [1, 1]], columns=["a", "b"])
+ df["b"] = df.b.astype("category")
+
+ result = df.where(df.a > 0)
+ expected = df.copy()
+ expected.loc[0, :] = np.nan
+
+ tm.assert_equal(result, expected)
diff --git a/pandas/tests/frame/indexing/test_datetime.py b/pandas/tests/frame/indexing/test_datetime.py
index 1937a4c380dc9..1866ac341def6 100644
--- a/pandas/tests/frame/indexing/test_datetime.py
+++ b/pandas/tests/frame/indexing/test_datetime.py
@@ -23,7 +23,9 @@ def test_setitem(self, timezone_frame):
b1 = df._mgr.blocks[1]
b2 = df._mgr.blocks[2]
tm.assert_extension_array_equal(b1.values, b2.values)
- assert id(b1.values._data.base) != id(b2.values._data.base)
+ b1base = b1.values._data.base
+ b2base = b2.values._data.base
+ assert b1base is None or (id(b1base) != id(b2base))
# with nan
df2 = df.copy()
diff --git a/pandas/tests/frame/indexing/test_indexing.py b/pandas/tests/frame/indexing/test_indexing.py
index 3865ea64ee479..b947be705a329 100644
--- a/pandas/tests/frame/indexing/test_indexing.py
+++ b/pandas/tests/frame/indexing/test_indexing.py
@@ -21,7 +21,6 @@
notna,
)
import pandas._testing as tm
-from pandas.arrays import SparseArray
import pandas.core.common as com
from pandas.core.indexing import IndexingError
@@ -161,10 +160,13 @@ def test_setitem_list(self, float_frame):
msg = "Columns must be same length as key"
with pytest.raises(ValueError, match=msg):
data[["A"]] = float_frame[["A", "B"]]
-
- msg = "Length of values does not match length of index"
+ newcolumndata = range(len(data.index) - 1)
+ msg = (
+ rf"Length of values \({len(newcolumndata)}\) "
+ rf"does not match length of index \({len(data)}\)"
+ )
with pytest.raises(ValueError, match=msg):
- data["A"] = range(len(data.index) - 1)
+ data["A"] = newcolumndata
df = DataFrame(0, index=range(3), columns=["tt1", "tt2"], dtype=np.int_)
df.loc[1, ["tt1", "tt2"]] = [1, 2]
@@ -1338,7 +1340,8 @@ def test_lookup_float(self, float_frame):
df = float_frame
rows = list(df.index) * len(df.columns)
cols = list(df.columns) * len(df.index)
- result = df.lookup(rows, cols)
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.lookup(rows, cols)
expected = np.array([df.loc[r, c] for r, c in zip(rows, cols)])
tm.assert_numpy_array_equal(result, expected)
@@ -1347,7 +1350,8 @@ def test_lookup_mixed(self, float_string_frame):
df = float_string_frame
rows = list(df.index) * len(df.columns)
cols = list(df.columns) * len(df.index)
- result = df.lookup(rows, cols)
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.lookup(rows, cols)
expected = np.array(
[df.loc[r, c] for r, c in zip(rows, cols)], dtype=np.object_
@@ -1363,7 +1367,8 @@ def test_lookup_bool(self):
"mask_c": [False, True, False, True],
}
)
- df["mask"] = df.lookup(df.index, "mask_" + df["label"])
+ with tm.assert_produces_warning(FutureWarning):
+ df["mask"] = df.lookup(df.index, "mask_" + df["label"])
exp_mask = np.array(
[df.loc[r, c] for r, c in zip(df.index, "mask_" + df["label"])]
@@ -1374,13 +1379,16 @@ def test_lookup_bool(self):
def test_lookup_raises(self, float_frame):
with pytest.raises(KeyError, match="'One or more row labels was not found'"):
- float_frame.lookup(["xyz"], ["A"])
+ with tm.assert_produces_warning(FutureWarning):
+ float_frame.lookup(["xyz"], ["A"])
with pytest.raises(KeyError, match="'One or more column labels was not found'"):
- float_frame.lookup([float_frame.index[0]], ["xyz"])
+ with tm.assert_produces_warning(FutureWarning):
+ float_frame.lookup([float_frame.index[0]], ["xyz"])
with pytest.raises(ValueError, match="same size"):
- float_frame.lookup(["a", "b", "c"], ["a"])
+ with tm.assert_produces_warning(FutureWarning):
+ float_frame.lookup(["a", "b", "c"], ["a"])
def test_lookup_requires_unique_axes(self):
# GH#33041 raise with a helpful error message
@@ -1391,14 +1399,17 @@ def test_lookup_requires_unique_axes(self):
# homogeneous-dtype case
with pytest.raises(ValueError, match="requires unique index and columns"):
- df.lookup(rows, cols)
+ with tm.assert_produces_warning(FutureWarning):
+ df.lookup(rows, cols)
with pytest.raises(ValueError, match="requires unique index and columns"):
- df.T.lookup(cols, rows)
+ with tm.assert_produces_warning(FutureWarning):
+ df.T.lookup(cols, rows)
# heterogeneous dtype
df["B"] = 0
with pytest.raises(ValueError, match="requires unique index and columns"):
- df.lookup(rows, cols)
+ with tm.assert_produces_warning(FutureWarning):
+ df.lookup(rows, cols)
def test_set_value(self, float_frame):
for idx in float_frame.index:
@@ -1907,20 +1918,6 @@ def test_getitem_ix_float_duplicates(self):
expect = df.iloc[[1, -1], 0]
tm.assert_series_equal(df.loc[0.2, "a"], expect)
- def test_getitem_sparse_column(self):
- # https://github.com/pandas-dev/pandas/issues/23559
- data = SparseArray([0, 1])
- df = pd.DataFrame({"A": data})
- expected = pd.Series(data, name="A")
- result = df["A"]
- tm.assert_series_equal(result, expected)
-
- result = df.iloc[:, 0]
- tm.assert_series_equal(result, expected)
-
- result = df.loc[:, "A"]
- tm.assert_series_equal(result, expected)
-
def test_setitem_with_unaligned_tz_aware_datetime_column(self):
# GH 12981
# Assignment of unaligned offset-aware datetime series.
@@ -2123,7 +2120,7 @@ def test_type_error_multiindex(self):
)
dg = df.pivot_table(index="i", columns="c", values=["x", "y"])
- with pytest.raises(TypeError, match="is an invalid key"):
+ with pytest.raises(TypeError, match="unhashable type"):
dg[:, 0]
index = Index(range(2), name="i")
@@ -2244,3 +2241,12 @@ def test_object_casting_indexing_wraps_datetimelike():
assert blk.dtype == "m8[ns]" # we got the right block
val = blk.iget((0, 0))
assert isinstance(val, pd.Timedelta)
+
+
+def test_lookup_deprecated():
+ # GH18262
+ df = pd.DataFrame(
+ {"col": ["A", "A", "B", "B"], "A": [80, 23, np.nan, 22], "B": [80, 55, 76, 67]}
+ )
+ with tm.assert_produces_warning(FutureWarning):
+ df.lookup(df.index, df["col"])
diff --git a/pandas/tests/frame/indexing/test_mask.py b/pandas/tests/frame/indexing/test_mask.py
index 30db6110efc80..23f3a18881782 100644
--- a/pandas/tests/frame/indexing/test_mask.py
+++ b/pandas/tests/frame/indexing/test_mask.py
@@ -36,12 +36,14 @@ def test_mask_inplace(self):
rdf = df.copy()
- rdf.where(cond, inplace=True)
+ return_value = rdf.where(cond, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(rdf, df.where(cond))
tm.assert_frame_equal(rdf, df.mask(~cond))
rdf = df.copy()
- rdf.where(cond, -df, inplace=True)
+ return_value = rdf.where(cond, -df, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(rdf, df.where(cond, -df))
tm.assert_frame_equal(rdf, df.mask(~cond, -df))
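Many hunks in this patch repeat one mechanical pattern: capture the result of an inplace call and assert it is None. A minimal illustrative sketch of the convention being locked in:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 2.0]})
ret = df.mask(df > 1.5, inplace=True)  # replaces entries where the condition is True
assert ret is None  # inplace=True mutates df and returns None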
diff --git a/pandas/tests/frame/indexing/test_setitem.py b/pandas/tests/frame/indexing/test_setitem.py
index 8fcdae95fbab5..8313ab0b99bac 100644
--- a/pandas/tests/frame/indexing/test_setitem.py
+++ b/pandas/tests/frame/indexing/test_setitem.py
@@ -1,7 +1,18 @@
import numpy as np
import pytest
-from pandas import Categorical, DataFrame, Index, Series, Timestamp, date_range
+from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype
+
+from pandas import (
+ Categorical,
+ DataFrame,
+ Index,
+ Interval,
+ Period,
+ Series,
+ Timestamp,
+ date_range,
+)
import pandas._testing as tm
from pandas.core.arrays import SparseArray
@@ -97,7 +108,7 @@ def test_setitem_timestamp_empty_columns(self):
df["now"] = Timestamp("20130101", tz="UTC")
expected = DataFrame(
- [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"],
+ [[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
)
tm.assert_frame_equal(df, expected)
@@ -106,7 +117,10 @@ def test_setitem_wrong_length_categorical_dtype_raises(self):
cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
df = DataFrame(range(10), columns=["bar"])
- msg = "Length of values does not match length of index"
+ msg = (
+ rf"Length of values \({len(cat)}\) "
+ rf"does not match length of index \({len(df)}\)"
+ )
with pytest.raises(ValueError, match=msg):
df["foo"] = cat
@@ -144,9 +158,25 @@ def test_setitem_dict_preserves_dtypes(self):
}
)
for idx, b in enumerate([1, 2, 3]):
- df.loc[df.shape[0]] = {
- "a": int(idx),
- "b": float(b),
- "c": float(b),
- }
+ df.loc[df.shape[0]] = {"a": int(idx), "b": float(b), "c": float(b)}
+ tm.assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize(
+ "obj,dtype",
+ [
+ (Period("2020-01"), PeriodDtype("M")),
+ (Interval(left=0, right=5), IntervalDtype("int64")),
+ (
+ Timestamp("2011-01-01", tz="US/Eastern"),
+ DatetimeTZDtype(tz="US/Eastern"),
+ ),
+ ],
+ )
+ def test_setitem_extension_types(self, obj, dtype):
+ # GH: 34832
+ expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)})
+
+ df = DataFrame({"idx": [1, 2, 3]})
+ df["obj"] = obj
+
tm.assert_frame_equal(df, expected)
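For reference, a minimal sketch of the broadcast behavior test_setitem_extension_types pins down: assigning a single extension-type scalar to a new column repeats it for every row and keeps the extension dtype (illustrative, assuming the pandas version this patch targets):

import pandas as pd

df = pd.DataFrame({"idx": [1, 2, 3]})
df["p"] = pd.Period("2020-01")  # scalar is broadcast to all rows
print(df["p"].dtype)  # period[M], not object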
diff --git a/pandas/tests/frame/indexing/test_sparse.py b/pandas/tests/frame/indexing/test_sparse.py
new file mode 100644
index 0000000000000..04e1c8b94c4d9
--- /dev/null
+++ b/pandas/tests/frame/indexing/test_sparse.py
@@ -0,0 +1,71 @@
+import numpy as np
+import pytest
+
+import pandas.util._test_decorators as td
+
+import pandas as pd
+import pandas._testing as tm
+from pandas.arrays import SparseArray
+from pandas.core.arrays.sparse import SparseDtype
+
+
+class TestSparseDataFrameIndexing:
+ def test_getitem_sparse_column(self):
+ # https://github.com/pandas-dev/pandas/issues/23559
+ data = SparseArray([0, 1])
+ df = pd.DataFrame({"A": data})
+ expected = pd.Series(data, name="A")
+ result = df["A"]
+ tm.assert_series_equal(result, expected)
+
+ result = df.iloc[:, 0]
+ tm.assert_series_equal(result, expected)
+
+ result = df.loc[:, "A"]
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize("spmatrix_t", ["coo_matrix", "csc_matrix", "csr_matrix"])
+ @pytest.mark.parametrize("dtype", [np.int64, np.float64, complex])
+ @td.skip_if_no_scipy
+ def test_locindexer_from_spmatrix(self, spmatrix_t, dtype):
+ import scipy.sparse
+
+ spmatrix_t = getattr(scipy.sparse, spmatrix_t)
+
+ # The bug is triggered by a sparse matrix with purely sparse columns. So the
+ # recipe below generates a rectangular matrix of dimension (5, 7) where all the
+ # diagonal cells are ones, meaning the last two columns are purely sparse.
+ rows, cols = 5, 7
+ spmatrix = spmatrix_t(np.eye(rows, cols, dtype=dtype), dtype=dtype)
+ df = pd.DataFrame.sparse.from_spmatrix(spmatrix)
+
+ # regression test for #34526
+ itr_idx = range(2, rows)
+ result = df.loc[itr_idx].values
+ expected = spmatrix.toarray()[itr_idx]
+ tm.assert_numpy_array_equal(result, expected)
+
+ # regression test for #34540
+ result = df.loc[itr_idx].dtypes.values
+ expected = np.full(cols, SparseDtype(dtype, fill_value=0))
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_reindex(self):
+ # https://github.com/pandas-dev/pandas/issues/35286
+ df = pd.DataFrame(
+ {"A": [0, 1], "B": pd.array([0, 1], dtype=pd.SparseDtype("int64", 0))}
+ )
+ result = df.reindex([0, 2])
+ expected = pd.DataFrame(
+ {
+ "A": [0.0, np.nan],
+ "B": pd.array([0.0, np.nan], dtype=pd.SparseDtype("float64", 0.0)),
+ },
+ index=[0, 2],
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_all_sparse(self):
+ df = pd.DataFrame({"A": pd.array([0, 0], dtype=pd.SparseDtype("int64"))})
+ result = df.loc[[0, 1]]
+ tm.assert_frame_equal(result, df)
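For reference, a minimal sketch of the DataFrame.sparse.from_spmatrix entry point exercised above, assuming scipy is available:

import numpy as np
import pandas as pd
from scipy import sparse

mat = sparse.csr_matrix(np.eye(3, 4))  # the last column holds no stored values
df = pd.DataFrame.sparse.from_spmatrix(mat)
print(df.dtypes)  # every column comes back as Sparse[float64, 0]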
diff --git a/pandas/tests/frame/indexing/test_where.py b/pandas/tests/frame/indexing/test_where.py
index 24eb424bd5735..d114a3178b686 100644
--- a/pandas/tests/frame/indexing/test_where.py
+++ b/pandas/tests/frame/indexing/test_where.py
@@ -162,7 +162,8 @@ def _check_set(df, cond, check_dtypes=True):
econd = cond.reindex_like(df).fillna(True)
expected = dfi.mask(~econd)
- dfi.where(cond, np.nan, inplace=True)
+ return_value = dfi.where(cond, np.nan, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(dfi, expected)
# dtypes (and confirm upcasts)
@@ -303,7 +304,8 @@ def test_where_bug(self):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(result > 2, np.nan, inplace=True)
+ return_value = result.where(result > 2, np.nan, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_bug_mixed(self, sint_dtype):
@@ -324,7 +326,8 @@ def test_where_bug_mixed(self, sint_dtype):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(result > 2, np.nan, inplace=True)
+ return_value = result.where(result > 2, np.nan, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_bug_transposition(self):
@@ -417,7 +420,8 @@ def create():
result = df.where(pd.notna(df), df.mean(), axis="columns")
tm.assert_frame_equal(result, expected)
- df.where(pd.notna(df), df.mean(), inplace=True, axis="columns")
+ return_value = df.where(pd.notna(df), df.mean(), inplace=True, axis="columns")
+ assert return_value is None
tm.assert_frame_equal(df, expected)
df = create().fillna(0)
@@ -453,7 +457,8 @@ def test_where_axis(self):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, s, axis="index", inplace=True)
+ return_value = result.where(mask, s, axis="index", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
expected = DataFrame([[0, 1], [0, 1]], dtype="float64")
@@ -461,7 +466,8 @@ def test_where_axis(self):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, s, axis="columns", inplace=True)
+ return_value = result.where(mask, s, axis="columns", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
# Upcast needed
@@ -474,7 +480,8 @@ def test_where_axis(self):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, s, axis="index", inplace=True)
+ return_value = result.where(mask, s, axis="index", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
expected = DataFrame([[0, np.nan], [0, np.nan]])
@@ -488,7 +495,8 @@ def test_where_axis(self):
}
)
result = df.copy()
- result.where(mask, s, axis="columns", inplace=True)
+ return_value = result.where(mask, s, axis="columns", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
# Multiple dtypes (=> multiple Blocks)
@@ -511,7 +519,8 @@ def test_where_axis(self):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, s1, axis="columns", inplace=True)
+ return_value = result.where(mask, s1, axis="columns", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.where(mask, s2, axis="index")
@@ -521,7 +530,8 @@ def test_where_axis(self):
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, s2, axis="index", inplace=True)
+ return_value = result.where(mask, s2, axis="index", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
# DataFrame vs DataFrame
@@ -534,10 +544,12 @@ def test_where_axis(self):
result = df.where(mask, d1, axis="index")
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, d1, inplace=True)
+ return_value = result.where(mask, d1, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, d1, inplace=True, axis="index")
+ return_value = result.where(mask, d1, inplace=True, axis="index")
+ assert return_value is None
tm.assert_frame_equal(result, expected)
d2 = df.copy().drop(1, axis=1)
@@ -549,10 +561,12 @@ def test_where_axis(self):
result = df.where(mask, d2, axis="columns")
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, d2, inplace=True)
+ return_value = result.where(mask, d2, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.copy()
- result.where(mask, d2, inplace=True, axis="columns")
+ return_value = result.where(mask, d2, inplace=True, axis="columns")
+ assert return_value is None
tm.assert_frame_equal(result, expected)
def test_where_callable(self):
diff --git a/pandas/tests/frame/methods/test_align.py b/pandas/tests/frame/methods/test_align.py
index 5dae719283d17..d19b59debfdea 100644
--- a/pandas/tests/frame/methods/test_align.py
+++ b/pandas/tests/frame/methods/test_align.py
@@ -129,6 +129,39 @@ def test_align_mixed_int(self, mixed_int_frame):
)
tm.assert_index_equal(bf.index, Index([]))
+ @pytest.mark.parametrize(
+ "l_ordered,r_ordered,expected",
+ [
+ [True, True, pd.CategoricalIndex],
+ [True, False, pd.Index],
+ [False, True, pd.Index],
+ [False, False, pd.CategoricalIndex],
+ ],
+ )
+ def test_align_categorical(self, l_ordered, r_ordered, expected):
+ # GH-28397
+ df_1 = DataFrame(
+ {
+ "A": np.arange(6, dtype="int64"),
+ "B": Series(list("aabbca")).astype(
+ pd.CategoricalDtype(list("cab"), ordered=l_ordered)
+ ),
+ }
+ ).set_index("B")
+ df_2 = DataFrame(
+ {
+ "A": np.arange(5, dtype="int64"),
+ "B": Series(list("babca")).astype(
+ pd.CategoricalDtype(list("cab"), ordered=r_ordered)
+ ),
+ }
+ ).set_index("B")
+
+ aligned_1, aligned_2 = df_1.align(df_2)
+ assert isinstance(aligned_1.index, expected)
+ assert isinstance(aligned_2.index, expected)
+ tm.assert_index_equal(aligned_1.index, aligned_2.index)
+
def test_align_multiindex(self):
# GH#10665
# same test cases as test_align_multiindex in test_series.py
diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
index b0fd0496ea81e..d3f256259b15f 100644
--- a/pandas/tests/frame/methods/test_astype.py
+++ b/pandas/tests/frame/methods/test_astype.py
@@ -8,6 +8,7 @@
CategoricalDtype,
DataFrame,
DatetimeTZDtype,
+ Interval,
IntervalDtype,
NaT,
Series,
@@ -565,3 +566,24 @@ def test_astype_empty_dtype_dict(self):
result = df.astype(dict())
tm.assert_frame_equal(result, df)
assert result is not df
+
+ @pytest.mark.parametrize(
+ "df",
+ [
+ DataFrame(Series(["x", "y", "z"], dtype="string")),
+ DataFrame(Series(["x", "y", "z"], dtype="category")),
+ DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
+ DataFrame(Series(3 * [Interval(0, 1)])),
+ ],
+ )
+ @pytest.mark.parametrize("errors", ["raise", "ignore"])
+ def test_astype_ignores_errors_for_extension_dtypes(self, df, errors):
+ # https://github.com/pandas-dev/pandas/issues/35471
+ if errors == "ignore":
+ expected = df
+ result = df.astype(float, errors=errors)
+ tm.assert_frame_equal(result, expected)
+ else:
+ msg = "(Cannot cast)|(could not convert)"
+ with pytest.raises((ValueError, TypeError), match=msg):
+ df.astype(float, errors=errors)
diff --git a/pandas/tests/frame/methods/test_clip.py b/pandas/tests/frame/methods/test_clip.py
index 34727da3b95ae..ca62b56664518 100644
--- a/pandas/tests/frame/methods/test_clip.py
+++ b/pandas/tests/frame/methods/test_clip.py
@@ -22,7 +22,8 @@ def test_inplace_clip(self, float_frame):
median = float_frame.median().median()
frame_copy = float_frame.copy()
- frame_copy.clip(upper=median, lower=median, inplace=True)
+ return_value = frame_copy.clip(upper=median, lower=median, inplace=True)
+ assert return_value is None
assert not (frame_copy.values != median).any()
def test_dataframe_clip(self):
diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
index 7715cb1cb6eec..78f265d32f8df 100644
--- a/pandas/tests/frame/methods/test_combine_first.py
+++ b/pandas/tests/frame/methods/test_combine_first.py
@@ -199,12 +199,14 @@ def test_combine_first_timezone(self):
columns=["UTCdatetime", "abc"],
data=data1,
index=pd.date_range("20140627", periods=1),
+ dtype="object",
)
data2 = pd.to_datetime("20121212 12:12").tz_localize("UTC")
df2 = pd.DataFrame(
columns=["UTCdatetime", "xyz"],
data=data2,
index=pd.date_range("20140628", periods=1),
+ dtype="object",
)
res = df2[["UTCdatetime"]].combine_first(df1)
exp = pd.DataFrame(
@@ -217,10 +219,14 @@ def test_combine_first_timezone(self):
},
columns=["UTCdatetime", "abc"],
index=pd.date_range("20140627", periods=2, freq="D"),
+ dtype="object",
)
- tm.assert_frame_equal(res, exp)
assert res["UTCdatetime"].dtype == "datetime64[ns, UTC]"
assert res["abc"].dtype == "datetime64[ns, UTC]"
+ # Need to cast all to "object" because combine_first does not retain dtypes:

+ # GH Issue 7509
+ res = res.astype("object")
+ tm.assert_frame_equal(res, exp)
# see gh-10567
dts1 = pd.date_range("2015-01-01", "2015-01-05", tz="UTC")
diff --git a/pandas/tests/frame/methods/test_cov_corr.py b/pandas/tests/frame/methods/test_cov_corr.py
index d3548b639572d..f307acd8c2178 100644
--- a/pandas/tests/frame/methods/test_cov_corr.py
+++ b/pandas/tests/frame/methods/test_cov_corr.py
@@ -191,6 +191,23 @@ def test_corr_nullable_integer(self, nullable_column, other_column, method):
expected = pd.DataFrame(np.ones((2, 2)), columns=["a", "b"], index=["a", "b"])
tm.assert_frame_equal(result, expected)
+ def test_corr_item_cache(self):
+ # Check that corr does not lead to incorrect entries in item_cache
+
+ df = pd.DataFrame({"A": range(10)})
+ df["B"] = range(10)[::-1]
+
+ ser = df["A"] # populate item_cache
+ assert len(df._mgr.blocks) == 2
+
+ _ = df.corr()
+
+ # Check that the corr didn't break the link between ser and df
+ ser.values[0] = 99
+ assert df.loc[0, "A"] == 99
+ assert df["A"] is ser
+ assert df.values[0, 0] == 99
+
class TestDataFrameCorrWith:
def test_corrwith(self, datetime_frame):
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
index b61d0d28e2fba..0b70bead375da 100644
--- a/pandas/tests/frame/methods/test_describe.py
+++ b/pandas/tests/frame/methods/test_describe.py
@@ -267,7 +267,69 @@ def test_describe_tz_values(self, tz_naive_fixture):
},
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
)
- result = df.describe(include="all")
+ result = df.describe(include="all", datetime_is_numeric=True)
+ tm.assert_frame_equal(result, expected)
+
+ def test_datetime_is_numeric_includes_datetime(self):
+ df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]})
+ result = df.describe(datetime_is_numeric=True)
+ expected = pd.DataFrame(
+ {
+ "a": [
+ 3,
+ pd.Timestamp("2012-01-02"),
+ pd.Timestamp("2012-01-01"),
+ pd.Timestamp("2012-01-01T12:00:00"),
+ pd.Timestamp("2012-01-02"),
+ pd.Timestamp("2012-01-02T12:00:00"),
+ pd.Timestamp("2012-01-03"),
+ np.nan,
+ ],
+ "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
+ },
+ index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_describe_tz_values2(self):
+ tz = "CET"
+ s1 = Series(range(5))
+ start = Timestamp(2018, 1, 1)
+ end = Timestamp(2018, 1, 5)
+ s2 = Series(date_range(start, end, tz=tz))
+ df = pd.DataFrame({"s1": s1, "s2": s2})
+
+ s1_ = s1.describe()
+ s2_ = pd.Series(
+ [
+ 5,
+ 5,
+ s2.value_counts().index[0],
+ 1,
+ start.tz_localize(tz),
+ end.tz_localize(tz),
+ ],
+ index=["count", "unique", "top", "freq", "first", "last"],
+ )
+ idx = [
+ "count",
+ "unique",
+ "top",
+ "freq",
+ "first",
+ "last",
+ "mean",
+ "std",
+ "min",
+ "25%",
+ "50%",
+ "75%",
+ "max",
+ ]
+ expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.describe(include="all")
tm.assert_frame_equal(result, expected)
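For reference, a minimal sketch of the datetime_is_numeric keyword these describe tests target (added in pandas 1.1; without it, describing datetime data falls back to the categorical-style summary and, as asserted above, emits a FutureWarning):

import pandas as pd

df = pd.DataFrame({"ts": pd.date_range("2021-01-01", periods=4)})
# Treat datetimes as numeric: report mean/min/quantiles/max
# instead of unique/top/freq/first/last.
df.describe(datetime_is_numeric=True)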
def test_describe_percentiles_integer_idx(self):
diff --git a/pandas/tests/frame/methods/test_diff.py b/pandas/tests/frame/methods/test_diff.py
index 45f134a93a23a..0486fb2d588b6 100644
--- a/pandas/tests/frame/methods/test_diff.py
+++ b/pandas/tests/frame/methods/test_diff.py
@@ -214,3 +214,12 @@ def test_diff_integer_na(self, axis, expected):
# Test case for default behaviour of diff
result = df.diff(axis=axis)
tm.assert_frame_equal(result, expected)
+
+ def test_diff_readonly(self):
+ # https://github.com/pandas-dev/pandas/issues/35559
+ arr = np.random.randn(5, 2)
+ arr.flags.writeable = False
+ df = pd.DataFrame(arr)
+ result = df.diff()
+ expected = pd.DataFrame(np.array(df)).diff()
+ tm.assert_frame_equal(result, expected)
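A minimal sketch of the read-only setup test_diff_readonly relies on; NumPy lets you freeze an array's buffer via its writeable flag:

import numpy as np

arr = np.random.randn(5, 2)
arr.flags.writeable = False
# arr[0, 0] = 1.0  # would raise ValueError: assignment destination is read-only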
diff --git a/pandas/tests/frame/methods/test_drop.py b/pandas/tests/frame/methods/test_drop.py
index 177d10cdbf615..aa44a2427dc8f 100644
--- a/pandas/tests/frame/methods/test_drop.py
+++ b/pandas/tests/frame/methods/test_drop.py
@@ -70,8 +70,10 @@ def test_drop_names(self):
df_dropped_b = df.drop("b")
df_dropped_e = df.drop("e", axis=1)
df_inplace_b, df_inplace_e = df.copy(), df.copy()
- df_inplace_b.drop("b", inplace=True)
- df_inplace_e.drop("e", axis=1, inplace=True)
+ return_value = df_inplace_b.drop("b", inplace=True)
+ assert return_value is None
+ return_value = df_inplace_e.drop("e", axis=1, inplace=True)
+ assert return_value is None
for obj in (df_dropped_b, df_dropped_e, df_inplace_b, df_inplace_e):
assert obj.index.name == "first"
assert obj.columns.name == "second"
@@ -148,7 +150,8 @@ def test_drop(self):
# GH#5628
df = pd.DataFrame(np.random.randn(10, 3), columns=list("abc"))
expected = df[~(df.b > 0)]
- df.drop(labels=df[df.b > 0].index, inplace=True)
+ return_value = df.drop(labels=df[df.b > 0].index, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(df, expected)
def test_drop_multiindex_not_lexsorted(self):
diff --git a/pandas/tests/frame/methods/test_drop_duplicates.py b/pandas/tests/frame/methods/test_drop_duplicates.py
index 7c6391140e2bb..cebec215a0d9d 100644
--- a/pandas/tests/frame/methods/test_drop_duplicates.py
+++ b/pandas/tests/frame/methods/test_drop_duplicates.py
@@ -333,64 +333,73 @@ def test_drop_duplicates_inplace():
)
# single column
df = orig.copy()
- df.drop_duplicates("A", inplace=True)
+ return_value = df.drop_duplicates("A", inplace=True)
expected = orig[:2]
result = df
tm.assert_frame_equal(result, expected)
+ assert return_value is None
df = orig.copy()
- df.drop_duplicates("A", keep="last", inplace=True)
+ return_value = df.drop_duplicates("A", keep="last", inplace=True)
expected = orig.loc[[6, 7]]
result = df
tm.assert_frame_equal(result, expected)
+ assert return_value is None
df = orig.copy()
- df.drop_duplicates("A", keep=False, inplace=True)
+ return_value = df.drop_duplicates("A", keep=False, inplace=True)
expected = orig.loc[[]]
result = df
tm.assert_frame_equal(result, expected)
assert len(df) == 0
+ assert return_value is None
# multi column
df = orig.copy()
- df.drop_duplicates(["A", "B"], inplace=True)
+ return_value = df.drop_duplicates(["A", "B"], inplace=True)
expected = orig.loc[[0, 1, 2, 3]]
result = df
tm.assert_frame_equal(result, expected)
+ assert return_value is None
df = orig.copy()
- df.drop_duplicates(["A", "B"], keep="last", inplace=True)
+ return_value = df.drop_duplicates(["A", "B"], keep="last", inplace=True)
expected = orig.loc[[0, 5, 6, 7]]
result = df
tm.assert_frame_equal(result, expected)
+ assert return_value is None
df = orig.copy()
- df.drop_duplicates(["A", "B"], keep=False, inplace=True)
+ return_value = df.drop_duplicates(["A", "B"], keep=False, inplace=True)
expected = orig.loc[[0]]
result = df
tm.assert_frame_equal(result, expected)
+ assert return_value is None
# consider everything
orig2 = orig.loc[:, ["A", "B", "C"]].copy()
df2 = orig2.copy()
- df2.drop_duplicates(inplace=True)
+ return_value = df2.drop_duplicates(inplace=True)
# in this case only
expected = orig2.drop_duplicates(["A", "B"])
result = df2
tm.assert_frame_equal(result, expected)
+ assert return_value is None
df2 = orig2.copy()
- df2.drop_duplicates(keep="last", inplace=True)
+ return_value = df2.drop_duplicates(keep="last", inplace=True)
expected = orig2.drop_duplicates(["A", "B"], keep="last")
result = df2
tm.assert_frame_equal(result, expected)
+ assert return_value is None
df2 = orig2.copy()
- df2.drop_duplicates(keep=False, inplace=True)
+ return_value = df2.drop_duplicates(keep=False, inplace=True)
expected = orig2.drop_duplicates(["A", "B"], keep=False)
result = df2
tm.assert_frame_equal(result, expected)
+ assert return_value is None
@pytest.mark.parametrize("inplace", [True, False])
diff --git a/pandas/tests/frame/methods/test_explode.py b/pandas/tests/frame/methods/test_explode.py
index 2bbe8ac2d5b81..bd0901387eeed 100644
--- a/pandas/tests/frame/methods/test_explode.py
+++ b/pandas/tests/frame/methods/test_explode.py
@@ -172,3 +172,11 @@ def test_ignore_index():
{"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3]
)
tm.assert_frame_equal(result, expected)
+
+
+def test_explode_sets():
+ # https://github.com/pandas-dev/pandas/issues/35614
+ df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])
+ result = df.explode(column="a").sort_values(by="a")
+ expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
+ tm.assert_frame_equal(result, expected)
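For reference, a minimal sketch of DataFrame.explode, which test_explode_sets extends to set-valued cells (set iteration order is arbitrary, hence the sort_values in the test):

import pandas as pd

df = pd.DataFrame({"a": [[1, 2], [3]], "b": ["x", "y"]})
df.explode("a")  # one output row per list element; the original index labels repeat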
diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py
index facb116646573..6b86a13fcf1b9 100644
--- a/pandas/tests/frame/methods/test_interpolate.py
+++ b/pandas/tests/frame/methods/test_interpolate.py
@@ -34,6 +34,14 @@ def test_interp_basic(self):
expected.loc[5, "B"] = 9
tm.assert_frame_equal(result, expected)
+ def test_interp_empty(self):
+ # https://github.com/pandas-dev/pandas/issues/35598
+ df = DataFrame()
+ result = df.interpolate()
+ assert result is not df
+ expected = df
+ tm.assert_frame_equal(result, expected)
+
def test_interp_bad_method(self):
df = DataFrame(
{
@@ -246,11 +254,13 @@ def test_interp_inplace(self):
df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
result = df.copy()
- result["a"].interpolate(inplace=True)
+ return_value = result["a"].interpolate(inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
result = df.copy()
- result["a"].interpolate(inplace=True, downcast="infer")
+ return_value = result["a"].interpolate(inplace=True, downcast="infer")
+ assert return_value is None
tm.assert_frame_equal(result, expected.astype("int64"))
def test_interp_inplace_row(self):
@@ -259,7 +269,8 @@ def test_interp_inplace_row(self):
{"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
)
expected = result.interpolate(method="linear", axis=1, inplace=False)
- result.interpolate(method="linear", axis=1, inplace=True)
+ return_value = result.interpolate(method="linear", axis=1, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
def test_interp_ignore_all_good(self):
@@ -297,7 +308,8 @@ def test_interp_time_inplace_axis(self, axis):
expected = DataFrame(index=idx, columns=idx, data=data)
result = expected.interpolate(axis=0, method="time")
- expected.interpolate(axis=0, method="time", inplace=True)
+ return_value = expected.interpolate(axis=0, method="time", inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)])
diff --git a/pandas/tests/frame/methods/test_isin.py b/pandas/tests/frame/methods/test_isin.py
index 79ea70a38f145..35d45bd00131b 100644
--- a/pandas/tests/frame/methods/test_isin.py
+++ b/pandas/tests/frame/methods/test_isin.py
@@ -189,3 +189,18 @@ def test_isin_empty_datetimelike(self):
tm.assert_frame_equal(result, expected)
result = df1_td.isin(df3)
tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "values",
+ [
+ pd.DataFrame({"a": [1, 2, 3]}, dtype="category"),
+ pd.Series([1, 2, 3], dtype="category"),
+ ],
+ )
+ def test_isin_category_frame(self, values):
+ # GH#34256
+ df = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+ expected = DataFrame({"a": [True, True, True], "b": [False, False, False]})
+
+ result = df.isin(values)
+ tm.assert_frame_equal(result, expected)
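For reference, DataFrame.isin with a frame-like argument matches on both index and column labels, which is why only column "a" comes back True above. A minimal illustrative sketch:

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
other = pd.DataFrame({"a": [1, 99]})  # aligned on index and columns
df.isin(other)  # a: [True, False]; b: [False, False]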
diff --git a/pandas/tests/frame/methods/test_quantile.py b/pandas/tests/frame/methods/test_quantile.py
index 0eec30cbc5c67..0b8f1e0495155 100644
--- a/pandas/tests/frame/methods/test_quantile.py
+++ b/pandas/tests/frame/methods/test_quantile.py
@@ -7,14 +7,29 @@
class TestDataFrameQuantile:
- def test_quantile_sparse(self):
+ @pytest.mark.parametrize(
+ "df,expected",
+ [
+ [
+ pd.DataFrame(
+ {
+ 0: pd.Series(pd.arrays.SparseArray([1, 2])),
+ 1: pd.Series(pd.arrays.SparseArray([3, 4])),
+ }
+ ),
+ pd.Series([1.5, 3.5], name=0.5),
+ ],
+ [
+ pd.DataFrame(pd.Series([0.0, None, 1.0, 2.0], dtype="Sparse[float]")),
+ pd.Series([1.0], name=0.5),
+ ],
+ ],
+ )
+ def test_quantile_sparse(self, df, expected):
# GH#17198
- s = pd.Series(pd.arrays.SparseArray([1, 2]))
- s1 = pd.Series(pd.arrays.SparseArray([3, 4]))
- df = pd.DataFrame({0: s, 1: s1})
+ # GH#24600
result = df.quantile()
- expected = pd.Series([1.5, 3.5], name=0.5)
tm.assert_series_equal(result, expected)
def test_quantile(self, datetime_frame):
@@ -59,6 +74,20 @@ def test_quantile(self, datetime_frame):
expected = Series([3.0, 4.0], index=[0, 1], name=0.5)
tm.assert_series_equal(result, expected)
+ def test_quantile_date_range(self):
+ # GH 2460
+
+ dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
+ ser = pd.Series(dti)
+ df = pd.DataFrame(ser)
+
+ result = df.quantile(numeric_only=False)
+ expected = pd.Series(
+ ["2016-01-02 00:00:00"], name=0.5, dtype="datetime64[ns, US/Pacific]"
+ )
+
+ tm.assert_series_equal(result, expected)
+
def test_quantile_axis_mixed(self):
# mixed on axis=1
diff --git a/pandas/tests/frame/methods/test_rename.py b/pandas/tests/frame/methods/test_rename.py
index ffad526d3f4d1..eb908e9472fe2 100644
--- a/pandas/tests/frame/methods/test_rename.py
+++ b/pandas/tests/frame/methods/test_rename.py
@@ -150,7 +150,8 @@ def test_rename_inplace(self, float_frame):
c_id = id(float_frame["C"])
float_frame = float_frame.copy()
- float_frame.rename(columns={"C": "foo"}, inplace=True)
+ return_value = float_frame.rename(columns={"C": "foo"}, inplace=True)
+ assert return_value is None
assert "C" not in float_frame
assert "foo" in float_frame
diff --git a/pandas/tests/frame/methods/test_rename_axis.py b/pandas/tests/frame/methods/test_rename_axis.py
index 9b964d842526c..3339119841813 100644
--- a/pandas/tests/frame/methods/test_rename_axis.py
+++ b/pandas/tests/frame/methods/test_rename_axis.py
@@ -10,14 +10,16 @@ def test_rename_axis_inplace(self, float_frame):
# GH#15704
expected = float_frame.rename_axis("foo")
result = float_frame.copy()
- no_return = result.rename_axis("foo", inplace=True)
+ return_value = no_return = result.rename_axis("foo", inplace=True)
+ assert return_value is None
assert no_return is None
tm.assert_frame_equal(result, expected)
expected = float_frame.rename_axis("bar", axis=1)
result = float_frame.copy()
- no_return = result.rename_axis("bar", axis=1, inplace=True)
+ return_value = no_return = result.rename_axis("bar", axis=1, inplace=True)
+ assert return_value is None
assert no_return is None
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_replace.py b/pandas/tests/frame/methods/test_replace.py
index 498f7f7790514..a77753ed9f9d0 100644
--- a/pandas/tests/frame/methods/test_replace.py
+++ b/pandas/tests/frame/methods/test_replace.py
@@ -27,7 +27,8 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
datetime_frame["A"][-5:] = np.nan
tsframe = datetime_frame.copy()
- tsframe.replace(np.nan, 0, inplace=True)
+ return_value = tsframe.replace(np.nan, 0, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(tsframe, datetime_frame.fillna(0))
# mixed type
@@ -40,7 +41,8 @@ def test_replace_inplace(self, datetime_frame, float_string_frame):
tm.assert_frame_equal(result, expected)
tsframe = datetime_frame.copy()
- tsframe.replace([np.nan], [0], inplace=True)
+ return_value = tsframe.replace([np.nan], [0], inplace=True)
+ assert return_value is None
tm.assert_frame_equal(tsframe, datetime_frame.fillna(0))
def test_regex_replace_scalar(self, mix_ab):
@@ -117,18 +119,21 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# regex -> value
# obj frame
res = dfobj.copy()
- res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True)
+ return_value = res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
- res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True)
+ return_value = res.replace(r"\s*\.\s*", np.nan, regex=True, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
- res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True)
+ return_value = res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True)
+ assert return_value is None
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
@@ -136,7 +141,8 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# with mixed
res = dfmix.copy()
- res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True)
+ return_value = res.replace(r"\s*(\.)\s*", r"\1\1\1", regex=True, inplace=True)
+ assert return_value is None
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
@@ -144,18 +150,27 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# everything with compiled regexs as well
res = dfobj.copy()
- res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True)
+ return_value = res.replace(
+ re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True
+ )
+ assert return_value is None
tm.assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
- res.replace(re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True)
+ return_value = res.replace(
+ re.compile(r"\s*\.\s*"), np.nan, regex=True, inplace=True
+ )
+ assert return_value is None
tm.assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
- res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True)
+ return_value = res.replace(
+ re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True
+ )
+ assert return_value is None
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
@@ -163,25 +178,31 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# with mixed
res = dfmix.copy()
- res.replace(re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True)
+ return_value = res.replace(
+ re.compile(r"\s*(\.)\s*"), r"\1\1\1", regex=True, inplace=True
+ )
+ assert return_value is None
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
tm.assert_frame_equal(res, expec)
res = dfobj.copy()
- res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True)
+ return_value = res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
- res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True)
+ return_value = res.replace(regex=r"\s*\.\s*", value=np.nan, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
- res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True)
+ return_value = res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True)
+ assert return_value is None
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
@@ -189,7 +210,8 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# with mixed
res = dfmix.copy()
- res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True)
+ return_value = res.replace(regex=r"\s*(\.)\s*", value=r"\1\1\1", inplace=True)
+ assert return_value is None
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
@@ -197,18 +219,27 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# everything with compiled regexs as well
res = dfobj.copy()
- res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True)
+ return_value = res.replace(
+ regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True
+ )
+ assert return_value is None
tm.assert_frame_equal(dfobj, res.fillna("."))
# mixed
res = dfmix.copy()
- res.replace(regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True)
+ return_value = res.replace(
+ regex=re.compile(r"\s*\.\s*"), value=np.nan, inplace=True
+ )
+ assert return_value is None
tm.assert_frame_equal(dfmix, res.fillna("."))
# regex -> regex
# obj frame
res = dfobj.copy()
- res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True)
+ return_value = res.replace(
+ regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True
+ )
+ assert return_value is None
objc = obj.copy()
objc["a"] = ["a", "b", "...", "..."]
expec = DataFrame(objc)
@@ -216,7 +247,10 @@ def test_regex_replace_scalar_inplace(self, mix_ab):
# with mixed
res = dfmix.copy()
- res.replace(regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True)
+ return_value = res.replace(
+ regex=re.compile(r"\s*(\.)\s*"), value=r"\1\1\1", inplace=True
+ )
+ assert return_value is None
mixc = mix_ab.copy()
mixc["b"] = ["a", "b", "...", "..."]
expec = DataFrame(mixc)
@@ -290,7 +324,8 @@ def test_regex_replace_list_obj_inplace(self):
to_replace_res = [r"\s*\.\s*", r"e|f|g"]
values = [np.nan, "crap"]
res = dfobj.copy()
- res.replace(to_replace_res, values, inplace=True, regex=True)
+ return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
+ assert return_value is None
expec = DataFrame(
{
"a": ["a", "b", np.nan, np.nan],
@@ -304,7 +339,8 @@ def test_regex_replace_list_obj_inplace(self):
to_replace_res = [r"\s*(\.)\s*", r"(e|f|g)"]
values = [r"\1\1", r"\1_crap"]
res = dfobj.copy()
- res.replace(to_replace_res, values, inplace=True, regex=True)
+ return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
+ assert return_value is None
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
@@ -319,7 +355,8 @@ def test_regex_replace_list_obj_inplace(self):
to_replace_res = [r"\s*(\.)\s*", r"e"]
values = [r"\1\1", r"crap"]
res = dfobj.copy()
- res.replace(to_replace_res, values, inplace=True, regex=True)
+ return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
+ assert return_value is None
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
@@ -332,7 +369,8 @@ def test_regex_replace_list_obj_inplace(self):
to_replace_res = [r"\s*(\.)\s*", r"e"]
values = [r"\1\1", r"crap"]
res = dfobj.copy()
- res.replace(value=values, regex=to_replace_res, inplace=True)
+ return_value = res.replace(value=values, regex=to_replace_res, inplace=True)
+ assert return_value is None
expec = DataFrame(
{
"a": ["a", "b", "..", ".."],
@@ -391,7 +429,8 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab):
to_replace_res = [r"\s*\.\s*", r"a"]
values = [np.nan, "crap"]
res = dfmix.copy()
- res.replace(to_replace_res, values, inplace=True, regex=True)
+ return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
+ assert return_value is None
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b", np.nan, np.nan]})
tm.assert_frame_equal(res, expec)
@@ -399,7 +438,8 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab):
to_replace_res = [r"\s*(\.)\s*", r"(a|b)"]
values = [r"\1\1", r"\1_crap"]
res = dfmix.copy()
- res.replace(to_replace_res, values, inplace=True, regex=True)
+ return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
+ assert return_value is None
expec = DataFrame({"a": mix_ab["a"], "b": ["a_crap", "b_crap", "..", ".."]})
tm.assert_frame_equal(res, expec)
@@ -408,14 +448,16 @@ def test_regex_replace_list_mixed_inplace(self, mix_ab):
to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
values = [r"\1\1", r"crap", r"\1_crap"]
res = dfmix.copy()
- res.replace(to_replace_res, values, inplace=True, regex=True)
+ return_value = res.replace(to_replace_res, values, inplace=True, regex=True)
+ assert return_value is None
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
tm.assert_frame_equal(res, expec)
to_replace_res = [r"\s*(\.)\s*", r"a", r"(b)"]
values = [r"\1\1", r"crap", r"\1_crap"]
res = dfmix.copy()
- res.replace(regex=to_replace_res, value=values, inplace=True)
+ return_value = res.replace(regex=to_replace_res, value=values, inplace=True)
+ assert return_value is None
expec = DataFrame({"a": mix_ab["a"], "b": ["crap", "b_crap", "..", ".."]})
tm.assert_frame_equal(res, expec)
@@ -430,7 +472,10 @@ def test_regex_replace_dict_mixed(self, mix_abc):
# frame
res = dfmix.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, regex=True)
res2 = dfmix.copy()
- res2.replace({"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True)
+ return_value = res2.replace(
+ {"b": r"\s*\.\s*"}, {"b": np.nan}, inplace=True, regex=True
+ )
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
)
@@ -441,7 +486,10 @@ def test_regex_replace_dict_mixed(self, mix_abc):
# whole frame
res = dfmix.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, regex=True)
res2 = dfmix.copy()
- res2.replace({"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True)
+ return_value = res2.replace(
+ {"b": r"\s*(\.)\s*"}, {"b": r"\1ty"}, inplace=True, regex=True
+ )
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]}
)
@@ -450,7 +498,10 @@ def test_regex_replace_dict_mixed(self, mix_abc):
res = dfmix.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"})
res2 = dfmix.copy()
- res2.replace(regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True)
+ return_value = res2.replace(
+ regex={"b": r"\s*(\.)\s*"}, value={"b": r"\1ty"}, inplace=True
+ )
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", ".ty", ".ty"], "c": mix_abc["c"]}
)
@@ -464,13 +515,15 @@ def test_regex_replace_dict_mixed(self, mix_abc):
)
res = dfmix.replace("a", {"b": np.nan}, regex=True)
res2 = dfmix.copy()
- res2.replace("a", {"b": np.nan}, regex=True, inplace=True)
+ return_value = res2.replace("a", {"b": np.nan}, regex=True, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(res, expec)
tm.assert_frame_equal(res2, expec)
res = dfmix.replace("a", {"b": np.nan}, regex=True)
res2 = dfmix.copy()
- res2.replace(regex="a", value={"b": np.nan}, inplace=True)
+ return_value = res2.replace(regex="a", value={"b": np.nan}, inplace=True)
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": [np.nan, "b", ".", "."], "c": mix_abc["c"]}
)
@@ -483,9 +536,13 @@ def test_regex_replace_dict_nested(self, mix_abc):
res = dfmix.replace({"b": {r"\s*\.\s*": np.nan}}, regex=True)
res2 = dfmix.copy()
res4 = dfmix.copy()
- res2.replace({"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True)
+ return_value = res2.replace(
+ {"b": {r"\s*\.\s*": np.nan}}, inplace=True, regex=True
+ )
+ assert return_value is None
res3 = dfmix.replace(regex={"b": {r"\s*\.\s*": np.nan}})
- res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True)
+ return_value = res4.replace(regex={"b": {r"\s*\.\s*": np.nan}}, inplace=True)
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
)
@@ -519,8 +576,14 @@ def test_regex_replace_list_to_scalar(self, mix_abc):
res = df.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True)
res2 = df.copy()
res3 = df.copy()
- res2.replace([r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True)
- res3.replace(regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True)
+ return_value = res2.replace(
+ [r"\s*\.\s*", "a|b"], np.nan, regex=True, inplace=True
+ )
+ assert return_value is None
+ return_value = res3.replace(
+ regex=[r"\s*\.\s*", "a|b"], value=np.nan, inplace=True
+ )
+ assert return_value is None
tm.assert_frame_equal(res, expec)
tm.assert_frame_equal(res2, expec)
tm.assert_frame_equal(res3, expec)
@@ -530,9 +593,11 @@ def test_regex_replace_str_to_numeric(self, mix_abc):
df = DataFrame(mix_abc)
res = df.replace(r"\s*\.\s*", 0, regex=True)
res2 = df.copy()
- res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True)
+ return_value = res2.replace(r"\s*\.\s*", 0, inplace=True, regex=True)
+ assert return_value is None
res3 = df.copy()
- res3.replace(regex=r"\s*\.\s*", value=0, inplace=True)
+ return_value = res3.replace(regex=r"\s*\.\s*", value=0, inplace=True)
+ assert return_value is None
expec = DataFrame({"a": mix_abc["a"], "b": ["a", "b", 0, 0], "c": mix_abc["c"]})
tm.assert_frame_equal(res, expec)
tm.assert_frame_equal(res2, expec)
@@ -542,9 +607,11 @@ def test_regex_replace_regex_list_to_numeric(self, mix_abc):
df = DataFrame(mix_abc)
res = df.replace([r"\s*\.\s*", "b"], 0, regex=True)
res2 = df.copy()
- res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True)
+ return_value = res2.replace([r"\s*\.\s*", "b"], 0, regex=True, inplace=True)
+ assert return_value is None
res3 = df.copy()
- res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True)
+ return_value = res3.replace(regex=[r"\s*\.\s*", "b"], value=0, inplace=True)
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", 0, 0, 0], "c": ["a", 0, np.nan, "d"]}
)
@@ -558,9 +625,11 @@ def test_regex_replace_series_of_regexes(self, mix_abc):
s2 = Series({"b": np.nan})
res = df.replace(s1, s2, regex=True)
res2 = df.copy()
- res2.replace(s1, s2, inplace=True, regex=True)
+ return_value = res2.replace(s1, s2, inplace=True, regex=True)
+ assert return_value is None
res3 = df.copy()
- res3.replace(regex=s1, value=s2, inplace=True)
+ return_value = res3.replace(regex=s1, value=s2, inplace=True)
+ assert return_value is None
expec = DataFrame(
{"a": mix_abc["a"], "b": ["a", "b", np.nan, np.nan], "c": mix_abc["c"]}
)
@@ -714,7 +783,8 @@ def test_replace_mixed(self, float_string_frame):
result = df.replace(0, 0.5)
tm.assert_frame_equal(result, expected)
- df.replace(0, 0.5, inplace=True)
+ return_value = df.replace(0, 0.5, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(df, expected)
# int block splitting
@@ -942,7 +1012,8 @@ def test_replace_input_formats_listlike(self):
result = df.replace(to_rep, values)
expected = df.copy()
for i in range(len(to_rep)):
- expected.replace(to_rep[i], values[i], inplace=True)
+ return_value = expected.replace(to_rep[i], values[i], inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
msg = r"Replacement lists must match in length\. Expecting 3 got 2"
@@ -969,7 +1040,8 @@ def test_replace_input_formats_scalar(self):
result = df.replace(to_rep, -1)
expected = df.copy()
for i in range(len(to_rep)):
- expected.replace(to_rep[i], -1, inplace=True)
+ return_value = expected.replace(to_rep[i], -1, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(result, expected)
def test_replace_limit(self):
@@ -1059,8 +1131,19 @@ def test_replace_bool_with_bool(self):
def test_replace_with_dict_with_bool_keys(self):
df = DataFrame({0: [True, False], 1: [False, True]})
- with pytest.raises(TypeError, match="Cannot compare types .+"):
- df.replace({"asdf": "asdb", True: "yes"})
+ result = df.replace({"asdf": "asdb", True: "yes"})
+ expected = DataFrame({0: ["yes", False], 1: [False, "yes"]})
+ tm.assert_frame_equal(result, expected)
+
+ def test_replace_dict_strings_vs_ints(self):
+ # GH#34789
+ df = pd.DataFrame({"Y0": [1, 2], "Y1": [3, 4]})
+ result = df.replace({"replace_string": "test"})
+
+ tm.assert_frame_equal(result, df)
+
+ result = df["Y0"].replace({"replace_string": "test"})
+ tm.assert_series_equal(result, df["Y0"])
def test_replace_truthy(self):
df = DataFrame({"a": [True, True]})
@@ -1321,7 +1404,8 @@ def test_categorical_replace_with_dict(self, replace_dict, final_data):
with pytest.raises(AssertionError, match=msg):
# ensure non-inplace call does not affect original
tm.assert_frame_equal(df, expected)
- df.replace(replace_dict, 3, inplace=True)
+ return_value = df.replace(replace_dict, 3, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
@@ -1420,3 +1504,106 @@ def test_replace_period_ignore_float(self):
result = df.replace(1.0, 0.0)
expected = pd.DataFrame({"Per": [pd.Period("2020-01")] * 3})
tm.assert_frame_equal(expected, result)
+
+ def test_replace_value_category_type(self):
+ """
+ Test for #23305: to ensure category dtypes are maintained
+ after replace with direct values
+ """
+
+ # create input data
+ input_dict = {
+ "col1": [1, 2, 3, 4],
+ "col2": ["a", "b", "c", "d"],
+ "col3": [1.5, 2.5, 3.5, 4.5],
+ "col4": ["cat1", "cat2", "cat3", "cat4"],
+ "col5": ["obj1", "obj2", "obj3", "obj4"],
+ }
+ # explicitly cast columns as category and order them
+ input_df = pd.DataFrame(data=input_dict).astype(
+ {"col2": "category", "col4": "category"}
+ )
+ input_df["col2"] = input_df["col2"].cat.reorder_categories(
+ ["a", "b", "c", "d"], ordered=True
+ )
+ input_df["col4"] = input_df["col4"].cat.reorder_categories(
+ ["cat1", "cat2", "cat3", "cat4"], ordered=True
+ )
+
+ # create expected dataframe
+ expected_dict = {
+ "col1": [1, 2, 3, 4],
+ "col2": ["a", "b", "c", "z"],
+ "col3": [1.5, 2.5, 3.5, 4.5],
+ "col4": ["cat1", "catX", "cat3", "cat4"],
+ "col5": ["obj9", "obj2", "obj3", "obj4"],
+ }
+ # explicitly cast columns as category and order them
+ expected = pd.DataFrame(data=expected_dict).astype(
+ {"col2": "category", "col4": "category"}
+ )
+ expected["col2"] = expected["col2"].cat.reorder_categories(
+ ["a", "b", "c", "z"], ordered=True
+ )
+ expected["col4"] = expected["col4"].cat.reorder_categories(
+ ["cat1", "catX", "cat3", "cat4"], ordered=True
+ )
+
+ # replace values in input dataframe
+ input_df = input_df.replace("d", "z")
+ input_df = input_df.replace("obj1", "obj9")
+ result = input_df.replace("cat2", "catX")
+
+ tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.xfail(
+ reason="category dtype gets changed to object type after replace, see #35268",
+ strict=True,
+ )
+ def test_replace_dict_category_type(self):
+ """
+ Test to ensure category dtypes are maintained
+ after replace with dict values
+ """
+
+ # create input dataframe
+ input_dict = {"col1": ["a"], "col2": ["obj1"], "col3": ["cat1"]}
+ # explicitly cast columns as category
+ input_df = pd.DataFrame(data=input_dict).astype(
+ {"col1": "category", "col2": "category", "col3": "category"}
+ )
+
+ # create expected dataframe
+ expected_dict = {"col1": ["z"], "col2": ["obj9"], "col3": ["catX"]}
+ # explicitly cast columns as category
+ expected = pd.DataFrame(data=expected_dict).astype(
+ {"col1": "category", "col2": "category", "col3": "category"}
+ )
+
+ # replace values in input dataframe using a dict
+ result = input_df.replace({"a": "z", "obj1": "obj9", "cat1": "catX"})
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_replace_with_compiled_regex(self):
+ # https://github.com/pandas-dev/pandas/issues/35680
+ df = pd.DataFrame(["a", "b", "c"])
+ regex = re.compile("^a$")
+ result = df.replace({regex: "z"}, regex=True)
+ expected = pd.DataFrame(["z", "b", "c"])
+ tm.assert_frame_equal(result, expected)
+
+ def test_replace_intervals(self):
+ # https://github.com/pandas-dev/pandas/issues/35931
+ df = pd.DataFrame({"a": [pd.Interval(0, 1), pd.Interval(0, 1)]})
+ result = df.replace({"a": {pd.Interval(0, 1): "x"}})
+ expected = pd.DataFrame({"a": ["x", "x"]})
+ tm.assert_frame_equal(result, expected)
+
+ def test_replace_unicode(self):
+ # GH: 16784
+ columns_values_map = {"positive": {"正面": 1, "中立": 1, "负面": 0}}
+ df1 = pd.DataFrame({"positive": np.ones(3)})
+ result = df1.replace(columns_values_map)
+ expected = pd.DataFrame({"positive": np.ones(3)})
+ tm.assert_frame_equal(result, expected)
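For reference, a minimal sketch of the nested-dict form of replace used throughout the tests above: outer keys select columns, inner dicts map old values to new ones, and keys with no match are ignored (the no-op behavior test_replace_dict_strings_vs_ints checks):

import pandas as pd

df = pd.DataFrame({"grade": ["a", "b", "a"], "score": [1, 2, 3]})
df.replace({"grade": {"a": "pass", "b": "fail"}})  # only the "grade" column changes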
diff --git a/pandas/tests/frame/methods/test_reset_index.py b/pandas/tests/frame/methods/test_reset_index.py
index 79442acccb326..b88ef0e6691cb 100644
--- a/pandas/tests/frame/methods/test_reset_index.py
+++ b/pandas/tests/frame/methods/test_reset_index.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+import pandas as pd
from pandas import (
DataFrame,
Index,
@@ -118,7 +119,8 @@ def test_reset_index(self, float_frame):
# test resetting in place
df = float_frame.copy()
resetted = float_frame.reset_index()
- df.reset_index(inplace=True)
+ return_value = df.reset_index(inplace=True)
+ assert return_value is None
tm.assert_frame_equal(df, resetted, check_names=False)
df = float_frame.reset_index().set_index(["index", "A", "B"])
@@ -136,7 +138,8 @@ def test_reset_index_name(self):
)
assert df.reset_index().index.name is None
assert df.reset_index(drop=True).index.name is None
- df.reset_index(inplace=True)
+ return_value = df.reset_index(inplace=True)
+ assert return_value is None
assert df.index.name is None
def test_reset_index_level(self):
@@ -299,9 +302,49 @@ def test_reset_index_range(self):
tm.assert_frame_equal(result, expected)
-def test_reset_index_dtypes_on_empty_frame_with_multiindex():
+@pytest.mark.parametrize(
+ "array, dtype",
+ [
+ (["a", "b"], object),
+ (
+ pd.period_range("12-1-2000", periods=2, freq="Q-DEC"),
+ pd.PeriodDtype(freq="Q-DEC"),
+ ),
+ ],
+)
+def test_reset_index_dtypes_on_empty_frame_with_multiindex(array, dtype):
# GH 19602 - Preserve dtype on empty DataFrame with MultiIndex
- idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], ["a", "b"]])
+ idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array])
result = DataFrame(index=idx)[:0].reset_index().dtypes
- expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": object})
+ expected = Series({"level_0": np.int64, "level_1": np.float64, "level_2": dtype})
tm.assert_series_equal(result, expected)
+
+
+def test_reset_index_empty_frame_with_datetime64_multiindex():
+ # https://github.com/pandas-dev/pandas/issues/35606
+ idx = MultiIndex(
+ levels=[[pd.Timestamp("2020-07-20 00:00:00")], [3, 4]],
+ codes=[[], []],
+ names=["a", "b"],
+ )
+ df = DataFrame(index=idx, columns=["c", "d"])
+ result = df.reset_index()
+ expected = DataFrame(
+ columns=list("abcd"), index=RangeIndex(start=0, stop=0, step=1)
+ )
+ expected["a"] = expected["a"].astype("datetime64[ns]")
+ expected["b"] = expected["b"].astype("int64")
+ tm.assert_frame_equal(result, expected)
+
+
+def test_reset_index_empty_frame_with_datetime64_multiindex_from_groupby():
+ # https://github.com/pandas-dev/pandas/issues/35657
+ df = DataFrame(dict(c1=[10.0], c2=["a"], c3=pd.to_datetime("2020-01-01")))
+ df = df.head(0).groupby(["c2", "c3"])[["c1"]].sum()
+ result = df.reset_index()
+ expected = DataFrame(
+ columns=["c2", "c3", "c1"], index=RangeIndex(start=0, stop=0, step=1)
+ )
+ expected["c3"] = expected["c3"].astype("datetime64[ns]")
+ expected["c1"] = expected["c1"].astype("float64")
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_set_index.py b/pandas/tests/frame/methods/test_set_index.py
index 5f62697cc3e43..ebe7eabd53b46 100644
--- a/pandas/tests/frame/methods/test_set_index.py
+++ b/pandas/tests/frame/methods/test_set_index.py
@@ -137,7 +137,8 @@ def test_set_index_drop_inplace(self, frame_of_index_cols, drop, inplace, keys):
if inplace:
result = df.copy()
- result.set_index(keys, drop=drop, inplace=True)
+ return_value = result.set_index(keys, drop=drop, inplace=True)
+ assert return_value is None
else:
result = df.set_index(keys, drop=drop)
diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py
index 9ec029a6c4304..8f6902eca816f 100644
--- a/pandas/tests/frame/methods/test_shift.py
+++ b/pandas/tests/frame/methods/test_shift.py
@@ -145,6 +145,33 @@ def test_shift_duplicate_columns(self):
tm.assert_frame_equal(shifted[0], shifted[1])
tm.assert_frame_equal(shifted[0], shifted[2])
+ def test_shift_axis1_multiple_blocks(self):
+ # GH#35488
+ df1 = pd.DataFrame(np.random.randint(1000, size=(5, 3)))
+ df2 = pd.DataFrame(np.random.randint(1000, size=(5, 2)))
+ df3 = pd.concat([df1, df2], axis=1)
+ assert len(df3._mgr.blocks) == 2
+
+ result = df3.shift(2, axis=1)
+
+ expected = df3.take([-1, -1, 0, 1, 2], axis=1)
+ expected.iloc[:, :2] = np.nan
+ expected.columns = df3.columns
+
+ tm.assert_frame_equal(result, expected)
+
+ # Case with periods < 0
+ # rebuild df3 because the `take` call above consolidated its blocks in place
+ df3 = pd.concat([df1, df2], axis=1)
+ assert len(df3._mgr.blocks) == 2
+ result = df3.shift(-2, axis=1)
+
+ expected = df3.take([2, 3, 4, -1, -1], axis=1)
+ expected.iloc[:, -2:] = np.nan
+ expected.columns = df3.columns
+
+ tm.assert_frame_equal(result, expected)
+
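The comments in test_shift_axis1_multiple_blocks lean on a BlockManager detail: pd.concat(axis=1) keeps one block per input, while DataFrame.take (in the pandas version this patch targets) consolidates same-dtype blocks in place as a side effect. A minimal sketch using the private _mgr attribute the test itself inspects, so this is internals and version-dependent:

import numpy as np
import pandas as pd

df = pd.concat([pd.DataFrame(np.ones((2, 2))), pd.DataFrame(np.zeros((2, 1)))], axis=1)
print(len(df._mgr.blocks))  # 2: concat does not consolidate
df.take([0, 1, 2], axis=1)  # consolidates df in place before taking
print(len(df._mgr.blocks))  # 1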
@pytest.mark.filterwarnings("ignore:tshift is deprecated:FutureWarning")
def test_tshift(self, datetime_frame):
# TODO: remove this test when tshift deprecation is enforced
diff --git a/pandas/tests/frame/methods/test_sort_index.py b/pandas/tests/frame/methods/test_sort_index.py
index 543d87485d3c4..a106702aff807 100644
--- a/pandas/tests/frame/methods/test_sort_index.py
+++ b/pandas/tests/frame/methods/test_sort_index.py
@@ -218,25 +218,29 @@ def test_sort_index_inplace(self):
unordered = frame.loc[[3, 2, 4, 1]]
a_id = id(unordered["A"])
df = unordered.copy()
- df.sort_index(inplace=True)
+ return_value = df.sort_index(inplace=True)
+ assert return_value is None
expected = frame
tm.assert_frame_equal(df, expected)
assert a_id != id(df["A"])
df = unordered.copy()
- df.sort_index(ascending=False, inplace=True)
+ return_value = df.sort_index(ascending=False, inplace=True)
+ assert return_value is None
expected = frame[::-1]
tm.assert_frame_equal(df, expected)
# axis=1
unordered = frame.loc[:, ["D", "B", "C", "A"]]
df = unordered.copy()
- df.sort_index(axis=1, inplace=True)
+ return_value = df.sort_index(axis=1, inplace=True)
+ assert return_value is None
expected = frame
tm.assert_frame_equal(df, expected)
df = unordered.copy()
- df.sort_index(axis=1, ascending=False, inplace=True)
+ return_value = df.sort_index(axis=1, ascending=False, inplace=True)
+ assert return_value is None
expected = frame.iloc[:, ::-1]
tm.assert_frame_equal(df, expected)
@@ -551,8 +555,8 @@ def test_sort_index_and_reconstruction(self):
),
)
- df.columns.set_levels(
- pd.to_datetime(df.columns.levels[1]), level=1, inplace=True
+ df.columns = df.columns.set_levels(
+ pd.to_datetime(df.columns.levels[1]), level=1
)
assert not df.columns.is_lexsorted()
assert not df.columns.is_monotonic
@@ -589,7 +593,8 @@ def test_sort_index_level2(self):
# inplace
rs = frame.copy()
- rs.sort_index(level=0, inplace=True)
+ return_value = rs.sort_index(level=0, inplace=True)
+ assert return_value is None
tm.assert_frame_equal(rs, frame.sort_index(level=0))
def test_sort_index_level_large_cardinality(self):
@@ -734,3 +739,18 @@ def test_changes_length_raises(self):
df = pd.DataFrame({"A": [1, 2, 3]})
with pytest.raises(ValueError, match="change the shape"):
df.sort_index(key=lambda x: x[:1])
+
+ def test_sort_index_multiindex_sparse_column(self):
+ # GH 29735, testing that sort_index on a multiindexed frame with sparse
+ # columns fills with 0.
+ expected = pd.DataFrame(
+ {
+ i: pd.array([0.0, 0.0, 0.0, 0.0], dtype=pd.SparseDtype("float64", 0.0))
+ for i in range(0, 4)
+ },
+ index=pd.MultiIndex.from_product([[1, 2], [1, 2]]),
+ )
+
+ result = expected.sort_index(level=0)
+
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py
index 1275da01eace9..0ca232ec433e7 100644
--- a/pandas/tests/frame/methods/test_sort_values.py
+++ b/pandas/tests/frame/methods/test_sort_values.py
@@ -77,22 +77,28 @@ def test_sort_values_inplace(self):
)
sorted_df = frame.copy()
- sorted_df.sort_values(by="A", inplace=True)
+ return_value = sorted_df.sort_values(by="A", inplace=True)
+ assert return_value is None
expected = frame.sort_values(by="A")
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
- sorted_df.sort_values(by=1, axis=1, inplace=True)
+ return_value = sorted_df.sort_values(by=1, axis=1, inplace=True)
+ assert return_value is None
expected = frame.sort_values(by=1, axis=1)
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
- sorted_df.sort_values(by="A", ascending=False, inplace=True)
+ return_value = sorted_df.sort_values(by="A", ascending=False, inplace=True)
+ assert return_value is None
expected = frame.sort_values(by="A", ascending=False)
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
- sorted_df.sort_values(by=["A", "B"], ascending=False, inplace=True)
+ return_value = sorted_df.sort_values(
+ by=["A", "B"], ascending=False, inplace=True
+ )
+ assert return_value is None
expected = frame.sort_values(by=["A", "B"], ascending=False)
tm.assert_frame_equal(sorted_df, expected)
@@ -544,17 +550,24 @@ def test_sort_values_inplace_key(self, sort_by_key):
)
sorted_df = frame.copy()
- sorted_df.sort_values(by="A", inplace=True, key=sort_by_key)
+ return_value = sorted_df.sort_values(by="A", inplace=True, key=sort_by_key)
+ assert return_value is None
expected = frame.sort_values(by="A", key=sort_by_key)
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
- sorted_df.sort_values(by=1, axis=1, inplace=True, key=sort_by_key)
+ return_value = sorted_df.sort_values(
+ by=1, axis=1, inplace=True, key=sort_by_key
+ )
+ assert return_value is None
expected = frame.sort_values(by=1, axis=1, key=sort_by_key)
tm.assert_frame_equal(sorted_df, expected)
sorted_df = frame.copy()
- sorted_df.sort_values(by="A", ascending=False, inplace=True, key=sort_by_key)
+ return_value = sorted_df.sort_values(
+ by="A", ascending=False, inplace=True, key=sort_by_key
+ )
+ assert return_value is None
expected = frame.sort_values(by="A", ascending=False, key=sort_by_key)
tm.assert_frame_equal(sorted_df, expected)
@@ -678,3 +691,23 @@ def test_sort_values_key_dict_axis(self):
result = df.sort_values(1, key=lambda col: -col, axis=1)
expected = df.loc[:, ::-1]
tm.assert_frame_equal(result, expected)
+
+ @pytest.mark.parametrize("ordered", [True, False])
+ def test_sort_values_key_casts_to_categorical(self, ordered):
+ # https://github.com/pandas-dev/pandas/issues/36383
+ categories = ["c", "b", "a"]
+ df = pd.DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]})
+
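+        # the key function is applied column-by-column; only "y" is cast to a
+        # Categorical whose categories reverse the lexical sort order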
+ def sorter(key):
+ if key.name == "y":
+ return pd.Series(
+ pd.Categorical(key, categories=categories, ordered=ordered)
+ )
+ return key
+
+ result = df.sort_values(by=["x", "y"], key=sorter)
+ expected = pd.DataFrame(
+ {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0])
+ )
+
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
index db8bb5ca3c437..f21b1d3dfe487 100644
--- a/pandas/tests/frame/test_analytics.py
+++ b/pandas/tests/frame/test_analytics.py
@@ -86,11 +86,7 @@ def wrapper(x):
result0 = f(axis=0, skipna=False)
result1 = f(axis=1, skipna=False)
tm.assert_series_equal(
- result0,
- frame.apply(wrapper),
- check_dtype=check_dtype,
- rtol=rtol,
- atol=atol,
+ result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol
)
# HACK: win32
tm.assert_series_equal(
@@ -116,7 +112,7 @@ def wrapper(x):
if opname in ["sum", "prod"]:
expected = frame.apply(skipna_wrapper, axis=1)
tm.assert_series_equal(
- result1, expected, check_dtype=False, rtol=rtol, atol=atol,
+ result1, expected, check_dtype=False, rtol=rtol, atol=atol
)
# check dtypes
@@ -287,7 +283,7 @@ def test_stat_op_api(self, float_frame, float_string_frame):
assert_stat_op_api("median", float_frame, float_string_frame)
try:
- from scipy.stats import skew, kurtosis # noqa:F401
+ from scipy.stats import kurtosis, skew # noqa:F401
assert_stat_op_api("skew", float_frame, float_string_frame)
assert_stat_op_api("kurt", float_frame, float_string_frame)
@@ -370,7 +366,7 @@ def kurt(x):
)
try:
- from scipy import skew, kurtosis # noqa:F401
+ from scipy import kurtosis, skew # noqa:F401
assert_stat_op_calc("skew", skewness, float_frame_with_na)
assert_stat_op_calc("kurt", kurt, float_frame_with_na)
@@ -1064,54 +1060,14 @@ def test_any_all_bool_only(self):
(np.any, {"A": pd.Series([0.0, 1.0], dtype="float")}, True),
(np.all, {"A": pd.Series([0, 1], dtype=int)}, False),
(np.any, {"A": pd.Series([0, 1], dtype=int)}, True),
- pytest.param(
- np.all,
- {"A": pd.Series([0, 1], dtype="M8[ns]")},
- False,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.any,
- {"A": pd.Series([0, 1], dtype="M8[ns]")},
- True,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.all,
- {"A": pd.Series([1, 2], dtype="M8[ns]")},
- True,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.any,
- {"A": pd.Series([1, 2], dtype="M8[ns]")},
- True,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.all,
- {"A": pd.Series([0, 1], dtype="m8[ns]")},
- False,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.any,
- {"A": pd.Series([0, 1], dtype="m8[ns]")},
- True,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.all,
- {"A": pd.Series([1, 2], dtype="m8[ns]")},
- True,
- marks=[td.skip_if_np_lt("1.15")],
- ),
- pytest.param(
- np.any,
- {"A": pd.Series([1, 2], dtype="m8[ns]")},
- True,
- marks=[td.skip_if_np_lt("1.15")],
- ),
+ pytest.param(np.all, {"A": pd.Series([0, 1], dtype="M8[ns]")}, False,),
+ pytest.param(np.any, {"A": pd.Series([0, 1], dtype="M8[ns]")}, True,),
+ pytest.param(np.all, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True,),
+ pytest.param(np.any, {"A": pd.Series([1, 2], dtype="M8[ns]")}, True,),
+ pytest.param(np.all, {"A": pd.Series([0, 1], dtype="m8[ns]")}, False,),
+ pytest.param(np.any, {"A": pd.Series([0, 1], dtype="m8[ns]")}, True,),
+ pytest.param(np.all, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True,),
+ pytest.param(np.any, {"A": pd.Series([1, 2], dtype="m8[ns]")}, True,),
(np.all, {"A": pd.Series([0, 1], dtype="category")}, False),
(np.any, {"A": pd.Series([0, 1], dtype="category")}, True),
(np.all, {"A": pd.Series([1, 2], dtype="category")}, True),
@@ -1124,8 +1080,6 @@ def test_any_all_bool_only(self):
"B": pd.Series([10, 20], dtype="m8[ns]"),
},
True,
- # In 1.13.3 and 1.14 np.all(df) returns a Timedelta here
- marks=[td.skip_if_np_lt("1.15")],
),
],
)
@@ -1303,3 +1257,26 @@ def test_preserve_timezone(self, initial: str, method):
df = DataFrame([expected])
result = getattr(df, method)(axis=1)
tm.assert_series_equal(result, expected)
+
+
+def test_mixed_frame_with_integer_sum():
+ # https://github.com/pandas-dev/pandas/issues/34520
+ df = pd.DataFrame([["a", 1]], columns=list("ab"))
+ df = df.astype({"b": "Int64"})
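+    # summing a frame that mixes object and nullable Int64 columns should not raise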
+ result = df.sum()
+ expected = pd.Series(["a", 1], index=["a", "b"])
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("numeric_only", [True, False, None])
+@pytest.mark.parametrize("method", ["min", "max"])
+def test_minmax_extensionarray(method, numeric_only):
+ # https://github.com/pandas-dev/pandas/issues/32651
+ int64_info = np.iinfo("int64")
+ ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
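+    # the None becomes pd.NA, which min/max should skip by default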
+ df = DataFrame({"Int64": ser})
+ result = getattr(df, method)(numeric_only=numeric_only)
+ expected = Series(
+ [getattr(int64_info, method)], index=pd.Index(["Int64"], dtype="object")
+ )
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/frame/test_api.py b/pandas/tests/frame/test_api.py
index 2b79fc8cd3406..8b5d0c7ade56c 100644
--- a/pandas/tests/frame/test_api.py
+++ b/pandas/tests/frame/test_api.py
@@ -6,11 +6,12 @@
import numpy as np
import pytest
-from pandas.compat import PY37
+from pandas.compat import IS64, is_platform_windows
+import pandas.util._test_decorators as td
from pandas.util._test_decorators import async_mark, skip_if_no
import pandas as pd
-from pandas import Categorical, DataFrame, Series, compat, date_range, timedelta_range
+from pandas import Categorical, DataFrame, Series, date_range, timedelta_range
import pandas._testing as tm
@@ -254,7 +255,7 @@ def test_itertuples(self, float_frame):
assert list(dfaa.itertuples()) == [(0, 1, 1), (1, 2, 2), (2, 3, 3)]
# repr with int on 32-bit/windows
- if not (compat.is_platform_windows() or compat.is_platform_32bit()):
+ if not (is_platform_windows() or not IS64):
assert (
repr(list(df.itertuples(name=None)))
== "[(0, 1, 4), (1, 2, 5), (2, 3, 6)]"
@@ -274,10 +275,7 @@ def test_itertuples(self, float_frame):
# will raise SyntaxError if trying to create namedtuple
tup3 = next(df3.itertuples())
assert isinstance(tup3, tuple)
- if PY37:
- assert hasattr(tup3, "_fields")
- else:
- assert not hasattr(tup3, "_fields")
+ assert hasattr(tup3, "_fields")
# GH 28282
df_254_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(254)}])
@@ -288,12 +286,7 @@ def test_itertuples(self, float_frame):
df_255_columns = DataFrame([{f"foo_{i}": f"bar_{i}" for i in range(255)}])
result_255_columns = next(df_255_columns.itertuples(index=False))
assert isinstance(result_255_columns, tuple)
-
- # Dataframes with >=255 columns will fallback to regular tuples on python < 3.7
- if PY37:
- assert hasattr(result_255_columns, "_fields")
- else:
- assert not hasattr(result_255_columns, "_fields")
+ assert hasattr(result_255_columns, "_fields")
def test_sequence_like_with_categorical(self):
@@ -367,6 +360,13 @@ def test_to_numpy_copy(self):
assert df.to_numpy(copy=False).base is arr
assert df.to_numpy(copy=True).base is not arr
+ def test_to_numpy_mixed_dtype_to_str(self):
+ # https://github.com/pandas-dev/pandas/issues/35455
+ df = pd.DataFrame([[pd.Timestamp("2020-01-01 00:00:00"), 100.0]])
+ result = df.to_numpy(dtype=str)
+ expected = np.array([["2020-01-01 00:00:00", "100.0"]], dtype=str)
+ tm.assert_numpy_array_equal(result, expected)
+
def test_swapaxes(self):
df = DataFrame(np.random.randn(10, 5))
tm.assert_frame_equal(df.T, df.swapaxes(0, 1))
@@ -523,6 +523,7 @@ def _check_f(base, f):
_check_f(d.copy(), f)
@async_mark()
+ @td.check_file_leaks
async def test_tab_complete_warning(self, ip):
# GH 16409
pytest.importorskip("IPython", minversion="6.0.0")
@@ -553,6 +554,33 @@ def test_attrs(self):
result = df.rename(columns=str)
assert result.attrs == {"version": 1}
+ @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
+ def test_set_flags(self, allows_duplicate_labels):
+ df = pd.DataFrame({"A": [1, 2]})
+ result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels)
+ if allows_duplicate_labels is None:
+ # We don't update when it's not provided
+ assert result.flags.allows_duplicate_labels is True
+ else:
+ assert result.flags.allows_duplicate_labels is allows_duplicate_labels
+
+ # We made a copy
+ assert df is not result
+
+ # We didn't mutate df
+ assert df.flags.allows_duplicate_labels is True
+
+ # But we didn't copy data
+ result.iloc[0, 0] = 0
+ assert df.iloc[0, 0] == 0
+
+ # Now we do copy.
+ result = df.set_flags(
+ copy=True, allows_duplicate_labels=allows_duplicate_labels
+ )
+ result.iloc[0, 0] = 10
+ assert df.iloc[0, 0] == 0
+
def test_cache_on_copy(self):
# GH 31784 _item_cache not cleared on copy causes incorrect reads after updates
df = DataFrame({"a": [1]})
diff --git a/pandas/tests/frame/test_arithmetic.py b/pandas/tests/frame/test_arithmetic.py
index a6b0ece58b095..6dd8d890e8a4b 100644
--- a/pandas/tests/frame/test_arithmetic.py
+++ b/pandas/tests/frame/test_arithmetic.py
@@ -11,7 +11,7 @@
from pandas import DataFrame, MultiIndex, Series
import pandas._testing as tm
import pandas.core.common as com
-from pandas.core.computation.expressions import _MIN_ELEMENTS, _NUMEXPR_INSTALLED
+from pandas.core.computation.expressions import _MIN_ELEMENTS, NUMEXPR_INSTALLED
from pandas.tests.frame.common import _check_mixed_float, _check_mixed_int
# -------------------------------------------------------------------
@@ -375,7 +375,7 @@ def test_floordiv_axis0(self):
result2 = df.floordiv(ser.values, axis=0)
tm.assert_frame_equal(result2, expected)
- @pytest.mark.skipif(not _NUMEXPR_INSTALLED, reason="numexpr not installed")
+ @pytest.mark.skipif(not NUMEXPR_INSTALLED, reason="numexpr not installed")
@pytest.mark.parametrize("opname", ["floordiv", "pow"])
def test_floordiv_axis0_numexpr_path(self, opname):
# case that goes through numexpr and has to fall back to masked_arith_op
@@ -1417,7 +1417,7 @@ def test_alignment_non_pandas(self):
columns = ["X", "Y", "Z"]
df = pd.DataFrame(np.random.randn(3, 3), index=index, columns=columns)
- align = pd.core.ops._align_method_FRAME
+ align = pd.core.ops.align_method_FRAME
for val in [
[1, 2, 3],
(1, 2, 3),
@@ -1552,3 +1552,12 @@ def test_dataframe_operation_with_non_numeric_types(df, col_dtype):
expected = expected.astype({"b": col_dtype})
result = df + pd.Series([-1.0], index=list("a"))
tm.assert_frame_equal(result, expected)
+
+
+def test_arith_reindex_with_duplicates():
+ # https://github.com/pandas-dev/pandas/issues/35194
+ df1 = pd.DataFrame(data=[[0]], columns=["second"])
+ df2 = pd.DataFrame(data=[[0, 0, 0]], columns=["first", "second", "second"])
+ result = df1 + df2
+ expected = pd.DataFrame([[np.nan, 0, 0]], columns=["first", "second", "second"])
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_block_internals.py b/pandas/tests/frame/test_block_internals.py
index d5554860c034d..4a85da72bc8b1 100644
--- a/pandas/tests/frame/test_block_internals.py
+++ b/pandas/tests/frame/test_block_internals.py
@@ -64,7 +64,8 @@ def test_consolidate(self, float_frame):
float_frame["F"] = 8.0
assert len(float_frame._mgr.blocks) == 3
- float_frame._consolidate(inplace=True)
+ return_value = float_frame._consolidate(inplace=True)
+ assert return_value is None
assert len(float_frame._mgr.blocks) == 1
def test_consolidate_inplace(self, float_frame):
@@ -625,3 +626,58 @@ def test_add_column_with_pandas_array(self):
assert type(df["c"]._mgr.blocks[0]) == ObjectBlock
assert type(df2["c"]._mgr.blocks[0]) == ObjectBlock
tm.assert_frame_equal(df, df2)
+
+
+def test_to_dict_of_blocks_item_cache():
+ # Calling to_dict_of_blocks should not poison item_cache
+ df = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
+ df["c"] = pd.arrays.PandasArray(np.array([1, 2, None, 3], dtype=object))
+ mgr = df._mgr
+ assert len(mgr.blocks) == 3 # i.e. not consolidated
+
+ ser = df["b"] # populations item_cache["b"]
+
+ df._to_dict_of_blocks()
+
+    # Check that to_dict_of_blocks didn't break the link between ser and df
+ ser.values[0] = "foo"
+ assert df.loc[0, "b"] == "foo"
+
+ assert df["b"] is ser
+
+
+def test_update_inplace_sets_valid_block_values():
+ # https://github.com/pandas-dev/pandas/issues/33457
+ df = pd.DataFrame({"a": pd.Series([1, 2, None], dtype="category")})
+
+ # inplace update of a single column
+ df["a"].fillna(1, inplace=True)
+
+    # check we haven't put a Series into any block.values
+ assert isinstance(df._mgr.blocks[0].values, pd.Categorical)
+
+    # smoke test for the original bug reported in GH#35731
+ assert df.isnull().sum().sum() == 0
+
+
+def test_nonconsolidated_item_cache_take():
+ # https://github.com/pandas-dev/pandas/issues/35521
+
+ # create non-consolidated dataframe with object dtype columns
+ df = pd.DataFrame()
+ df["col1"] = pd.Series(["a"], dtype=object)
+ df["col2"] = pd.Series([0], dtype=object)
+
+    # access the column to populate the item cache; the comparison result is
+    # intentionally discarded
+    df["col1"] == "A"
+ # take operation
+ # (regression was that this consolidated but didn't reset item cache,
+ # resulting in an invalid cache and the .at operation not working properly)
+ df[df["col2"] == 0]
+
+    # now setting a value should update the actual dataframe
+ df.at[0, "col1"] = "A"
+
+ expected = pd.DataFrame({"col1": ["A"], "col2": [0]}, dtype=object)
+ tm.assert_frame_equal(df, expected)
+ assert df.at[0, "col1"] == "A"
diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py
index 02a871666c78d..b5e211895672a 100644
--- a/pandas/tests/frame/test_constructors.py
+++ b/pandas/tests/frame/test_constructors.py
@@ -10,17 +10,21 @@
import pytest
import pytz
-from pandas.compat import PY37, is_platform_little_endian
-from pandas.compat.numpy import _is_numpy_dev
+from pandas.compat import is_platform_little_endian
+from pandas.compat.numpy import _np_version_under1p19
from pandas.core.dtypes.common import is_integer_dtype
+from pandas.core.dtypes.dtypes import DatetimeTZDtype, IntervalDtype, PeriodDtype
import pandas as pd
from pandas import (
Categorical,
+ CategoricalIndex,
DataFrame,
Index,
+ Interval,
MultiIndex,
+ Period,
RangeIndex,
Series,
Timedelta,
@@ -67,7 +71,7 @@ def test_series_with_name_not_matching_column(self):
lambda: DataFrame({}),
lambda: DataFrame(()),
lambda: DataFrame([]),
- lambda: DataFrame((_ for _ in [])),
+ lambda: DataFrame(_ for _ in []),
lambda: DataFrame(range(0)),
lambda: DataFrame(data=None),
lambda: DataFrame(data={}),
@@ -147,14 +151,20 @@ def test_constructor_dtype_list_data(self):
assert df.loc[1, 0] is None
assert df.loc[0, 1] == "2"
- @pytest.mark.xfail(_is_numpy_dev, reason="Interprets list of frame as 3D")
- def test_constructor_list_frames(self):
- # see gh-3243
- result = DataFrame([DataFrame()])
- assert result.shape == (1, 0)
+ @pytest.mark.skipif(_np_version_under1p19, reason="NumPy change.")
+ def test_constructor_list_of_2d_raises(self):
+ # https://github.com/pandas-dev/pandas/issues/32289
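+        # a list containing 2D objects would produce a 3D array, which
+        # DataFrame rejects rather than nesting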
+ a = pd.DataFrame()
+ b = np.empty((0, 0))
+ with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"):
+ pd.DataFrame([a])
- result = DataFrame([DataFrame(dict(A=np.arange(5)))])
- assert isinstance(result.iloc[0, 0], DataFrame)
+ with pytest.raises(ValueError, match=r"shape=\(1, 0, 0\)"):
+ pd.DataFrame([b])
+
+ a = pd.DataFrame({"A": [1, 2]})
+ with pytest.raises(ValueError, match=r"shape=\(2, 2, 1\)"):
+ pd.DataFrame([a, a])
def test_constructor_mixed_dtypes(self):
def _make_mixed_dtypes_df(typ, ad=None):
@@ -507,22 +517,6 @@ def test_constructor_error_msgs(self):
with pytest.raises(ValueError, match=msg):
DataFrame({"a": False, "b": True})
- @pytest.mark.xfail(_is_numpy_dev, reason="Interprets embedded frame as 3D")
- def test_constructor_with_embedded_frames(self):
-
- # embedded data frames
- df1 = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
- df2 = DataFrame([df1, df1 + 10])
-
- df2.dtypes
- str(df2)
-
- result = df2.loc[0, 0]
- tm.assert_frame_equal(result, df1)
-
- result = df2.loc[1, 0]
- tm.assert_frame_equal(result, df1 + 10)
-
def test_constructor_subclass_dict(self, float_frame, dict_subclass):
# Test for passing dict subclass to constructor
data = {
@@ -710,7 +704,7 @@ def create_data(constructor):
tm.assert_frame_equal(result_timedelta, expected)
tm.assert_frame_equal(result_Timedelta, expected)
- def test_constructor_period(self):
+ def test_constructor_period_dict(self):
# PeriodIndex
a = pd.PeriodIndex(["2012-01", "NaT", "2012-04"], freq="M")
b = pd.PeriodIndex(["2012-02-01", "2012-03-01", "NaT"], freq="D")
@@ -723,6 +717,47 @@ def test_constructor_period(self):
assert df["a"].dtype == a.dtype
assert df["b"].dtype == b.dtype
+ @pytest.mark.parametrize(
+ "data,dtype",
+ [
+ (pd.Period("2012-01", freq="M"), "period[M]"),
+ (pd.Period("2012-02-01", freq="D"), "period[D]"),
+ (Interval(left=0, right=5), IntervalDtype("int64")),
+ (Interval(left=0.1, right=0.5), IntervalDtype("float64")),
+ ],
+ )
+ def test_constructor_period_dict_scalar(self, data, dtype):
+ # scalar periods
+ df = DataFrame({"a": data}, index=[0])
+ assert df["a"].dtype == dtype
+
+ expected = DataFrame(index=[0], columns=["a"], data=data)
+
+ tm.assert_frame_equal(df, expected)
+
+ @pytest.mark.parametrize(
+ "data,dtype",
+ [
+ (Period("2020-01"), PeriodDtype("M")),
+ (Interval(left=0, right=5), IntervalDtype("int64")),
+ (
+ Timestamp("2011-01-01", tz="US/Eastern"),
+ DatetimeTZDtype(tz="US/Eastern"),
+ ),
+ ],
+ )
+ def test_constructor_extension_scalar_data(self, data, dtype):
+ # GH 34832
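+        # the extension-dtype scalar should be broadcast to both columns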
+ df = DataFrame(index=[0, 1], columns=["a", "b"], data=data)
+
+ assert df["a"].dtype == dtype
+ assert df["b"].dtype == dtype
+
+ arr = pd.array([data] * 2, dtype=dtype)
+ expected = DataFrame({"a": arr, "b": arr})
+
+ tm.assert_frame_equal(df, expected)
+
def test_nested_dict_frame_constructor(self):
rng = pd.period_range("1/1/2000", periods=5)
df = DataFrame(np.random.randn(10, 5), columns=rng)
@@ -915,7 +950,7 @@ def test_constructor_mrecarray(self):
# from GH3479
assert_fr_equal = functools.partial(
- tm.assert_frame_equal, check_index_type=True, check_column_type=True,
+ tm.assert_frame_equal, check_index_type=True, check_column_type=True
)
arrays = [
("float", np.array([1.5, 2.0])),
@@ -1188,6 +1223,15 @@ def test_constructor_list_of_odicts(self):
expected = DataFrame(index=[0])
tm.assert_frame_equal(result, expected)
+ def test_constructor_single_row(self):
+ data = [OrderedDict([["a", 1.5], ["b", 3], ["c", 4], ["d", 6]])]
+
+ result = DataFrame(data)
+ expected = DataFrame.from_dict(dict(zip([0], data)), orient="index").reindex(
+ result.index
+ )
+ tm.assert_frame_equal(result, expected)
+
def test_constructor_ordered_dict_preserve_order(self):
# see gh-13304
expected = DataFrame([[2, 1]], columns=["b", "a"])
@@ -1392,7 +1436,6 @@ def test_constructor_list_of_namedtuples(self):
result = DataFrame(tuples, columns=["y", "z"])
tm.assert_frame_equal(result, expected)
- @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7")
def test_constructor_list_of_dataclasses(self):
# GH21910
from dataclasses import make_dataclass
@@ -1404,7 +1447,6 @@ def test_constructor_list_of_dataclasses(self):
result = DataFrame(datas)
tm.assert_frame_equal(result, expected)
- @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7")
def test_constructor_list_of_dataclasses_with_varying_types(self):
# GH21910
from dataclasses import make_dataclass
@@ -1421,7 +1463,6 @@ def test_constructor_list_of_dataclasses_with_varying_types(self):
result = DataFrame(datas)
tm.assert_frame_equal(result, expected)
- @pytest.mark.skipif(not PY37, reason="Requires Python >= 3.7")
def test_constructor_list_of_dataclasses_error_thrown(self):
# GH21910
from dataclasses import make_dataclass
@@ -1503,16 +1544,17 @@ def test_from_dict_columns_parameter(self):
)
@pytest.mark.parametrize(
- "data_dict, keys",
+ "data_dict, keys, orient",
[
- ([{("a",): 1}, {("a",): 2}], [("a",)]),
- ([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)]),
- ([{("a", "b"): 1}], [("a", "b")]),
+ ({}, [], "index"),
+ ([{("a",): 1}, {("a",): 2}], [("a",)], "columns"),
+ ([OrderedDict([(("a",), 1), (("b",), 2)])], [("a",), ("b",)], "columns"),
+ ([{("a", "b"): 1}], [("a", "b")], "columns"),
],
)
- def test_constructor_from_dict_tuples(self, data_dict, keys):
+ def test_constructor_from_dict_tuples(self, data_dict, keys, orient):
# GH 16769
- df = DataFrame.from_dict(data_dict)
+ df = DataFrame.from_dict(data_dict, orient)
result = df.columns
expected = Index(keys, dtype="object", tupleize_cols=False)
@@ -1591,6 +1633,42 @@ def test_constructor_Series_differently_indexed(self):
tm.assert_index_equal(df2.index, other_index)
tm.assert_frame_equal(df2, exp2)
+ @pytest.mark.parametrize(
+ "name_in1,name_in2,name_in3,name_out",
+ [
+ ("idx", "idx", "idx", "idx"),
+ ("idx", "idx", None, "idx"),
+ ("idx", None, None, "idx"),
+ ("idx1", "idx2", None, None),
+ ("idx1", "idx1", "idx2", None),
+ ("idx1", "idx2", "idx3", None),
+ (None, None, None, None),
+ ],
+ )
+ def test_constructor_index_names(self, name_in1, name_in2, name_in3, name_out):
+ # GH13475
+ indices = [
+ pd.Index(["a", "b", "c"], name=name_in1),
+ pd.Index(["b", "c", "d"], name=name_in2),
+ pd.Index(["c", "d", "e"], name=name_in3),
+ ]
+ series = {
+ c: pd.Series([0, 1, 2], index=i) for i, c in zip(indices, ["x", "y", "z"])
+ }
+ result = pd.DataFrame(series)
+
+ exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out)
+ expected = pd.DataFrame(
+ {
+ "x": [0, 1, 2, np.nan, np.nan],
+ "y": [np.nan, 0, 1, 2, np.nan],
+ "z": [np.nan, np.nan, 0, 1, 2],
+ },
+ index=exp_ind,
+ )
+
+ tm.assert_frame_equal(result, expected)
+
def test_constructor_manager_resize(self, float_frame):
index = list(float_frame.index[:5])
columns = list(float_frame.columns[:3])
@@ -2483,6 +2561,18 @@ def test_from_records_series_list_dict(self):
result = DataFrame.from_records(data)
tm.assert_frame_equal(result, expected)
+ def test_from_records_series_categorical_index(self):
+ # GH 32805
+ index = CategoricalIndex(
+ [pd.Interval(-20, -10), pd.Interval(-10, 0), pd.Interval(0, 10)]
+ )
+ series_of_dicts = pd.Series([{"a": 1}, {"a": 2}, {"b": 3}], index=index)
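+        # keys missing from a given record should become NaN in that row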
+ frame = pd.DataFrame.from_records(series_of_dicts, index=index)
+ expected = DataFrame(
+ {"a": [1, 2, np.NaN], "b": [np.NaN, np.NaN, 3]}, index=index
+ )
+ tm.assert_frame_equal(frame, expected)
+
def test_frame_from_records_utc(self):
rec = {"datum": 1.5, "begin_time": datetime(2006, 4, 27, tzinfo=pytz.utc)}
@@ -2498,6 +2588,7 @@ def test_to_frame_with_falsey_names(self):
result = DataFrame(Series(name=0, dtype=object)).dtypes
tm.assert_series_equal(result, expected)
+ @pytest.mark.arm_slow
@pytest.mark.parametrize("dtype", [None, "uint8", "category"])
def test_constructor_range_dtype(self, dtype):
expected = DataFrame({"A": [0, 1, 2, 3, 4]}, dtype=dtype or "int64")
@@ -2580,6 +2671,12 @@ class DatetimeSubclass(datetime):
data = pd.DataFrame({"datetime": [DatetimeSubclass(2020, 1, 1, 1, 1)]})
assert data.datetime.dtype == "datetime64[ns]"
+ def test_with_mismatched_index_length_raises(self):
+ # GH#33437
+ dti = pd.date_range("2016-01-01", periods=3, tz="US/Pacific")
+ with pytest.raises(ValueError, match="Shape of passed values"):
+ DataFrame(dti, index=range(4))
+
class TestDataFrameConstructorWithDatetimeTZ:
def test_from_dict(self):
diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py
index 7cb7115276f71..b4f91590e09d1 100644
--- a/pandas/tests/frame/test_missing.py
+++ b/pandas/tests/frame/test_missing.py
@@ -24,14 +24,16 @@ def test_dropEmptyRows(self, float_frame):
smaller_frame = frame.dropna(how="all")
# check that original was preserved
tm.assert_series_equal(frame["foo"], original)
- inplace_frame1.dropna(how="all", inplace=True)
+ return_value = inplace_frame1.dropna(how="all", inplace=True)
tm.assert_series_equal(smaller_frame["foo"], expected)
tm.assert_series_equal(inplace_frame1["foo"], expected)
+ assert return_value is None
smaller_frame = frame.dropna(how="all", subset=["foo"])
- inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
+ return_value = inplace_frame2.dropna(how="all", subset=["foo"], inplace=True)
tm.assert_series_equal(smaller_frame["foo"], expected)
tm.assert_series_equal(inplace_frame2["foo"], expected)
+ assert return_value is None
def test_dropIncompleteRows(self, float_frame):
N = len(float_frame.index)
@@ -45,18 +47,20 @@ def test_dropIncompleteRows(self, float_frame):
smaller_frame = frame.dropna()
tm.assert_series_equal(frame["foo"], original)
- inp_frame1.dropna(inplace=True)
+ return_value = inp_frame1.dropna(inplace=True)
exp = Series(mat[5:], index=float_frame.index[5:], name="foo")
tm.assert_series_equal(smaller_frame["foo"], exp)
tm.assert_series_equal(inp_frame1["foo"], exp)
+ assert return_value is None
samesize_frame = frame.dropna(subset=["bar"])
tm.assert_series_equal(frame["foo"], original)
assert (frame["bar"] == 5).all()
- inp_frame2.dropna(subset=["bar"], inplace=True)
+ return_value = inp_frame2.dropna(subset=["bar"], inplace=True)
tm.assert_index_equal(samesize_frame.index, float_frame.index)
tm.assert_index_equal(inp_frame2.index, float_frame.index)
+ assert return_value is None
def test_dropna(self):
df = DataFrame(np.random.randn(6, 4))
@@ -65,31 +69,35 @@ def test_dropna(self):
dropped = df.dropna(axis=1)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
- inp.dropna(axis=1, inplace=True)
+ return_value = inp.dropna(axis=1, inplace=True)
tm.assert_frame_equal(dropped, expected)
tm.assert_frame_equal(inp, expected)
+ assert return_value is None
dropped = df.dropna(axis=0)
expected = df.loc[list(range(2, 6))]
inp = df.copy()
- inp.dropna(axis=0, inplace=True)
+ return_value = inp.dropna(axis=0, inplace=True)
tm.assert_frame_equal(dropped, expected)
tm.assert_frame_equal(inp, expected)
+ assert return_value is None
# threshold
dropped = df.dropna(axis=1, thresh=5)
expected = df.loc[:, [0, 1, 3]]
inp = df.copy()
- inp.dropna(axis=1, thresh=5, inplace=True)
+ return_value = inp.dropna(axis=1, thresh=5, inplace=True)
tm.assert_frame_equal(dropped, expected)
tm.assert_frame_equal(inp, expected)
+ assert return_value is None
dropped = df.dropna(axis=0, thresh=4)
expected = df.loc[range(2, 6)]
inp = df.copy()
- inp.dropna(axis=0, thresh=4, inplace=True)
+ return_value = inp.dropna(axis=0, thresh=4, inplace=True)
tm.assert_frame_equal(dropped, expected)
tm.assert_frame_equal(inp, expected)
+ assert return_value is None
dropped = df.dropna(axis=1, thresh=4)
tm.assert_frame_equal(dropped, df)
@@ -100,9 +108,10 @@ def test_dropna(self):
# subset
dropped = df.dropna(axis=0, subset=[0, 1, 3])
inp = df.copy()
- inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
+ return_value = inp.dropna(axis=0, subset=[0, 1, 3], inplace=True)
tm.assert_frame_equal(dropped, df)
tm.assert_frame_equal(inp, df)
+ assert return_value is None
# all
dropped = df.dropna(axis=1, how="all")
@@ -126,12 +135,21 @@ def test_drop_and_dropna_caching(self):
df2 = df.copy()
df["A"].dropna()
tm.assert_series_equal(df["A"], original)
- df["A"].dropna(inplace=True)
- tm.assert_series_equal(df["A"], expected)
+
+ ser = df["A"]
+ return_value = ser.dropna(inplace=True)
+ tm.assert_series_equal(ser, expected)
+ tm.assert_series_equal(df["A"], original)
+ assert return_value is None
+
df2["A"].drop([1])
tm.assert_series_equal(df2["A"], original)
- df2["A"].drop([1], inplace=True)
- tm.assert_series_equal(df2["A"], original.drop([1]))
+
+ ser = df2["A"]
+ return_value = ser.drop([1], inplace=True)
+ tm.assert_series_equal(ser, original.drop([1]))
+ tm.assert_series_equal(df2["A"], original)
+ assert return_value is None
def test_dropna_corner(self, float_frame):
# bad input
@@ -251,8 +269,9 @@ def test_fillna_different_dtype(self):
)
tm.assert_frame_equal(result, expected)
- df.fillna({2: "foo"}, inplace=True)
+ return_value = df.fillna({2: "foo"}, inplace=True)
tm.assert_frame_equal(df, expected)
+ assert return_value is None
def test_fillna_limit_and_value(self):
# limit and value
diff --git a/pandas/tests/frame/test_operators.py b/pandas/tests/frame/test_operators.py
index fede1ca23a8ce..8cf66e2737249 100644
--- a/pandas/tests/frame/test_operators.py
+++ b/pandas/tests/frame/test_operators.py
@@ -119,7 +119,7 @@ def test_pos_object(self, df):
"df", [pd.DataFrame({"a": pd.to_datetime(["2017-01-22", "1970-01-01"])})]
)
def test_pos_raises(self, df):
- msg = re.escape("Unary plus expects numeric dtype, not datetime64[ns]")
+ msg = "Unary plus expects .* dtype, not datetime64\\[ns\\]"
with pytest.raises(TypeError, match=msg):
(+df)
with pytest.raises(TypeError, match=msg):
diff --git a/pandas/tests/frame/test_query_eval.py b/pandas/tests/frame/test_query_eval.py
index 98a2a33822e3b..2994482fa5139 100644
--- a/pandas/tests/frame/test_query_eval.py
+++ b/pandas/tests/frame/test_query_eval.py
@@ -9,7 +9,7 @@
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, date_range
import pandas._testing as tm
-from pandas.core.computation.check import _NUMEXPR_INSTALLED
+from pandas.core.computation.check import NUMEXPR_INSTALLED
PARSERS = "python", "pandas"
ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne)
@@ -39,7 +39,7 @@ def setup_method(self, method):
def test_query_default(self):
# GH 12749
- # this should always work, whether _NUMEXPR_INSTALLED or not
+ # this should always work, whether NUMEXPR_INSTALLED or not
df = self.df
result = df.query("A>0")
tm.assert_frame_equal(result, self.expected1)
@@ -65,7 +65,7 @@ def test_query_python(self):
def test_query_numexpr(self):
df = self.df
- if _NUMEXPR_INSTALLED:
+ if NUMEXPR_INSTALLED:
result = df.query("A>0", engine="numexpr")
tm.assert_frame_equal(result, self.expected1)
result = df.eval("A+1", engine="numexpr")
@@ -160,6 +160,13 @@ def test_eval_resolvers_as_list(self):
assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
+ def test_eval_object_dtype_binop(self):
+ # GH#24883
+ df = pd.DataFrame({"a1": ["Y", "N"]})
+ res = df.eval("c = ((a1 == 'Y') & True)")
+ expected = pd.DataFrame({"a1": ["Y", "N"], "c": [True, False]})
+ tm.assert_frame_equal(res, expected)
+
class TestDataFrameQueryWithMultiIndex:
def test_query_with_named_multiindex(self, parser, engine):
@@ -413,7 +420,8 @@ def test_date_index_query(self):
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
- df.set_index("dates1", inplace=True, drop=True)
+ return_value = df.set_index("dates1", inplace=True, drop=True)
+ assert return_value is None
res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
tm.assert_frame_equal(res, expec)
@@ -425,7 +433,8 @@ def test_date_index_query_with_NaT(self):
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.iloc[0, 0] = pd.NaT
- df.set_index("dates1", inplace=True, drop=True)
+ return_value = df.set_index("dates1", inplace=True, drop=True)
+ assert return_value is None
res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
tm.assert_frame_equal(res, expec)
@@ -438,7 +447,8 @@ def test_date_index_query_with_NaT_duplicates(self):
d["dates3"] = date_range("1/1/2014", periods=n)
df = DataFrame(d)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
- df.set_index("dates1", inplace=True, drop=True)
+ return_value = df.set_index("dates1", inplace=True, drop=True)
+ assert return_value is None
res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)]
tm.assert_frame_equal(res, expec)
@@ -759,7 +769,8 @@ def test_date_index_query(self):
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
- df.set_index("dates1", inplace=True, drop=True)
+ return_value = df.set_index("dates1", inplace=True, drop=True)
+ assert return_value is None
res = df.query(
"(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
)
@@ -773,7 +784,8 @@ def test_date_index_query_with_NaT(self):
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.iloc[0, 0] = pd.NaT
- df.set_index("dates1", inplace=True, drop=True)
+ return_value = df.set_index("dates1", inplace=True, drop=True)
+ assert return_value is None
res = df.query(
"(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
)
@@ -787,7 +799,8 @@ def test_date_index_query_with_NaT_duplicates(self):
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
- df.set_index("dates1", inplace=True, drop=True)
+ return_value = df.set_index("dates1", inplace=True, drop=True)
+ assert return_value is None
msg = r"'BoolOp' nodes are not implemented"
with pytest.raises(NotImplementedError, match=msg):
df.query("index < 20130101 < dates3", engine=engine, parser=parser)
diff --git a/pandas/tests/frame/test_reshape.py b/pandas/tests/frame/test_reshape.py
index 1634baacf6d6e..b10fdbb707404 100644
--- a/pandas/tests/frame/test_reshape.py
+++ b/pandas/tests/frame/test_reshape.py
@@ -417,7 +417,7 @@ def test_unstack_mixed_type_name_in_multiindex(
result = df.unstack(unstack_idx)
expected = pd.DataFrame(
- expected_values, columns=expected_columns, index=expected_index,
+ expected_values, columns=expected_columns, index=expected_index
)
tm.assert_frame_equal(result, expected)
@@ -473,7 +473,8 @@ def test_stack_ints(self):
)
df_named = df.copy()
- df_named.columns.set_names(range(3), inplace=True)
+ return_value = df_named.columns.set_names(range(3), inplace=True)
+ assert return_value is None
tm.assert_frame_equal(
df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1)
@@ -806,7 +807,7 @@ def test_unstack_multi_level_cols(self):
[["B", "C"], ["B", "D"]], names=["c1", "c2"]
),
index=pd.MultiIndex.from_tuples(
- [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"],
+ [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"]
),
)
assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
@@ -1301,3 +1302,16 @@ def test_unstacking_multi_index_df():
),
)
tm.assert_frame_equal(result, expected)
+
+
+def test_stack_positional_level_duplicate_column_names():
+ # https://github.com/pandas-dev/pandas/issues/36353
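+    # both column levels share the name "a", so the level to stack must be
+    # selected by position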
+ columns = pd.MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"])
+ df = pd.DataFrame([[1, 1, 1, 1]], columns=columns)
+ result = df.stack(0)
+
+ new_columns = pd.Index(["y", "z"], name="a")
+ new_index = pd.MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"])
+ expected = pd.DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns)
+
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/frame/test_subclass.py b/pandas/tests/frame/test_subclass.py
index 08920cf7fceeb..2b462d5a10c51 100644
--- a/pandas/tests/frame/test_subclass.py
+++ b/pandas/tests/frame/test_subclass.py
@@ -696,3 +696,11 @@ def test_idxmax_preserves_subclass(self):
df = tm.SubclassedDataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
result = df.idxmax()
assert isinstance(result, tm.SubclassedSeries)
+
+ def test_equals_subclass(self):
+ # https://github.com/pandas-dev/pandas/pull/34402
+ # allow subclass in both directions
+ df1 = pd.DataFrame({"a": [1, 2, 3]})
+ df2 = tm.SubclassedDataFrame({"a": [1, 2, 3]})
+ assert df1.equals(df2)
+ assert df2.equals(df1)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index 2b7b3af8f4705..db7347bb863a5 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -570,7 +570,8 @@ def test_to_csv_headers(self):
from_df.to_csv(path, index=False, header=["X", "Y"])
recons = self.read_csv(path)
- recons.reset_index(inplace=True)
+ return_value = recons.reset_index(inplace=True)
+ assert return_value is None
tm.assert_frame_equal(to_df, recons)
def test_to_csv_multiindex(self, float_frame, datetime_frame):
diff --git a/pandas/tests/generic/test_duplicate_labels.py b/pandas/tests/generic/test_duplicate_labels.py
new file mode 100644
index 0000000000000..97468e1f10a8b
--- /dev/null
+++ b/pandas/tests/generic/test_duplicate_labels.py
@@ -0,0 +1,450 @@
+"""Tests dealing with the NDFrame.allows_duplicates."""
+import operator
+
+import numpy as np
+import pytest
+
+import pandas as pd
+import pandas._testing as tm
+
+not_implemented = pytest.mark.xfail(reason="Not implemented.")
+
+# ----------------------------------------------------------------------------
+# Preservation
+
+
+class TestPreserves:
+ @pytest.mark.parametrize(
+ "cls, data",
+ [
+ (pd.Series, np.array([])),
+ (pd.Series, [1, 2]),
+ (pd.DataFrame, {}),
+ (pd.DataFrame, {"A": [1, 2]}),
+ ],
+ )
+ def test_construction_ok(self, cls, data):
+ result = cls(data)
+ assert result.flags.allows_duplicate_labels is True
+
+ result = cls(data).set_flags(allows_duplicate_labels=False)
+ assert result.flags.allows_duplicate_labels is False
+
+ @pytest.mark.parametrize(
+ "func",
+ [
+ operator.itemgetter(["a"]),
+ operator.methodcaller("add", 1),
+ operator.methodcaller("rename", str.upper),
+ operator.methodcaller("rename", "name"),
+ pytest.param(operator.methodcaller("abs"), marks=not_implemented),
+ # TODO: test np.abs
+ ],
+ )
+ def test_preserved_series(self, func):
+ s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
+ assert func(s).flags.allows_duplicate_labels is False
+
+ @pytest.mark.parametrize(
+ "other", [pd.Series(0, index=["a", "b", "c"]), pd.Series(0, index=["a", "b"])]
+ )
+ # TODO: frame
+ @not_implemented
+ def test_align(self, other):
+ s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
+ a, b = s.align(other)
+ assert a.flags.allows_duplicate_labels is False
+ assert b.flags.allows_duplicate_labels is False
+
+ def test_preserved_frame(self):
+ df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ )
+ assert df.loc[["a"]].flags.allows_duplicate_labels is False
+ assert df.loc[:, ["A", "B"]].flags.allows_duplicate_labels is False
+
+ @not_implemented
+ def test_to_frame(self):
+ s = pd.Series(dtype=float).set_flags(allows_duplicate_labels=False)
+ assert s.to_frame().flags.allows_duplicate_labels is False
+
+ @pytest.mark.parametrize("func", ["add", "sub"])
+ @pytest.mark.parametrize(
+ "frame", [False, pytest.param(True, marks=not_implemented)]
+ )
+ @pytest.mark.parametrize("other", [1, pd.Series([1, 2], name="A")])
+ def test_binops(self, func, other, frame):
+ df = pd.Series([1, 2], name="A", index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ )
+ if frame:
+ df = df.to_frame()
+ if isinstance(other, pd.Series) and frame:
+ other = other.to_frame()
+ func = operator.methodcaller(func, other)
+ assert df.flags.allows_duplicate_labels is False
+ assert func(df).flags.allows_duplicate_labels is False
+
+ @not_implemented
+ def test_preserve_getitem(self):
+ df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
+ assert df[["A"]].flags.allows_duplicate_labels is False
+ assert df["A"].flags.allows_duplicate_labels is False
+ assert df.loc[0].flags.allows_duplicate_labels is False
+ assert df.loc[[0]].flags.allows_duplicate_labels is False
+ assert df.loc[0, ["A"]].flags.allows_duplicate_labels is False
+
+ @pytest.mark.xfail(reason="Unclear behavior.")
+ def test_ndframe_getitem_caching_issue(self):
+ # NDFrame.__getitem__ will cache the first df['A']. May need to
+ # invalidate that cache? Update the cached entries?
+ df = pd.DataFrame({"A": [0]}).set_flags(allows_duplicate_labels=False)
+ assert df["A"].flags.allows_duplicate_labels is False
+ df.flags.allows_duplicate_labels = True
+ assert df["A"].flags.allows_duplicate_labels is True
+
+ @pytest.mark.parametrize(
+ "objs, kwargs",
+ [
+ # Series
+ (
+ [
+ pd.Series(1, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.Series(2, index=["c", "d"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {},
+ ),
+ (
+ [
+ pd.Series(1, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.Series(2, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {"ignore_index": True},
+ ),
+ (
+ [
+ pd.Series(1, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.Series(2, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {"axis": 1},
+ ),
+ # Frame
+ (
+ [
+ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.DataFrame({"A": [1, 2]}, index=["c", "d"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {},
+ ),
+ (
+ [
+ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {"ignore_index": True},
+ ),
+ (
+ [
+ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.DataFrame({"B": [1, 2]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {"axis": 1},
+ ),
+ # Series / Frame
+ (
+ [
+ pd.DataFrame({"A": [1, 2]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.Series([1, 2], index=["a", "b"], name="B",).set_flags(
+ allows_duplicate_labels=False,
+ ),
+ ],
+ {"axis": 1},
+ ),
+ ],
+ )
+ def test_concat(self, objs, kwargs):
+ result = pd.concat(objs, **kwargs)
+ assert result.flags.allows_duplicate_labels is False
+
+ @pytest.mark.parametrize(
+ "left, right, kwargs, expected",
+ [
+ # false false false
+ pytest.param(
+ pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.DataFrame({"B": [0, 1]}, index=["a", "d"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ dict(left_index=True, right_index=True),
+ False,
+ marks=not_implemented,
+ ),
+ # false true false
+ pytest.param(
+ pd.DataFrame({"A": [0, 1]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
+ dict(left_index=True, right_index=True),
+ False,
+ marks=not_implemented,
+ ),
+ # true true true
+ (
+ pd.DataFrame({"A": [0, 1]}, index=["a", "b"]),
+ pd.DataFrame({"B": [0, 1]}, index=["a", "d"]),
+ dict(left_index=True, right_index=True),
+ True,
+ ),
+ ],
+ )
+ def test_merge(self, left, right, kwargs, expected):
+ result = pd.merge(left, right, **kwargs)
+ assert result.flags.allows_duplicate_labels is expected
+
+ @not_implemented
+ def test_groupby(self):
+        # XXX: This is under-tested
+ # TODO:
+ # - apply
+ # - transform
+ # - Should passing a grouper that disallows duplicates propagate?
+ df = pd.DataFrame({"A": [1, 2, 3]}).set_flags(allows_duplicate_labels=False)
+ result = df.groupby([0, 0, 1]).agg("count")
+ assert result.flags.allows_duplicate_labels is False
+
+ @pytest.mark.parametrize("frame", [True, False])
+ @not_implemented
+ def test_window(self, frame):
+ df = pd.Series(
+ 1,
+ index=pd.date_range("2000", periods=12),
+ name="A",
+ allows_duplicate_labels=False,
+ )
+ if frame:
+ df = df.to_frame()
+ assert df.rolling(3).mean().flags.allows_duplicate_labels is False
+ assert df.ewm(3).mean().flags.allows_duplicate_labels is False
+ assert df.expanding(3).mean().flags.allows_duplicate_labels is False
+
+
+# ----------------------------------------------------------------------------
+# Raises
+
+
+class TestRaises:
+ @pytest.mark.parametrize(
+ "cls, axes",
+ [
+ (pd.Series, {"index": ["a", "a"], "dtype": float}),
+ (pd.DataFrame, {"index": ["a", "a"]}),
+ (pd.DataFrame, {"index": ["a", "a"], "columns": ["b", "b"]}),
+ (pd.DataFrame, {"columns": ["b", "b"]}),
+ ],
+ )
+ def test_set_flags_with_duplicates(self, cls, axes):
+ result = cls(**axes)
+ assert result.flags.allows_duplicate_labels is True
+
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ cls(**axes).set_flags(allows_duplicate_labels=False)
+
+ @pytest.mark.parametrize(
+ "data",
+ [
+ pd.Series(index=[0, 0], dtype=float),
+ pd.DataFrame(index=[0, 0]),
+ pd.DataFrame(columns=[0, 0]),
+ ],
+ )
+ def test_setting_allows_duplicate_labels_raises(self, data):
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ data.flags.allows_duplicate_labels = False
+
+ assert data.flags.allows_duplicate_labels is True
+
+ @pytest.mark.parametrize(
+ "func", [operator.methodcaller("append", pd.Series(0, index=["a", "b"]))]
+ )
+ def test_series_raises(self, func):
+ s = pd.Series([0, 1], index=["a", "b"]).set_flags(allows_duplicate_labels=False)
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ func(s)
+
+ @pytest.mark.parametrize(
+ "getter, target",
+ [
+ (operator.itemgetter(["A", "A"]), None),
+ # loc
+ (operator.itemgetter(["a", "a"]), "loc"),
+ pytest.param(
+ operator.itemgetter(("a", ["A", "A"])), "loc", marks=not_implemented
+ ),
+ pytest.param(
+ operator.itemgetter((["a", "a"], "A")), "loc", marks=not_implemented
+ ),
+ # iloc
+ (operator.itemgetter([0, 0]), "iloc"),
+ pytest.param(
+ operator.itemgetter((0, [0, 0])), "iloc", marks=not_implemented
+ ),
+ pytest.param(
+ operator.itemgetter(([0, 0], 0)), "iloc", marks=not_implemented
+ ),
+ ],
+ )
+ def test_getitem_raises(self, getter, target):
+ df = pd.DataFrame({"A": [1, 2], "B": [3, 4]}, index=["a", "b"]).set_flags(
+ allows_duplicate_labels=False
+ )
+ if target:
+ # df, df.loc, or df.iloc
+ target = getattr(df, target)
+ else:
+ target = df
+
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ getter(target)
+
+ @pytest.mark.parametrize(
+ "objs, kwargs",
+ [
+ (
+ [
+ pd.Series(1, index=[0, 1], name="a").set_flags(
+ allows_duplicate_labels=False
+ ),
+ pd.Series(2, index=[0, 1], name="a").set_flags(
+ allows_duplicate_labels=False
+ ),
+ ],
+ {"axis": 1},
+ )
+ ],
+ )
+ def test_concat_raises(self, objs, kwargs):
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ pd.concat(objs, **kwargs)
+
+ @not_implemented
+ def test_merge_raises(self):
+ a = pd.DataFrame({"A": [0, 1, 2]}, index=["a", "b", "c"]).set_flags(
+ allows_duplicate_labels=False
+ )
+ b = pd.DataFrame({"B": [0, 1, 2]}, index=["a", "b", "b"])
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ pd.merge(a, b, left_index=True, right_index=True)
+
+
+@pytest.mark.parametrize(
+ "idx",
+ [
+ pd.Index([1, 1]),
+ pd.Index(["a", "a"]),
+ pd.Index([1.1, 1.1]),
+ pd.PeriodIndex([pd.Period("2000", "D")] * 2),
+ pd.DatetimeIndex([pd.Timestamp("2000")] * 2),
+ pd.TimedeltaIndex([pd.Timedelta("1D")] * 2),
+ pd.CategoricalIndex(["a", "a"]),
+ pd.IntervalIndex([pd.Interval(0, 1)] * 2),
+ pd.MultiIndex.from_tuples([("a", 1), ("a", 1)]),
+ ],
+ ids=lambda x: type(x).__name__,
+)
+def test_raises_basic(idx):
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ pd.Series(1, index=idx).set_flags(allows_duplicate_labels=False)
+
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ pd.DataFrame({"A": [1, 1]}, index=idx).set_flags(allows_duplicate_labels=False)
+
+ with pytest.raises(pd.errors.DuplicateLabelError):
+ pd.DataFrame([[1, 2]], columns=idx).set_flags(allows_duplicate_labels=False)
+
+
+def test_format_duplicate_labels_message():
+ idx = pd.Index(["a", "b", "a", "b", "c"])
+ result = idx._format_duplicate_message()
+ expected = pd.DataFrame(
+ {"positions": [[0, 2], [1, 3]]}, index=pd.Index(["a", "b"], name="label")
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+def test_format_duplicate_labels_message_multi():
+ idx = pd.MultiIndex.from_product([["A"], ["a", "b", "a", "b", "c"]])
+ result = idx._format_duplicate_message()
+ expected = pd.DataFrame(
+ {"positions": [[0, 2], [1, 3]]},
+ index=pd.MultiIndex.from_product([["A"], ["a", "b"]]),
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+def test_dataframe_insert_raises():
+ df = pd.DataFrame({"A": [1, 2]}).set_flags(allows_duplicate_labels=False)
+ with pytest.raises(ValueError, match="Cannot specify"):
+ df.insert(0, "A", [3, 4], allow_duplicates=True)
+
+
+@pytest.mark.parametrize(
+ "method, frame_only",
+ [
+ (operator.methodcaller("set_index", "A", inplace=True), True),
+ (operator.methodcaller("set_axis", ["A", "B"], inplace=True), False),
+ (operator.methodcaller("reset_index", inplace=True), True),
+ (operator.methodcaller("rename", lambda x: x, inplace=True), False),
+ ],
+)
+def test_inplace_raises(method, frame_only):
+ df = pd.DataFrame({"A": [0, 0], "B": [1, 2]}).set_flags(
+ allows_duplicate_labels=False
+ )
+ s = df["A"]
+ s.flags.allows_duplicate_labels = False
+ msg = "Cannot specify"
+
+ with pytest.raises(ValueError, match=msg):
+ method(df)
+ if not frame_only:
+ with pytest.raises(ValueError, match=msg):
+ method(s)
+
+
+def test_pickle():
+ a = pd.Series([1, 2]).set_flags(allows_duplicate_labels=False)
+ b = tm.round_trip_pickle(a)
+ tm.assert_series_equal(a, b)
+
+ a = pd.DataFrame({"A": []}).set_flags(allows_duplicate_labels=False)
+ b = tm.round_trip_pickle(a)
+ tm.assert_frame_equal(a, b)
diff --git a/pandas/tests/generic/test_finalize.py b/pandas/tests/generic/test_finalize.py
index 4d0f1a326225d..8898619e374ab 100644
--- a/pandas/tests/generic/test_finalize.py
+++ b/pandas/tests/generic/test_finalize.py
@@ -123,7 +123,7 @@
(pd.DataFrame, frame_data, operator.methodcaller("sort_index")),
(pd.DataFrame, frame_data, operator.methodcaller("nlargest", 1, "A")),
(pd.DataFrame, frame_data, operator.methodcaller("nsmallest", 1, "A")),
- (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel"),),
+ (pd.DataFrame, frame_mi_data, operator.methodcaller("swaplevel")),
pytest.param(
(
pd.DataFrame,
@@ -178,7 +178,7 @@
marks=not_implemented_mark,
),
pytest.param(
- (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack"),),
+ (pd.DataFrame, frame_mi_data, operator.methodcaller("unstack")),
marks=not_implemented_mark,
),
pytest.param(
@@ -317,7 +317,7 @@
marks=not_implemented_mark,
),
pytest.param(
- (pd.Series, ([1, 2],), operator.methodcaller("squeeze")),
+ (pd.Series, ([1, 2],), operator.methodcaller("squeeze"))
# marks=not_implemented_mark,
),
(pd.Series, ([1, 2],), operator.methodcaller("rename_axis", index="a")),
@@ -733,9 +733,7 @@ def test_timedelta_property(attr):
assert result.attrs == {"a": 1}
-@pytest.mark.parametrize(
- "method", [operator.methodcaller("total_seconds")],
-)
+@pytest.mark.parametrize("method", [operator.methodcaller("total_seconds")])
@not_implemented_mark
def test_timedelta_methods(method):
s = pd.Series(pd.timedelta_range("2000", periods=4))
diff --git a/pandas/tests/generic/test_generic.py b/pandas/tests/generic/test_generic.py
index 94747a52136c4..2c2584e8dee01 100644
--- a/pandas/tests/generic/test_generic.py
+++ b/pandas/tests/generic/test_generic.py
@@ -3,12 +3,12 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p17
+from pandas.compat.numpy import np_version_under1p17
from pandas.core.dtypes.common import is_scalar
import pandas as pd
-from pandas import DataFrame, MultiIndex, Series, date_range
+from pandas import DataFrame, Series, date_range
import pandas._testing as tm
import pandas.core.common as com
@@ -652,12 +652,12 @@ def test_sample(sel):
pytest.param(
"np.random.MT19937",
3,
- marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"),
+ marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"),
),
pytest.param(
"np.random.PCG64",
11,
- marks=pytest.mark.skipif(_np_version_under1p17, reason="NumPy<1.17"),
+ marks=pytest.mark.skipif(np_version_under1p17, reason="NumPy<1.17"),
),
],
)
@@ -785,26 +785,6 @@ def test_depr_take_kwarg_is_copy(self, is_copy):
s.take([0, 1], is_copy=is_copy)
def test_equals(self):
- s1 = pd.Series([1, 2, 3], index=[0, 2, 1])
- s2 = s1.copy()
- assert s1.equals(s2)
-
- s1[1] = 99
- assert not s1.equals(s2)
-
- # NaNs compare as equal
- s1 = pd.Series([1, np.nan, 3, np.nan], index=[0, 2, 1, 3])
- s2 = s1.copy()
- assert s1.equals(s2)
-
- s2[0] = 9.9
- assert not s1.equals(s2)
-
- idx = MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c")])
- s1 = Series([1, 2, np.nan], index=idx)
- s2 = s1.copy()
- assert s1.equals(s2)
-
# Add object dtype column with nans
index = np.random.random(10)
df1 = DataFrame(np.random.random(10), index=index, columns=["floats"])
@@ -857,21 +837,6 @@ def test_equals(self):
df2 = df1.set_index(["floats"], append=True)
assert df3.equals(df2)
- # GH 8437
- a = pd.Series([False, np.nan])
- b = pd.Series([False, np.nan])
- c = pd.Series(index=range(2), dtype=object)
- d = c.copy()
- e = c.copy()
- f = c.copy()
- c[:-1] = d[:-1] = e[0] = f[0] = False
- assert a.equals(a)
- assert a.equals(b)
- assert a.equals(c)
- assert a.equals(d)
- assert a.equals(e)
- assert e.equals(f)
-
def test_pipe(self):
df = DataFrame({"A": [1, 2, 3]})
f = lambda x, y: x ** y
@@ -922,3 +887,13 @@ def test_axis_numbers_deprecated(self, box):
obj = box(dtype=object)
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
obj._AXIS_NUMBERS
+
+ @pytest.mark.parametrize("as_frame", [True, False])
+ def test_flags_identity(self, as_frame):
+ s = pd.Series([1, 2])
+ if as_frame:
+ s = s.to_frame()
+
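+        # repeated access must return the same Flags object, while a copy
+        # gets its own independent Flags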
+ assert s.flags is s.flags
+ s2 = s.copy()
+ assert s2.flags is not s.flags
diff --git a/pandas/tests/generic/test_to_xarray.py b/pandas/tests/generic/test_to_xarray.py
index ab56a752f7e90..a85d7ddc1ea53 100644
--- a/pandas/tests/generic/test_to_xarray.py
+++ b/pandas/tests/generic/test_to_xarray.py
@@ -47,9 +47,7 @@ def test_to_xarray_index_types(self, index):
expected = df.copy()
expected["f"] = expected["f"].astype(object)
expected.columns.name = None
- tm.assert_frame_equal(
- result.to_dataframe(), expected,
- )
+ tm.assert_frame_equal(result.to_dataframe(), expected)
@td.skip_if_no("xarray", min_version="0.7.0")
def test_to_xarray(self):
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index dbd713a0af4cf..c96333bc48dd4 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -2,10 +2,13 @@
test .agg behavior / note that .apply is tested generally in test_groupby.py
"""
import functools
+from functools import partial
import numpy as np
import pytest
+from pandas.errors import PerformanceWarning
+
from pandas.core.dtypes.common import is_integer_dtype
import pandas as pd
@@ -252,6 +255,61 @@ def test_agg_multiple_functions_maintain_order(df):
tm.assert_index_equal(result.columns, exp_cols)
+def test_agg_multiple_functions_same_name():
+ # GH 30880
+ df = pd.DataFrame(
+ np.random.randn(1000, 3),
+ index=pd.date_range("1/1/2012", freq="S", periods=1000),
+ columns=["A", "B", "C"],
+ )
+ result = df.resample("3T").agg(
+ {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
+ )
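+ # both partials resolve to the same name "quantile", so the result
+ # carries duplicate column labels that must both be kept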
+ expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
+ expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")])
+ expected_values = np.array(
+ [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
+ ).T
+ expected = pd.DataFrame(
+ expected_values, columns=expected_columns, index=expected_index
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_multiple_functions_same_name_with_ohlc_present():
+ # GH 30880
+ # ohlc expands dimensions, so a different test from the one above is required.
+ df = pd.DataFrame(
+ np.random.randn(1000, 3),
+ index=pd.date_range("1/1/2012", freq="S", periods=1000),
+ columns=["A", "B", "C"],
+ )
+ result = df.resample("3T").agg(
+ {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
+ )
+ expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
+ expected_columns = pd.MultiIndex.from_tuples(
+ [
+ ("A", "ohlc", "open"),
+ ("A", "ohlc", "high"),
+ ("A", "ohlc", "low"),
+ ("A", "ohlc", "close"),
+ ("A", "quantile", "A"),
+ ("A", "quantile", "A"),
+ ]
+ )
+ non_ohlc_expected_values = np.array(
+ [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
+ ).T
+ expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values])
+ expected = pd.DataFrame(
+ expected_values, columns=expected_columns, index=expected_index
+ )
+ # PerformanceWarning is thrown by `assert col in right` in assert_frame_equal
+ with tm.assert_produces_warning(PerformanceWarning):
+ tm.assert_frame_equal(result, expected)
+
+
def test_multiple_functions_tuples_and_non_tuples(df):
# #1359
funcs = [("foo", "mean"), "std"]
@@ -737,6 +795,41 @@ def test_groupby_aggregate_empty_key_empty_return():
tm.assert_frame_equal(result, expected)
+def test_groupby_agg_loses_results_with_as_index_false_relabel():
+ # GH 32240: When the aggregate function relabels column names and
+ # as_index=False is specified, the results are dropped.
+
+ df = pd.DataFrame(
+ {"key": ["x", "y", "z", "x", "y", "z"], "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]}
+ )
+
+ grouped = df.groupby("key", as_index=False)
+ result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
+ expected = pd.DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_groupby_agg_loses_results_with_as_index_false_relabel_multiindex():
+ # GH 32240: When the aggregate function relabels column names and
+ # as_index=False is specified, the results are dropped. Check if
+ # multiindex is returned in the right order
+
+ df = pd.DataFrame(
+ {
+ "key": ["x", "y", "x", "y", "x", "x"],
+ "key1": ["a", "b", "c", "b", "a", "c"],
+ "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75],
+ }
+ )
+
+ grouped = df.groupby(["key", "key1"], as_index=False)
+ result = grouped.agg(min_val=pd.NamedAgg(column="val", aggfunc="min"))
+ expected = pd.DataFrame(
+ {"key": ["x", "x", "y"], "key1": ["a", "c", "b"], "min_val": [1.0, 0.75, 0.8]}
+ )
+ tm.assert_frame_equal(result, expected)
+
+
@pytest.mark.parametrize(
"func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
)
@@ -968,3 +1061,110 @@ def test_groupby_get_by_index():
res = df.groupby("A").agg({"B": lambda x: x.get(x.index[-1])})
expected = pd.DataFrame(dict(A=["S", "W"], B=[1.0, 2.0])).set_index("A")
pd.testing.assert_frame_equal(res, expected)
+
+
+@pytest.mark.parametrize(
+ "grp_col_dict, exp_data",
+ [
+ ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}),
+ ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}),
+ ({"nr": "min"}, {"nr": [1, 5]}),
+ ],
+)
+def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
+ # test single aggregations on ordered categorical cols GH 27800
+
+ # create the input dataframe
+ input_df = pd.DataFrame(
+ {
+ "nr": [1, 2, 3, 4, 5, 6, 7, 8],
+ "cat_ord": list("aabbccdd"),
+ "cat": list("aaaabbbb"),
+ }
+ )
+
+ input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
+ input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
+ result_df = input_df.groupby("cat").agg(grp_col_dict)
+
+ # create expected dataframe
+ cat_index = pd.CategoricalIndex(
+ ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
+ )
+
+ expected_df = pd.DataFrame(data=exp_data, index=cat_index)
+
+ tm.assert_frame_equal(result_df, expected_df)
+
+
+@pytest.mark.parametrize(
+ "grp_col_dict, exp_data",
+ [
+ ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]),
+ ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]),
+ ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]),
+ ],
+)
+def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
+ # test combined aggregations on ordered categorical cols GH 27800
+
+ # create the input dataframe
+ input_df = pd.DataFrame(
+ {
+ "nr": [1, 2, 3, 4, 5, 6, 7, 8],
+ "cat_ord": list("aabbccdd"),
+ "cat": list("aaaabbbb"),
+ }
+ )
+
+ input_df = input_df.astype({"cat": "category", "cat_ord": "category"})
+ input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
+ result_df = input_df.groupby("cat").agg(grp_col_dict)
+
+ # create expected dataframe
+ cat_index = pd.CategoricalIndex(
+ ["a", "b"], categories=["a", "b"], ordered=False, name="cat", dtype="category"
+ )
+
+ # unpack the grp_col_dict to create the multi-index tuple
+ # this tuple will be used to create the expected dataframe index
+ multi_index_list = []
+ for k, v in grp_col_dict.items():
+ if isinstance(v, list):
+ for value in v:
+ multi_index_list.append([k, value])
+ else:
+ multi_index_list.append([k, v])
+ multi_index = pd.MultiIndex.from_tuples(tuple(multi_index_list))
+
+ expected_df = pd.DataFrame(data=exp_data, columns=multi_index, index=cat_index)
+
+ tm.assert_frame_equal(result_df, expected_df)
+
+
+def test_nonagg_agg():
+ # GH 35490 - Single/Multiple agg of non-agg function give same results
+ # TODO: agg should raise for functions that don't aggregate
+ df = pd.DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]})
+ g = df.groupby("a")
+
+ result = g.agg(["cumsum"])
+ result.columns = result.columns.droplevel(-1)
+ expected = g.agg("cumsum")
+
+ tm.assert_frame_equal(result, expected)
+
+
+def test_agg_no_suffix_index():
+ # GH36189
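+ # duplicate lambdas are labelled with empty strings rather than
+ # numbered <lambda> suffixes in the result index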
+ df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
+ result = df.agg(["sum", lambda x: x.sum(), lambda x: x.sum()])
+ expected = pd.DataFrame(
+ {"A": [12, 12, 12], "B": [27, 27, 27]}, index=["sum", "", ""]
+ )
+ tm.assert_frame_equal(result, expected)
+
+ # test Series case
+ result = df["A"].agg(["sum", lambda x: x.sum(), lambda x: x.sum()])
+ expected = pd.Series([12, 12, 12], index=["sum", "", ""], name="A")
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/aggregate/test_cython.py b/pandas/tests/groupby/aggregate/test_cython.py
index 5ddda264642de..87ebd8b5a27fb 100644
--- a/pandas/tests/groupby/aggregate/test_cython.py
+++ b/pandas/tests/groupby/aggregate/test_cython.py
@@ -236,3 +236,44 @@ def test_cython_with_timestamp_and_nat(op, data):
result = df.groupby("a").aggregate(op)
tm.assert_frame_equal(expected, result)
+
+
+@pytest.mark.parametrize(
+ "agg",
+ [
+ "min",
+ "max",
+ "count",
+ "sum",
+ "prod",
+ "var",
+ "mean",
+ "median",
+ "ohlc",
+ "cumprod",
+ "cumsum",
+ "shift",
+ "any",
+ "all",
+ "quantile",
+ "first",
+ "last",
+ "rank",
+ "cummin",
+ "cummax",
+ ],
+)
+def test_read_only_buffer_source_agg(agg):
+ # https://github.com/pandas-dev/pandas/issues/36014
+ df = DataFrame(
+ {
+ "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
+ "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
+ }
+ )
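+ # mark the underlying block values read-only to simulate aggregating
+ # from a read-only buffer source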
+ df._mgr.blocks[0].values.flags.writeable = False
+
+ result = df.groupby(["species"]).agg({"sepal_length": agg})
+ expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
+
+ tm.assert_equal(result, expected)
diff --git a/pandas/tests/groupby/aggregate/test_numba.py b/pandas/tests/groupby/aggregate/test_numba.py
index 726d79535184a..c4266996748c2 100644
--- a/pandas/tests/groupby/aggregate/test_numba.py
+++ b/pandas/tests/groupby/aggregate/test_numba.py
@@ -4,7 +4,7 @@
from pandas.errors import NumbaUtilError
import pandas.util._test_decorators as td
-from pandas import DataFrame
+from pandas import DataFrame, NamedAgg, option_context
import pandas._testing as tm
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
@@ -57,7 +57,7 @@ def func_numba(values, index):
func_numba = numba.jit(func_numba)
data = DataFrame(
- {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1],
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
@@ -90,7 +90,7 @@ def func_2(values, index):
func_2 = numba.jit(func_2)
data = DataFrame(
- {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1],
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
@@ -113,3 +113,40 @@ def func_2(values, index):
result = grouped.agg(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.agg(lambda x: np.mean(x) - 3.4, engine="cython")
tm.assert_equal(result, expected)
+
+
+@td.skip_if_no("numba", "0.46.0")
+def test_use_global_config():
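+ # with compute.use_numba enabled, engine=None should fall back to numba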
+ def func_1(values, index):
+ return np.mean(values) - 3.4
+
+ data = DataFrame(
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+ )
+ grouped = data.groupby(0)
+ expected = grouped.agg(func_1, engine="numba")
+ with option_context("compute.use_numba", True):
+ result = grouped.agg(func_1, engine=None)
+ tm.assert_frame_equal(expected, result)
+
+
+@td.skip_if_no("numba", "0.46.0")
+@pytest.mark.parametrize(
+ "agg_func",
+ [
+ ["min", "max"],
+ "min",
+ {"B": ["min", "max"], "C": "sum"},
+ NamedAgg(column="B", aggfunc="min"),
+ ],
+)
+def test_multifunc_notimplemented(agg_func):
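+ # the numba engine only supports a single user-defined function;
+ # lists, dicts and NamedAgg specs should raise NotImplementedError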
+ data = DataFrame(
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+ )
+ grouped = data.groupby(0)
+ with pytest.raises(NotImplementedError, match="Numba engine can"):
+ grouped.agg(agg_func, engine="numba")
+
+ with pytest.raises(NotImplementedError, match="Numba engine can"):
+ grouped[1].agg(agg_func, engine="numba")
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 264cf40dc6984..e8cd6017a117c 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -486,13 +486,13 @@ def test_agg_timezone_round_trip():
assert ts == grouped.first()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
- assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 0]
+ assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]
ts = df["B"].iloc[2]
assert ts == grouped.last()["B"].iloc[0]
# GH#27110 applying iloc should return a DataFrame
- assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 0]
+ assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]
def test_sum_uint64_overflow():
diff --git a/pandas/tests/groupby/test_allowlist.py b/pandas/tests/groupby/test_allowlist.py
index 0fd66cc047017..4a735fc7bb686 100644
--- a/pandas/tests/groupby/test_allowlist.py
+++ b/pandas/tests/groupby/test_allowlist.py
@@ -369,7 +369,6 @@ def test_groupby_selection_with_methods(df):
"ffill",
"bfill",
"pct_change",
- "tshift",
]
for m in methods:
@@ -379,6 +378,11 @@ def test_groupby_selection_with_methods(df):
# should always be frames!
tm.assert_frame_equal(res, exp)
+ # check that the index cache is cleared
+ with pytest.raises(ValueError, match="Freq was not set in the index"):
+ # GH#35937
+ g.tshift()
+
# methods which aren't just .foo()
tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
index 1945647ced08f..db5c4af9c6f53 100644
--- a/pandas/tests/groupby/test_apply.py
+++ b/pandas/tests/groupby/test_apply.py
@@ -1,4 +1,4 @@
-from datetime import datetime
+from datetime import date, datetime
from io import StringIO
import numpy as np
@@ -63,15 +63,8 @@ def test_apply_trivial():
tm.assert_frame_equal(result, expected)
-@pytest.mark.xfail(
- reason="GH#20066; function passed into apply "
- "returns a DataFrame with the same index "
- "as the one to create GroupBy object."
-)
def test_apply_trivial_fail():
# GH 20066
- # trivial apply fails if the constant dataframe has the same index
- # with the one used to create GroupBy object.
df = pd.DataFrame(
{"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
columns=["key", "data"],
@@ -211,6 +204,7 @@ def test_group_apply_once_per_group2(capsys):
assert result == expected
+@pytest.mark.xfail(reason="GH-34998")
def test_apply_fast_slow_identical():
# GH 31613
@@ -234,9 +228,11 @@ def fast(group):
"func",
[
lambda x: x,
- lambda x: x[:],
+ pytest.param(lambda x: x[:], marks=pytest.mark.xfail(reason="GH-34998")),
lambda x: x.copy(deep=False),
- lambda x: x.copy(deep=True),
+ pytest.param(
+ lambda x: x.copy(deep=True), marks=pytest.mark.xfail(reason="GH-34998")
+ ),
],
)
def test_groupby_apply_identity_maybecopy_index_identical(func):
@@ -385,6 +381,16 @@ def test_apply_frame_to_series(df):
tm.assert_numpy_array_equal(result.values, expected.values)
+def test_apply_frame_not_as_index_column_name(df):
+ # GH 35964 - path within _wrap_applied_output not hit by a test
+ grouped = df.groupby(["A", "B"], as_index=False)
+ result = grouped.apply(len)
+ expected = grouped.count().rename(columns={"C": np.nan}).drop(columns="D")
+ # TODO: Use assert_frame_equal when column name is not np.nan (GH 36306)
+ tm.assert_index_equal(result.index, expected.index)
+ tm.assert_numpy_array_equal(result.values, expected.values)
+
+
def test_apply_frame_concat_series():
def trans(group):
return group.groupby("B")["C"].sum().sort_values()[:2]
@@ -865,13 +871,14 @@ def test_apply_multi_level_name(category):
b = [1, 2] * 5
if category:
b = pd.Categorical(b, categories=[1, 2, 3])
+ expected_index = pd.CategoricalIndex([1, 2], categories=[1, 2, 3], name="B")
+ else:
+ expected_index = pd.Index([1, 2], name="B")
df = pd.DataFrame(
{"A": np.arange(10), "B": b, "C": list(range(10)), "D": list(range(10))}
).set_index(["A", "B"])
result = df.groupby("B").apply(lambda x: x.sum())
- expected = pd.DataFrame(
- {"C": [20, 25], "D": [20, 25]}, index=pd.Index([1, 2], name="B")
- )
+ expected = pd.DataFrame({"C": [20, 25], "D": [20, 25]}, index=expected_index)
tm.assert_frame_equal(result, expected)
assert df.index.names == ["A", "B"]
@@ -949,9 +956,7 @@ def fct(group):
tm.assert_series_equal(result, expected)
-@pytest.mark.parametrize(
- "function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1],
-)
+@pytest.mark.parametrize("function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1])
def test_apply_function_index_return(function):
# GH: 22541
df = pd.DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"])
@@ -995,3 +1000,98 @@ def test_apply_function_with_indexing_return_column():
result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
expected = DataFrame({"foo1": ["one", "three", "two"], "foo2": [3.0, 4.0, 4.0]})
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.xfail(reason="GH-34998")
+def test_apply_with_timezones_aware():
+ # GH: 27212
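+ # tz-naive and tz-aware frames should give identical apply results
+ # when the applied function only touches the non-datetime columns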
+
+ dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2
+ index_no_tz = pd.DatetimeIndex(dates)
+ index_tz = pd.DatetimeIndex(dates, tz="UTC")
+ df1 = pd.DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz})
+ df2 = pd.DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz})
+
+ result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
+ result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
+
+ tm.assert_frame_equal(result1, result2)
+
+
+def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
+ # GH #34656
+ # GH #34271
+ df = DataFrame(
+ {
+ "a": [99, 99, 99, 88, 88, 88],
+ "b": [1, 2, 3, 4, 5, 6],
+ "c": [10, 20, 30, 40, 50, 60],
+ }
+ )
+
+ expected = pd.DataFrame(
+ {"a": [264, 297], "b": [15, 6], "c": [150, 60]},
+ index=pd.Index([88, 99], name="a"),
+ )
+
+ # Check output when no other methods are called before .apply()
+ grp = df.groupby(by="a")
+ result = grp.apply(sum)
+ tm.assert_frame_equal(result, expected)
+
+ # Check output when another method is called before .apply()
+ grp = df.groupby(by="a")
+ args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
+ _ = getattr(grp, reduction_func)(*args)
+ result = grp.apply(sum)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
+ # GH 29617
+
+ df = pd.DataFrame(
+ {
+ "A": ["a", "a", "a", "b"],
+ "B": [
+ date(2020, 1, 10),
+ date(2020, 1, 10),
+ date(2020, 2, 10),
+ date(2020, 2, 10),
+ ],
+ "C": [1, 2, 3, 4],
+ },
+ index=pd.Index([100, 101, 102, 103], name="idx"),
+ )
+
+ grp = df.groupby(["A", "B"])
+ result = grp.apply(lambda x: x.head(1))
+
+ expected = df.iloc[[0, 2, 3]]
+ expected = expected.reset_index()
+ expected.index = pd.MultiIndex.from_frame(expected[["A", "B", "idx"]])
+ expected = expected.drop(columns="idx")
+
+ tm.assert_frame_equal(result, expected)
+ for val in result.index.levels[1]:
+ assert type(val) is date
+
+
+def test_apply_by_cols_equals_apply_by_rows_transposed():
+ # GH 16646
+ # Operating on the columns, or transposing and operating on the rows
+ # should give the same result. There was previously a bug where the
+ # by_rows operation would work fine, but by_cols would throw a ValueError
+
+ df = pd.DataFrame(
+ np.random.random([6, 4]),
+ columns=pd.MultiIndex.from_product([["A", "B"], [1, 2]]),
+ )
+
+ by_rows = df.T.groupby(axis=0, level=0).apply(
+ lambda x: x.droplevel(axis=0, level=0)
+ )
+ by_cols = df.groupby(axis=1, level=0).apply(lambda x: x.droplevel(axis=1, level=0))
+
+ tm.assert_frame_equal(by_cols, by_rows.T)
+ tm.assert_frame_equal(by_cols, df)
diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py
index 60c82bf1fb71c..711daf7fe415d 100644
--- a/pandas/tests/groupby/test_categorical.py
+++ b/pandas/tests/groupby/test_categorical.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-from pandas.compat import PY37, is_platform_windows
-
import pandas as pd
from pandas import (
Categorical,
@@ -13,14 +11,13 @@
Index,
MultiIndex,
Series,
- _np_version_under1p17,
qcut,
)
import pandas._testing as tm
-def cartesian_product_for_groupers(result, args, names):
- """ Reindex to a cartesian production for the groupers,
+def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
+ """Reindex to a cartesian production for the groupers,
preserving the nature (Categorical) of each grouper
"""
@@ -33,7 +30,42 @@ def f(a):
return a
index = MultiIndex.from_product(map(f, args), names=names)
- return result.reindex(index).sort_index()
+ return result.reindex(index, fill_value=fill_value).sort_index()
+
+
+_results_for_groupbys_with_missing_categories = dict(
+ # This maps the builtin groupby functions to their expected outputs for
+ # missing categories when they are called on a categorical grouper with
+ # observed=False. Some functions are expected to return NaN, some zero.
+ # These expected values can be used across several tests (i.e. they are
+ # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
+ # hardcoded in one place.
+ [
+ ("all", np.NaN),
+ ("any", np.NaN),
+ ("count", 0),
+ ("corrwith", np.NaN),
+ ("first", np.NaN),
+ ("idxmax", np.NaN),
+ ("idxmin", np.NaN),
+ ("last", np.NaN),
+ ("mad", np.NaN),
+ ("max", np.NaN),
+ ("mean", np.NaN),
+ ("median", np.NaN),
+ ("min", np.NaN),
+ ("nth", np.NaN),
+ ("nunique", 0),
+ ("prod", np.NaN),
+ ("quantile", np.NaN),
+ ("sem", np.NaN),
+ ("size", 0),
+ ("skew", np.NaN),
+ ("std", np.NaN),
+ ("sum", 0),
+ ("var", np.NaN),
+ ]
+)
def test_apply_use_categorical_name(df):
@@ -209,12 +241,6 @@ def test_level_get_group(observed):
tm.assert_frame_equal(result, expected)
-# GH#21636 flaky on py37; may be related to older numpy, see discussion
-# https://github.com/MacPython/pandas-wheels/pull/64
-@pytest.mark.xfail(
- PY37 and _np_version_under1p17 and not is_platform_windows(),
- reason="Flaky, GH-27902",
-)
@pytest.mark.parametrize("ordered", [True, False])
def test_apply(ordered):
# GH 10138
@@ -274,7 +300,7 @@ def test_observed(observed):
result = gb.sum()
if not observed:
expected = cartesian_product_for_groupers(
- expected, [cat1, cat2, ["foo", "bar"]], list("ABC")
+ expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
)
tm.assert_frame_equal(result, expected)
@@ -284,7 +310,9 @@ def test_observed(observed):
expected = DataFrame({"values": [1, 2, 3, 4]}, index=exp_index)
result = gb.sum()
if not observed:
- expected = cartesian_product_for_groupers(expected, [cat1, cat2], list("AB"))
+ expected = cartesian_product_for_groupers(
+ expected, [cat1, cat2], list("AB"), fill_value=0
+ )
tm.assert_frame_equal(result, expected)
@@ -1154,6 +1182,8 @@ def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
).sortlevel()
expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
+ if operation == "agg":
+ expected = expected.fillna(0, downcast="infer")
grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
result = getattr(grouped, operation)(sum)
tm.assert_series_equal(result, expected)
@@ -1259,16 +1289,15 @@ def test_get_nonexistent_category():
)
-def test_series_groupby_on_2_categoricals_unobserved(
- reduction_func: str, observed: bool, request
-):
+def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request):
# GH 17605
-
if reduction_func == "ngroup":
pytest.skip("ngroup is not truly a reduction")
if reduction_func == "corrwith": # GH 32293
- mark = pytest.mark.xfail(reason="TODO: implemented SeriesGroupBy.corrwith")
+ mark = pytest.mark.xfail(
+ reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
+ )
request.node.add_marker(mark)
df = pd.DataFrame(
@@ -1289,36 +1318,21 @@ def test_series_groupby_on_2_categoricals_unobserved(
assert len(result) == expected_length
-@pytest.mark.parametrize(
- "func, zero_or_nan",
- [
- ("all", np.NaN),
- ("any", np.NaN),
- ("count", 0),
- ("first", np.NaN),
- ("idxmax", np.NaN),
- ("idxmin", np.NaN),
- ("last", np.NaN),
- ("mad", np.NaN),
- ("max", np.NaN),
- ("mean", np.NaN),
- ("median", np.NaN),
- ("min", np.NaN),
- ("nth", np.NaN),
- ("nunique", 0),
- ("prod", np.NaN),
- ("quantile", np.NaN),
- ("sem", np.NaN),
- ("size", 0),
- ("skew", np.NaN),
- ("std", np.NaN),
- ("sum", np.NaN),
- ("var", np.NaN),
- ],
-)
-def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_or_nan):
+def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
+ reduction_func, request
+):
# GH 17605
# Tests whether the unobserved categories in the result contain 0 or NaN
+
+ if reduction_func == "ngroup":
+ pytest.skip("ngroup is not truly a reduction")
+
+ if reduction_func == "corrwith": # GH 32293
+ mark = pytest.mark.xfail(
+ reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
+ )
+ request.node.add_marker(mark)
+
df = pd.DataFrame(
{
"cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
@@ -1327,21 +1341,86 @@ def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(func, zero_o
}
)
unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
- args = {"nth": [0]}.get(func, [])
+ args = {"nth": [0]}.get(reduction_func, [])
series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
- agg = getattr(series_groupby, func)
+ agg = getattr(series_groupby, reduction_func)
result = agg(*args)
+ zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
+
for idx in unobserved:
val = result.loc[idx]
assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
- # If we expect unobserved values to be zero, we also expect the dtype to be int
- if zero_or_nan == 0:
+ # If we expect unobserved values to be zero, we also expect the dtype to be int.
+ # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
+ # sums have decimals), then the zeros for the missing categories should also be
+ # floats.
+ if zero_or_nan == 0 and reduction_func != "sum":
assert np.issubdtype(result.dtype, np.integer)
+def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func):
+ # GH 23865
+ # GH 27075
+ # Ensure that df.groupby, when 'by' is two pd.Categorical variables,
+ # does not return the categories that are not in df when observed=True
+ if reduction_func == "ngroup":
+ pytest.skip("ngroup does not return the Categories on the index")
+
+ df = pd.DataFrame(
+ {
+ "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
+ "cat_2": pd.Categorical(list("1111"), categories=list("12")),
+ "value": [0.1, 0.1, 0.1, 0.1],
+ }
+ )
+ unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
+
+ df_grp = df.groupby(["cat_1", "cat_2"], observed=True)
+
+ args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
+ res = getattr(df_grp, reduction_func)(*args)
+
+ for cat in unobserved_cats:
+ assert cat not in res.index
+
+
+@pytest.mark.parametrize("observed", [False, None])
+def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
+ reduction_func, observed, request
+):
+ # GH 23865
+ # GH 27075
+ # Ensure that df.groupby, when 'by' is two pd.Categorical variables,
+ # returns the categories that are not in df when observed=False/None
+
+ if reduction_func == "ngroup":
+ pytest.skip("ngroup does not return the Categories on the index")
+
+ df = pd.DataFrame(
+ {
+ "cat_1": pd.Categorical(list("AABB"), categories=list("ABC")),
+ "cat_2": pd.Categorical(list("1111"), categories=list("12")),
+ "value": [0.1, 0.1, 0.1, 0.1],
+ }
+ )
+ unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
+
+ df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)
+
+ args = {"nth": [0], "corrwith": [df]}.get(reduction_func, [])
+ res = getattr(df_grp, reduction_func)(*args)
+
+ expected = _results_for_groupbys_with_missing_categories[reduction_func]
+
+ if expected is np.nan:
+ assert res.loc[unobserved_cats].isnull().all().all()
+ else:
+ assert (res.loc[unobserved_cats] == expected).all().all()
+
+
def test_series_groupby_categorical_aggregation_getitem():
# GH 8870
d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
@@ -1370,7 +1449,7 @@ def test_groupby_agg_categorical_columns(func, expected_values):
result = df.groupby("groups").agg(func)
expected = pd.DataFrame(
- {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups"),
+ {"value": expected_values}, index=pd.Index([0, 1, 2], name="groups")
)
tm.assert_frame_equal(result, expected)
@@ -1509,3 +1588,103 @@ def test_aggregate_categorical_with_isnan():
index=index,
)
tm.assert_frame_equal(result, expected)
+
+
+def test_categorical_transform():
+ # GH 29037
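+ # transform(max) over an ordered categorical should pick the greatest
+ # category per group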
+ df = pd.DataFrame(
+ {
+ "package_id": [1, 1, 1, 2, 2, 3],
+ "status": [
+ "Waiting",
+ "OnTheWay",
+ "Delivered",
+ "Waiting",
+ "OnTheWay",
+ "Waiting",
+ ],
+ }
+ )
+
+ delivery_status_type = pd.CategoricalDtype(
+ categories=["Waiting", "OnTheWay", "Delivered"], ordered=True
+ )
+ df["status"] = df["status"].astype(delivery_status_type)
+ df["last_status"] = df.groupby("package_id")["status"].transform(max)
+ result = df.copy()
+
+ expected = pd.DataFrame(
+ {
+ "package_id": [1, 1, 1, 2, 2, 3],
+ "status": [
+ "Waiting",
+ "OnTheWay",
+ "Delivered",
+ "Waiting",
+ "OnTheWay",
+ "Waiting",
+ ],
+ "last_status": [
+ "Delivered",
+ "Delivered",
+ "Delivered",
+ "OnTheWay",
+ "OnTheWay",
+ "Waiting",
+ ],
+ }
+ )
+
+ expected["status"] = expected["status"].astype(delivery_status_type)
+
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
+ func: str, observed: bool
+):
+ # GH 34951
+ cat = pd.Categorical([0, 0, 1, 1])
+ val = [0, 1, 1, 0]
+ df = pd.DataFrame({"a": cat, "b": cat, "c": val})
+
+ idx = pd.Categorical([0, 1])
+ idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"])
+ expected_dict = {
+ "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"),
+ "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"),
+ }
+
+ expected = expected_dict[func]
+ if observed:
+ expected = expected.dropna().astype(np.int64)
+
+ srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
+ result = getattr(srs_grp, func)()
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["first", "last"])
+def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
+ func: str, observed: bool
+):
+ # GH 34951
+ cat = pd.Categorical([0, 0, 1, 1])
+ val = [0, 1, 1, 0]
+ df = pd.DataFrame({"a": cat, "b": cat, "c": val})
+
+ idx = pd.Categorical([0, 1])
+ idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"])
+ expected_dict = {
+ "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"),
+ "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"),
+ }
+
+ expected = expected_dict[func].to_frame()
+ if observed:
+ expected = expected.dropna().astype(np.int64)
+
+ df_grp = df.groupby(["a", "b"], observed=observed)
+ result = getattr(df_grp, func)()
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 6f19ec40c2520..ab736b55b5743 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -85,6 +85,24 @@ def test_max_min_non_numeric():
assert "ss" in result
+def test_min_date_with_nans():
+ # GH26321
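+ # grouped min over a column of datetime.date objects should work even
+ # when another column holds NaNs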
+ dates = pd.to_datetime(
+ pd.Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
+ ).dt.date
+ df = pd.DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
+
+ result = df.groupby("b", as_index=False)["c"].min()["c"]
+ expected = pd.to_datetime(
+ pd.Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
+ ).dt.date
+ tm.assert_series_equal(result, expected)
+
+ result = df.groupby("b")["c"].min()
+ expected.index.name = "b"
+ tm.assert_series_equal(result, expected)
+
+
def test_intercept_builtin_sum():
s = Series([1.0, 2.0, np.nan, 3.0])
grouped = s.groupby([0, 1, 2, 2])
@@ -477,51 +495,6 @@ def test_idxmin_idxmax_returns_int_types(func, values):
tm.assert_frame_equal(result, expected)
-def test_fill_consistency():
-
- # GH9221
- # pass thru keyword arguments to the generated wrapper
- # are set if the passed kw is None (only)
- df = DataFrame(
- index=pd.MultiIndex.from_product(
- [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
- ),
- columns=Index(["1", "2"], name="id"),
- )
- df["1"] = [
- np.nan,
- 1,
- np.nan,
- np.nan,
- 11,
- np.nan,
- np.nan,
- 2,
- np.nan,
- np.nan,
- 22,
- np.nan,
- ]
- df["2"] = [
- np.nan,
- 3,
- np.nan,
- np.nan,
- 33,
- np.nan,
- np.nan,
- 4,
- np.nan,
- np.nan,
- 44,
- np.nan,
- ]
-
- expected = df.groupby(level=0, axis=0).fillna(method="ffill")
- result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
- tm.assert_frame_equal(result, expected)
-
-
def test_groupby_cumprod():
# GH 4095
df = pd.DataFrame({"key": ["b"] * 10, "value": 2})
@@ -922,10 +895,6 @@ def test_frame_describe_multikey(tsframe):
groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
result = groupedT.describe()
expected = tsframe.describe().T
- expected.index = pd.MultiIndex(
- levels=[[0, 1], expected.index],
- codes=[[0, 0, 1, 1], range(len(expected.index))],
- )
tm.assert_frame_equal(result, expected)
@@ -974,6 +943,68 @@ def test_frame_describe_unstacked_format():
tm.assert_frame_equal(result, expected)
+@pytest.mark.filterwarnings(
+ "ignore:"
+ "indexing past lexsort depth may impact performance:"
+ "pandas.errors.PerformanceWarning"
+)
+@pytest.mark.parametrize("as_index", [True, False])
+def test_describe_with_duplicate_output_column_names(as_index):
+ # GH 35314
+ df = pd.DataFrame(
+ {
+ "a": [99, 99, 99, 88, 88, 88],
+ "b": [1, 2, 3, 4, 5, 6],
+ "c": [10, 20, 30, 40, 50, 60],
+ },
+ columns=["a", "b", "b"],
+ )
+
+ expected = (
+ pd.DataFrame.from_records(
+ [
+ ("a", "count", 3.0, 3.0),
+ ("a", "mean", 88.0, 99.0),
+ ("a", "std", 0.0, 0.0),
+ ("a", "min", 88.0, 99.0),
+ ("a", "25%", 88.0, 99.0),
+ ("a", "50%", 88.0, 99.0),
+ ("a", "75%", 88.0, 99.0),
+ ("a", "max", 88.0, 99.0),
+ ("b", "count", 3.0, 3.0),
+ ("b", "mean", 5.0, 2.0),
+ ("b", "std", 1.0, 1.0),
+ ("b", "min", 4.0, 1.0),
+ ("b", "25%", 4.5, 1.5),
+ ("b", "50%", 5.0, 2.0),
+ ("b", "75%", 5.5, 2.5),
+ ("b", "max", 6.0, 3.0),
+ ("b", "count", 3.0, 3.0),
+ ("b", "mean", 5.0, 2.0),
+ ("b", "std", 1.0, 1.0),
+ ("b", "min", 4.0, 1.0),
+ ("b", "25%", 4.5, 1.5),
+ ("b", "50%", 5.0, 2.0),
+ ("b", "75%", 5.5, 2.5),
+ ("b", "max", 6.0, 3.0),
+ ],
+ )
+ .set_index([0, 1])
+ .T
+ )
+ expected.columns.names = [None, None]
+ expected.index = pd.Index([88, 99], name="a")
+
+ if as_index:
+ expected = expected.drop(columns=["a"], level=0)
+ else:
+ expected = expected.reset_index(drop=True)
+
+ result = df.groupby("a", as_index=as_index).describe()
+
+ tm.assert_frame_equal(result, expected)
+
+
def test_groupby_mean_no_overflow():
# Regression test for (#22487)
df = pd.DataFrame(
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 0d040b8e6955a..6783fc5b66433 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -249,8 +249,8 @@ def test_len():
# issue 11016
df = pd.DataFrame(dict(a=[np.nan] * 3, b=[1, 2, 3]))
- assert len(df.groupby(("a"))) == 0
- assert len(df.groupby(("b"))) == 3
+ assert len(df.groupby("a")) == 0
+ assert len(df.groupby("b")) == 3
assert len(df.groupby(["a", "b"])) == 3
@@ -605,6 +605,14 @@ def test_as_index_select_column():
tm.assert_series_equal(result, expected)
+def test_groupby_as_index_select_column_sum_empty_df():
+ # GH 35246
+ df = DataFrame(columns=["A", "B", "C"])
+ left = df.groupby(by="A", as_index=False)["B"].sum()
+ assert type(left) is DataFrame
+ assert left.to_dict() == {"A": {}, "B": {}}
+
+
def test_groupby_as_index_agg(df):
grouped = df.groupby("A", as_index=False)
@@ -668,7 +676,7 @@ def test_ops_not_as_index(reduction_func):
if reduction_func in ("corrwith",):
pytest.skip("Test not applicable")
- if reduction_func in ("nth", "ngroup",):
+ if reduction_func in ("nth", "ngroup"):
pytest.skip("Skip until behavior is determined (GH #5755)")
df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
@@ -1175,6 +1183,18 @@ def test_groupby_dtype_inference_empty():
tm.assert_frame_equal(result, expected, by_blocks=True)
+def test_groupby_uint64_float_conversion():
+ # GH: 30859 groupby converts uint64 to floats sometimes
+ df = pd.DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})
+ result = df.groupby(["first", "second"])["value"].max()
+ expected = pd.Series(
+ [16148277970000000000],
+ pd.MultiIndex.from_product([[1], [1]], names=["first", "second"]),
+ name="value",
+ )
+ tm.assert_series_equal(result, expected)
+
+
def test_groupby_list_infer_array_like(df):
result = df.groupby(list(df["A"])).mean()
expected = df.groupby(df["A"]).mean()
@@ -1941,13 +1961,6 @@ def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
tm.assert_frame_equal(result, expected)
-def test_ffill_missing_arguments():
- # GH 14955
- df = pd.DataFrame({"a": [1, 2], "b": [1, 1]})
- with pytest.raises(ValueError, match="Must specify a fill"):
- df.groupby("b").fillna()
-
-
def test_groupby_only_none_group():
# see GH21624
# this was crashing with "ValueError: Length of passed values is 1, index implies 0"
@@ -2047,3 +2060,80 @@ def test_groups_repr_truncates(max_seq_items, expected):
result = df.groupby(np.array(df.a)).groups.__repr__()
assert result == expected
+
+
+def test_group_on_two_row_multiindex_returns_one_tuple_key():
+ # GH 18451
+ df = pd.DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
+ df = df.set_index(["a", "b"])
+
+ grp = df.groupby(["a", "b"])
+ result = grp.indices
+ expected = {(1, 2): np.array([0, 1], dtype=np.int64)}
+
+ assert len(result) == 1
+ key = (1, 2)
+ assert (result[key] == expected[key]).all()
+
+
+@pytest.mark.parametrize(
+ "klass, attr, value",
+ [
+ (DataFrame, "axis", 1),
+ (DataFrame, "level", "a"),
+ (DataFrame, "as_index", False),
+ (DataFrame, "sort", False),
+ (DataFrame, "group_keys", False),
+ (DataFrame, "squeeze", True),
+ (DataFrame, "observed", True),
+ (DataFrame, "dropna", False),
+ pytest.param(
+ Series,
+ "axis",
+ 1,
+ marks=pytest.mark.xfail(
+ reason="GH 35443: Attribute currently not passed on to series"
+ ),
+ ),
+ (Series, "level", "a"),
+ (Series, "as_index", False),
+ (Series, "sort", False),
+ (Series, "group_keys", False),
+ (Series, "squeeze", True),
+ (Series, "observed", True),
+ (Series, "dropna", False),
+ ],
+)
+@pytest.mark.filterwarnings(
+ "ignore:The `squeeze` parameter is deprecated:FutureWarning"
+)
+def test_subsetting_columns_keeps_attrs(klass, attr, value):
+ # GH 9959 - When subsetting columns, don't drop attributes
+ df = pd.DataFrame({"a": [1], "b": [2], "c": [3]})
+ if attr != "axis":
+ df = df.set_index("a")
+
+ expected = df.groupby("a", **{attr: value})
+ result = expected[["b"]] if klass is DataFrame else expected["b"]
+ assert getattr(result, attr) == getattr(expected, attr)
+
+
+@pytest.mark.parametrize("func", ["sum", "any", "shift"])
+def test_groupby_column_index_name_lost(func):
+ # GH: 29764 groupby loses index sometimes
+ expected = pd.Index(["a"], name="idx")
+ df = pd.DataFrame([[1]], columns=expected)
+ df_grouped = df.groupby([1])
+ result = getattr(df_grouped, func)().columns
+ tm.assert_index_equal(result, expected)
+
+
+def test_groupby_duplicate_columns():
+ # GH: 31735
+ df = pd.DataFrame(
+ {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
+ ).astype(object)
+ df.columns = ["A", "B", "B"]
+ result = df.groupby([0, 0, 0, 0]).min()
+ expected = pd.DataFrame([["e", "a", 1]], columns=["A", "B", "B"])
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
index 1a525d306e9f5..deb73acbb158a 100644
--- a/pandas/tests/groupby/test_groupby_dropna.py
+++ b/pandas/tests/groupby/test_groupby_dropna.py
@@ -162,6 +162,40 @@ def test_groupby_dropna_series_by(dropna, expected):
tm.assert_series_equal(result, expected)
+@pytest.mark.parametrize(
+ "dropna,df_expected,s_expected",
+ [
+ pytest.param(
+ True,
+ pd.DataFrame({"B": [2, 2, 1]}),
+ pd.Series(data=[2, 2, 1], name="B"),
+ marks=pytest.mark.xfail(raises=ValueError),
+ ),
+ (
+ False,
+ pd.DataFrame({"B": [2, 2, 1, 1]}),
+ pd.Series(data=[2, 2, 1, 1], name="B"),
+ ),
+ ],
+)
+def test_slice_groupby_then_transform(dropna, df_expected, s_expected):
+ # GH35014
+
+ df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
+ gb = df.groupby("A", dropna=dropna)
+
+ res = gb.transform(len)
+ tm.assert_frame_equal(res, df_expected)
+
+ gb_slice = gb[["B"]]
+ res = gb_slice.transform(len)
+ tm.assert_frame_equal(res, df_expected)
+
+ gb_slice = gb["B"]
+ res = gb["B"].transform(len)
+ tm.assert_series_equal(res, s_expected)
+
+
@pytest.mark.parametrize(
"dropna, tuples, outputs",
[
@@ -204,6 +238,7 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
tm.assert_frame_equal(grouped, expected)
+@pytest.mark.arm_slow
@pytest.mark.parametrize(
"datetime1, datetime2",
[
@@ -212,9 +247,7 @@ def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
(pd.Period("2020-01-01"), pd.Period("2020-02-01")),
],
)
-@pytest.mark.parametrize(
- "dropna, values", [(True, [12, 3]), (False, [12, 3, 6],)],
-)
+@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
@@ -242,3 +275,56 @@ def test_groupby_dropna_datetime_like_data(
expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))
tm.assert_frame_equal(grouped, expected)
+
+
+@pytest.mark.parametrize(
+ "dropna, data, selected_data, levels",
+ [
+ pytest.param(
+ False,
+ {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0, 0]},
+ ["a", "b", np.nan],
+ id="dropna_false_has_nan",
+ ),
+ pytest.param(
+ True,
+ {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0]},
+ None,
+ id="dropna_true_has_nan",
+ ),
+ pytest.param(
+ # no nan in "groups"; dropna=True|False should be same.
+ False,
+ {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0, 0]},
+ None,
+ id="dropna_false_no_nan",
+ ),
+ pytest.param(
+ # no nan in "groups"; dropna=True|False should be same.
+ True,
+ {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
+ {"values": [0, 1, 0, 0]},
+ None,
+ id="dropna_true_no_nan",
+ ),
+ ],
+)
+def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
+ # GH 35889
+
+ df = pd.DataFrame(data)
+ gb = df.groupby("groups", dropna=dropna)
+ result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))
+
+ mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
+ mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
+ # MultiIndex.from_* currently drops NA from the levels by default,
+ # so the NA level has to be added back manually afterwards.
+ if not dropna and levels:
+ mi = mi.set_levels(levels, level="groups")
+
+ expected = pd.DataFrame(selected_data, index=mi)
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby_subclass.py b/pandas/tests/groupby/test_groupby_subclass.py
index 7271911c5f80f..cc7a79e976513 100644
--- a/pandas/tests/groupby/test_groupby_subclass.py
+++ b/pandas/tests/groupby/test_groupby_subclass.py
@@ -51,9 +51,7 @@ def test_groupby_preserves_subclass(obj, groupby_func):
tm.assert_series_equal(result1, result2)
-@pytest.mark.parametrize(
- "obj", [DataFrame, tm.SubclassedDataFrame],
-)
+@pytest.mark.parametrize("obj", [DataFrame, tm.SubclassedDataFrame])
def test_groupby_resample_preserves_subclass(obj):
# GH28330 -- preserve subclass through groupby.resample()
diff --git a/pandas/tests/groupby/test_grouping.py b/pandas/tests/groupby/test_grouping.py
index efcd22f9c0c82..18ef95c05f291 100644
--- a/pandas/tests/groupby/test_grouping.py
+++ b/pandas/tests/groupby/test_grouping.py
@@ -191,13 +191,15 @@ def test_grouper_creation_bug(self):
result = g.sum()
tm.assert_frame_equal(result, expected)
- result = g.apply(lambda x: x.sum())
- tm.assert_frame_equal(result, expected)
-
g = df.groupby(pd.Grouper(key="A", axis=0))
result = g.sum()
tm.assert_frame_equal(result, expected)
+ result = g.apply(lambda x: x.sum())
+ expected["A"] = [0, 2, 4]
+ expected = expected.loc[:, ["A", "B"]]
+ tm.assert_frame_equal(result, expected)
+
# GH14334
# pd.Grouper(key=...) may be passed in a list
df = DataFrame(
@@ -737,7 +739,7 @@ def test_get_group(self):
with pytest.raises(ValueError, match=msg):
g.get_group("foo")
with pytest.raises(ValueError, match=msg):
- g.get_group(("foo"))
+ g.get_group("foo")
msg = "must supply a same-length tuple to get_group with multiple grouping keys"
with pytest.raises(ValueError, match=msg):
g.get_group(("foo", "bar", "baz"))
diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py
new file mode 100644
index 0000000000000..116aed9935694
--- /dev/null
+++ b/pandas/tests/groupby/test_missing.py
@@ -0,0 +1,84 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import DataFrame, Index, date_range
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_groupby_column_index_name_lost_fill_funcs(func):
+ # GH: 29764 groupby loses index sometimes
+ df = pd.DataFrame(
+ [[1, 1.0, -1.0], [1, np.nan, np.nan], [1, 2.0, -2.0]],
+ columns=pd.Index(["type", "a", "b"], name="idx"),
+ )
+ df_grouped = df.groupby(["type"])[["a", "b"]]
+ result = getattr(df_grouped, func)().columns
+ expected = pd.Index(["a", "b"], name="idx")
+ tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("func", ["ffill", "bfill"])
+def test_groupby_fill_duplicate_column_names(func):
+ # GH: 25610 ValueError with duplicate column names
+ df1 = pd.DataFrame({"field1": [1, 3, 4], "field2": [1, 3, 4]})
+ df2 = pd.DataFrame({"field1": [1, np.nan, 4]})
+ df_grouped = pd.concat([df1, df2], axis=1).groupby(by=["field2"])
+ expected = pd.DataFrame(
+ [[1, 1.0], [3, np.nan], [4, 4.0]], columns=["field1", "field1"]
+ )
+ result = getattr(df_grouped, func)()
+ tm.assert_frame_equal(result, expected)
+
+
+def test_ffill_missing_arguments():
+ # GH 14955
+ df = pd.DataFrame({"a": [1, 2], "b": [1, 1]})
+ with pytest.raises(ValueError, match="Must specify a fill"):
+ df.groupby("b").fillna()
+
+
+def test_fill_consistency():
+
+ # GH9221
+ # pass thru keyword arguments to the generated wrapper
+ # are set if the passed kw is None (only)
+ df = DataFrame(
+ index=pd.MultiIndex.from_product(
+ [["value1", "value2"], date_range("2014-01-01", "2014-01-06")]
+ ),
+ columns=Index(["1", "2"], name="id"),
+ )
+ df["1"] = [
+ np.nan,
+ 1,
+ np.nan,
+ np.nan,
+ 11,
+ np.nan,
+ np.nan,
+ 2,
+ np.nan,
+ np.nan,
+ 22,
+ np.nan,
+ ]
+ df["2"] = [
+ np.nan,
+ 3,
+ np.nan,
+ np.nan,
+ 33,
+ np.nan,
+ np.nan,
+ 4,
+ np.nan,
+ np.nan,
+ 44,
+ np.nan,
+ ]
+
+ expected = df.groupby(level=0, axis=0).fillna(method="ffill")
+ result = df.T.groupby(level=0, axis=1).fillna(method="ffill").T
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_nunique.py b/pandas/tests/groupby/test_nunique.py
index 1475b1ce2907c..c3347b7ae52f3 100644
--- a/pandas/tests/groupby/test_nunique.py
+++ b/pandas/tests/groupby/test_nunique.py
@@ -167,3 +167,11 @@ def test_nunique_preserves_column_level_names():
result = test.groupby([0, 0, 0]).nunique()
expected = pd.DataFrame([2], columns=test.columns)
tm.assert_frame_equal(result, expected)
+
+
+def test_nunique_transform_with_datetime():
+ # GH 35109 - transform with nunique on datetimes results in integers
+ df = pd.DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"])
+ result = df.groupby([0, 0, 1])["date"].transform("nunique")
+ expected = pd.Series([2, 2, 1], name="date")
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py
index 8cfd8035502c3..9338742195bfe 100644
--- a/pandas/tests/groupby/test_quantile.py
+++ b/pandas/tests/groupby/test_quantile.py
@@ -232,3 +232,11 @@ def test_groupby_quantile_nullable_array(values, q):
expected = pd.Series(true_quantiles * 2, index=idx, name="b")
tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
+def test_groupby_quantile_skips_invalid_dtype(q):
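+ # the object column "c" cannot be quantiled and should be dropped
+ # from the result instead of raising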
+ df = pd.DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
+ result = df.groupby("a").quantile(q)
+ expected = df.groupby("a")[["b"]].quantile(q)
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py
index 9cff8b966dad0..ba27e5a24ba00 100644
--- a/pandas/tests/groupby/test_size.py
+++ b/pandas/tests/groupby/test_size.py
@@ -53,7 +53,7 @@ def test_size_on_categorical(as_index):
result = df.groupby(["A", "B"], as_index=as_index).size()
expected = DataFrame(
- [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"],
+ [[1, 1, 1], [1, 2, 0], [2, 1, 0], [2, 2, 1]], columns=["A", "B", "size"]
)
expected["A"] = expected["A"].astype("category")
if as_index:
diff --git a/pandas/tests/groupby/test_timegrouper.py b/pandas/tests/groupby/test_timegrouper.py
index 84fd7a1bdfb05..4ccbc6a65fd88 100644
--- a/pandas/tests/groupby/test_timegrouper.py
+++ b/pandas/tests/groupby/test_timegrouper.py
@@ -780,6 +780,6 @@ def test_grouper_period_index(self):
result = period_series.groupby(period_series.index.month).sum()
expected = pd.Series(
- range(0, periods), index=Index(range(1, periods + 1), name=index.name),
+ range(0, periods), index=Index(range(1, periods + 1), name=index.name)
)
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/groupby/transform/test_numba.py b/pandas/tests/groupby/transform/test_numba.py
index 9a4015ac983c5..3a184bdd007c7 100644
--- a/pandas/tests/groupby/transform/test_numba.py
+++ b/pandas/tests/groupby/transform/test_numba.py
@@ -3,7 +3,7 @@
from pandas.errors import NumbaUtilError
import pandas.util._test_decorators as td
-from pandas import DataFrame
+from pandas import DataFrame, option_context
import pandas._testing as tm
from pandas.core.util.numba_ import NUMBA_FUNC_CACHE
@@ -56,7 +56,7 @@ def func(values, index):
func = numba.jit(func)
data = DataFrame(
- {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1],
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
@@ -89,7 +89,7 @@ def func_2(values, index):
func_2 = numba.jit(func_2)
data = DataFrame(
- {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1],
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
)
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}
grouped = data.groupby(0)
@@ -112,3 +112,34 @@ def func_2(values, index):
result = grouped.transform(func_1, engine="numba", engine_kwargs=engine_kwargs)
expected = grouped.transform(lambda x: x + 1, engine="cython")
tm.assert_equal(result, expected)
+
+
+@td.skip_if_no("numba", "0.46.0")
+def test_use_global_config():
+ def func_1(values, index):
+ return values + 1
+
+ data = DataFrame(
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+ )
+ grouped = data.groupby(0)
+ expected = grouped.transform(func_1, engine="numba")
+ with option_context("compute.use_numba", True):
+ result = grouped.transform(func_1, engine=None)
+ tm.assert_frame_equal(expected, result)
+
+
+@td.skip_if_no("numba", "0.46.0")
+@pytest.mark.parametrize(
+ "agg_func", [["min", "max"], "min", {"B": ["min", "max"], "C": "sum"}]
+)
+def test_multifunc_notimplemented(agg_func):
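+ # as with .agg, the numba engine rejects multiple functions for .transform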
+ data = DataFrame(
+ {0: ["a", "a", "b", "b", "a"], 1: [1.0, 2.0, 3.0, 4.0, 5.0]}, columns=[0, 1]
+ )
+ grouped = data.groupby(0)
+ with pytest.raises(NotImplementedError, match="Numba engine can"):
+ grouped.transform(agg_func, engine="numba")
+
+ with pytest.raises(NotImplementedError, match="Numba engine can"):
+ grouped[1].transform(agg_func, engine="numba")
diff --git a/pandas/tests/groupby/transform/test_transform.py b/pandas/tests/groupby/transform/test_transform.py
index cdaf27e214d80..97be039e16ebb 100644
--- a/pandas/tests/groupby/transform/test_transform.py
+++ b/pandas/tests/groupby/transform/test_transform.py
@@ -4,7 +4,7 @@
import numpy as np
import pytest
-from pandas._libs import groupby
+from pandas._libs.groupby import group_cumprod_float64, group_cumsum
from pandas.core.dtypes.common import ensure_platform_int, is_timedelta64_dtype
@@ -545,14 +545,14 @@ def _check_cython_group_transform_cumulative(pd_op, np_op, dtype):
def test_cython_group_transform_cumsum(any_real_dtype):
# see gh-4095
dtype = np.dtype(any_real_dtype).type
- pd_op, np_op = groupby.group_cumsum, np.cumsum
+ pd_op, np_op = group_cumsum, np.cumsum
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
def test_cython_group_transform_cumprod():
# see gh-4095
dtype = np.float64
- pd_op, np_op = groupby.group_cumprod_float64, np.cumproduct
+ pd_op, np_op = group_cumprod_float64, np.cumproduct
_check_cython_group_transform_cumulative(pd_op, np_op, dtype)
@@ -567,13 +567,13 @@ def test_cython_group_transform_algos():
data = np.array([[1], [2], [3], [np.nan], [4]], dtype="float64")
actual = np.zeros_like(data)
actual.fill(np.nan)
- groupby.group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
+ group_cumprod_float64(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 2, 6, np.nan, 24], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
actual = np.zeros_like(data)
actual.fill(np.nan)
- groupby.group_cumsum(actual, data, labels, ngroups, is_datetimelike)
+ group_cumsum(actual, data, labels, ngroups, is_datetimelike)
expected = np.array([1, 3, 6, np.nan, 10], dtype="float64")
tm.assert_numpy_array_equal(actual[:, 0], expected)
@@ -581,7 +581,7 @@ def test_cython_group_transform_algos():
is_datetimelike = True
data = np.array([np.timedelta64(1, "ns")] * 5, dtype="m8[ns]")[:, None]
actual = np.zeros_like(data, dtype="int64")
- groupby.group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
+ group_cumsum(actual, data.view("int64"), labels, ngroups, is_datetimelike)
expected = np.array(
[
np.timedelta64(1, "ns"),
@@ -675,6 +675,7 @@ def test_groupby_cum_skipna(op, skipna, input, exp):
tm.assert_series_equal(expected, result)
+@pytest.mark.arm_slow
@pytest.mark.parametrize(
"op, args, targop",
[
diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py
new file mode 100644
index 0000000000000..b2fa8f31ee5ec
--- /dev/null
+++ b/pandas/tests/indexes/base_class/test_indexing.py
@@ -0,0 +1,26 @@
+import pytest
+
+from pandas import Index
+
+
+class TestGetSliceBounds:
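+ # get_slice_bound returns the integer position bounding the label
+ # on the requested side ("left" or "right")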
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)])
+ def test_get_slice_bounds_within(self, kind, side, expected):
+ index = Index(list("abcdef"))
+ result = index.get_slice_bound("e", kind=kind, side=side)
+ assert result == expected
+
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ @pytest.mark.parametrize("side", ["left", "right"])
+ @pytest.mark.parametrize(
+ "data, bound, expected", [(list("abcdef"), "x", 6), (list("bcdefg"), "a", 0)]
+ )
+ def test_get_slice_bounds_outside(self, kind, side, expected, data, bound):
+ index = Index(data)
+ result = index.get_slice_bound(bound, kind=kind, side=side)
+ assert result == expected
+
+ def test_get_slice_bounds_invalid_side(self):
+ with pytest.raises(ValueError, match="Invalid value for side kwarg"):
+ Index([]).get_slice_bound("a", kind=None, side="middle")
diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py
index 7f30a77872bc1..a3a06338a0277 100644
--- a/pandas/tests/indexes/categorical/test_category.py
+++ b/pandas/tests/indexes/categorical/test_category.py
@@ -43,7 +43,14 @@ def test_disallow_addsub_ops(self, func, op_name):
# GH 10039
# set ops (+/-) raise TypeError
idx = pd.Index(pd.Categorical(["a", "b"]))
- msg = f"cannot perform {op_name} with this index type: CategoricalIndex"
+ cat_or_list = "'(Categorical|list)' and '(Categorical|list)'"
+ msg = "|".join(
+ [
+ f"cannot perform {op_name} with this index type: CategoricalIndex",
+ "can only concatenate list",
+ rf"unsupported operand type\(s\) for [\+-]: {cat_or_list}",
+ ]
+ )
with pytest.raises(TypeError, match=msg):
func(idx)
@@ -395,15 +402,7 @@ def test_equals_categorical(self):
with pytest.raises(ValueError, match="Lengths must match"):
ci1 == Index(["a", "b", "c"])
- msg = (
- "categorical index comparisons must have the same categories "
- "and ordered attributes"
- "|"
- "Categoricals can only be compared if 'categories' are the same. "
- "Categories are different lengths"
- "|"
- "Categoricals can only be compared if 'ordered' is the same"
- )
+ msg = "Categoricals can only be compared if 'categories' are the same"
with pytest.raises(TypeError, match=msg):
ci1 == ci2
with pytest.raises(TypeError, match=msg):
@@ -478,3 +477,9 @@ def test_reindex_base(self):
def test_map_str(self):
# See test_map.py
pass
+
+ def test_format_different_scalar_lengths(self):
+ # GH35439
+ idx = CategoricalIndex(["aaaaaaaaa", "b"])
+ expected = ["aaaaaaaaa", "b"]
+ assert idx.format() == expected
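+ # i.e. values are no longer right-padded to a common width; numeric
+ # indexes, by contrast, still pad (see test_format in test_numeric.py)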
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 30c58506f619d..c40f7b1bc2120 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -1,5 +1,5 @@
import gc
-from typing import Optional, Type
+from typing import Type
import numpy as np
import pytest
@@ -32,7 +32,7 @@
class Base:
""" base class for index sub-class tests """
- _holder: Optional[Type[Index]] = None
+ _holder: Type[Index]
_compat_props = ["shape", "ndim", "size", "nbytes"]
def create_index(self) -> Index:
@@ -145,22 +145,41 @@ def test_numeric_compat(self):
# Check that this doesn't cover MultiIndex case, if/when it does,
# we can remove multi.test_compat.test_numeric_compat
assert not isinstance(idx, MultiIndex)
+ if type(idx) is Index:
+ return
- with pytest.raises(TypeError, match="cannot perform __mul__"):
+ typ = type(idx._data).__name__
+ lmsg = "|".join(
+ [
+ rf"unsupported operand type\(s\) for \*: '{typ}' and 'int'",
+ "cannot perform (__mul__|__truediv__|__floordiv__) with "
+ f"this index type: {typ}",
+ ]
+ )
+ with pytest.raises(TypeError, match=lmsg):
idx * 1
- with pytest.raises(TypeError, match="cannot perform __rmul__"):
+ rmsg = "|".join(
+ [
+ rf"unsupported operand type\(s\) for \*: 'int' and '{typ}'",
+ "cannot perform (__rmul__|__rtruediv__|__rfloordiv__) with "
+ f"this index type: {typ}",
+ ]
+ )
+ with pytest.raises(TypeError, match=rmsg):
1 * idx
- div_err = "cannot perform __truediv__"
+ div_err = lmsg.replace("*", "/")
with pytest.raises(TypeError, match=div_err):
idx / 1
-
- div_err = div_err.replace(" __", " __r")
+ div_err = rmsg.replace("*", "/")
with pytest.raises(TypeError, match=div_err):
1 / idx
- with pytest.raises(TypeError, match="cannot perform __floordiv__"):
+
+ floordiv_err = lmsg.replace("*", "//")
+ with pytest.raises(TypeError, match=floordiv_err):
idx // 1
- with pytest.raises(TypeError, match="cannot perform __rfloordiv__"):
+ floordiv_err = rmsg.replace("*", "//")
+ with pytest.raises(TypeError, match=floordiv_err):
1 // idx
def test_logical_compat(self):
@@ -250,6 +269,25 @@ def test_copy_name(self, index):
s3 = s1 * s2
assert s3.index.name == "mario"
+ def test_copy_name2(self, index):
+ # gh-35592
+ if isinstance(index, MultiIndex):
+ return
+
+ assert index.copy(name="mario").name == "mario"
+
+ with pytest.raises(ValueError, match="Length of new names must be 1, got 2"):
+ index.copy(name=["mario", "luigi"])
+
+ msg = f"{type(index).__name__}.name must be a hashable type"
+ with pytest.raises(TypeError, match=msg):
+ index.copy(name=[["mario"]])
+
+ def test_copy_dtype_deprecated(self, index):
+ # GH35853
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ index.copy(dtype=object)
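+ # the non-deprecated spelling is index.astype(object)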
+
def test_ensure_copied_data(self, index):
# Check the "copy" argument of each Index.__new__ is honoured
# GH12309
@@ -459,7 +497,11 @@ def test_union_base(self, index):
for case in cases:
if not isinstance(index, CategoricalIndex):
result = first.union(case)
- assert tm.equalContents(result, everything)
+ assert tm.equalContents(result, everything), (
+ result,
+ everything,
+ type(case),
+ )
if isinstance(index, MultiIndex):
msg = "other must be a MultiIndex or a list of tuples"
@@ -632,6 +674,18 @@ def test_equals_op(self):
tm.assert_numpy_array_equal(index_a == item, expected3)
tm.assert_series_equal(series_a == item, Series(expected3))
+ def test_format(self):
+ # GH35439
+ idx = self.create_index()
+ expected = [str(x) for x in idx]
+ assert idx.format() == expected
+
+ def test_format_empty(self):
+ # GH35712
+ empty_idx = self._holder([])
+ assert empty_idx.format() == []
+ assert empty_idx.format(name=True) == [""]
+
def test_hasnans_isnans(self, index):
# GH 11343, added tests for hasnans / isnans
if isinstance(index, MultiIndex):
@@ -786,16 +840,17 @@ def test_map_str(self):
def test_putmask_with_wrong_mask(self):
# GH18368
index = self.create_index()
+ fill = index[0]
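+ # draw the fill value from the index itself so it is always
+ # dtype-compatible and only the mask-size validation can raise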
msg = "putmask: mask and data must be the same size"
with pytest.raises(ValueError, match=msg):
- index.putmask(np.ones(len(index) + 1, np.bool_), 1)
+ index.putmask(np.ones(len(index) + 1, np.bool_), fill)
with pytest.raises(ValueError, match=msg):
- index.putmask(np.ones(len(index) - 1, np.bool_), 1)
+ index.putmask(np.ones(len(index) - 1, np.bool_), fill)
with pytest.raises(ValueError, match=msg):
- index.putmask("foo", 1)
+ index.putmask("foo", fill)
@pytest.mark.parametrize("copy", [True, False])
@pytest.mark.parametrize("name", [None, "foo"])
@@ -845,6 +900,7 @@ def test_is_unique(self):
index_na_dup = index_na.insert(0, np.nan)
assert index_na_dup.is_unique is False
+ @pytest.mark.arm_slow
def test_engine_reference_cycle(self):
# GH27585
index = self.create_index()
@@ -855,7 +911,7 @@ def test_engine_reference_cycle(self):
def test_getitem_2d_deprecated(self):
# GH#30588
idx = self.create_index()
- with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
res = idx[:, None]
assert isinstance(res, np.ndarray), type(res)
diff --git a/pandas/tests/indexes/datetimes/test_constructors.py b/pandas/tests/indexes/datetimes/test_constructors.py
index c150e7901c86a..9a855a1624520 100644
--- a/pandas/tests/indexes/datetimes/test_constructors.py
+++ b/pandas/tests/indexes/datetimes/test_constructors.py
@@ -787,6 +787,65 @@ def test_construction_with_nat_and_tzlocal(self):
expected = DatetimeIndex([Timestamp("2018", tz=tz), pd.NaT])
tm.assert_index_equal(result, expected)
+ def test_constructor_with_ambiguous_keyword_arg(self):
+ # GH 35297
+
+ expected = DatetimeIndex(
+ ["2020-11-01 01:00:00", "2020-11-02 01:00:00"],
+ dtype="datetime64[ns, America/New_York]",
+ freq="D",
+ ambiguous=False,
+ )
+
+ # ambiguous keyword in start
+ timezone = "America/New_York"
+ start = pd.Timestamp(year=2020, month=11, day=1, hour=1).tz_localize(
+ timezone, ambiguous=False
+ )
+ result = pd.date_range(start=start, periods=2, ambiguous=False)
+ tm.assert_index_equal(result, expected)
+
+ # ambiguous keyword in end
+ timezone = "America/New_York"
+ end = pd.Timestamp(year=2020, month=11, day=2, hour=1).tz_localize(
+ timezone, ambiguous=False
+ )
+ result = pd.date_range(end=end, periods=2, ambiguous=False)
+ tm.assert_index_equal(result, expected)
+
+ def test_constructor_with_nonexistent_keyword_arg(self):
+ # GH 35297
+
+ timezone = "Europe/Warsaw"
+
+ # nonexistent keyword in start
+ start = pd.Timestamp("2015-03-29 02:30:00").tz_localize(
+ timezone, nonexistent="shift_forward"
+ )
+ result = pd.date_range(start=start, periods=2, freq="H")
+ expected = DatetimeIndex(
+ [
+ pd.Timestamp("2015-03-29 03:00:00+02:00", tz=timezone),
+ pd.Timestamp("2015-03-29 04:00:00+02:00", tz=timezone),
+ ]
+ )
+
+ tm.assert_index_equal(result, expected)
+
+ # nonexistent keyword in end
+ end = pd.Timestamp("2015-03-29 02:30:00").tz_localize(
+ timezone, nonexistent="shift_forward"
+ )
+ result = pd.date_range(end=end, periods=2, freq="H")
+ expected = DatetimeIndex(
+ [
+ pd.Timestamp("2015-03-29 01:00:00+01:00", tz=timezone),
+ pd.Timestamp("2015-03-29 03:00:00+02:00", tz=timezone),
+ ]
+ )
+
+ tm.assert_index_equal(result, expected)
+
def test_constructor_no_precision_raises(self):
# GH-24753, GH-24739
diff --git a/pandas/tests/indexes/datetimes/test_datetime.py b/pandas/tests/indexes/datetimes/test_datetime.py
index ec4162f87010f..8e2ac4feb7ded 100644
--- a/pandas/tests/indexes/datetimes/test_datetime.py
+++ b/pandas/tests/indexes/datetimes/test_datetime.py
@@ -51,7 +51,7 @@ def test_reindex_with_same_tz(self):
"2010-01-02 00:00:00",
]
expected1 = DatetimeIndex(
- expected_list1, dtype="datetime64[ns, UTC]", freq=None,
+ expected_list1, dtype="datetime64[ns, UTC]", freq=None
)
expected2 = np.array([0] + [-1] * 21 + [23], dtype=np.dtype("intp"))
tm.assert_index_equal(result1, expected1)
@@ -59,6 +59,7 @@ def test_reindex_with_same_tz(self):
def test_time_loc(self): # GH8667
from datetime import time
+
from pandas._libs.index import _SIZE_CUTOFF
ns = _SIZE_CUTOFF + np.array([-100, 100], dtype=np.int64)
diff --git a/pandas/tests/indexes/datetimes/test_datetimelike.py b/pandas/tests/indexes/datetimes/test_datetimelike.py
index 7345ae3032463..a5abf2946feda 100644
--- a/pandas/tests/indexes/datetimes/test_datetimelike.py
+++ b/pandas/tests/indexes/datetimes/test_datetimelike.py
@@ -20,6 +20,12 @@ def index(self, request):
def create_index(self) -> DatetimeIndex:
return date_range("20130101", periods=5)
+ def test_format(self):
+ # GH35439
+ idx = self.create_index()
+ expected = [f"{x:%Y-%m-%d}" for x in idx]
+ assert idx.format() == expected
+
def test_shift(self):
pass # handled in test_ops
diff --git a/pandas/tests/indexes/datetimes/test_formats.py b/pandas/tests/indexes/datetimes/test_formats.py
index f34019e06fd5f..a98a96b436107 100644
--- a/pandas/tests/indexes/datetimes/test_formats.py
+++ b/pandas/tests/indexes/datetimes/test_formats.py
@@ -10,41 +10,53 @@
import pandas._testing as tm
-def test_to_native_types():
+def test_to_native_types_method_deprecated():
index = pd.date_range(freq="1D", periods=3, start="2017-01-01")
-
- # First, with no arguments.
expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object)
- result = index.to_native_types()
- tm.assert_numpy_array_equal(result, expected)
+ with tm.assert_produces_warning(FutureWarning):
+ result = index.to_native_types()
- # No NaN values, so na_rep has no effect
- result = index.to_native_types(na_rep="pandas")
tm.assert_numpy_array_equal(result, expected)
# Make sure slicing works
expected = np.array(["2017-01-01", "2017-01-03"], dtype=object)
- result = index.to_native_types([0, 2])
+ with tm.assert_produces_warning(FutureWarning):
+ result = index.to_native_types([0, 2])
+
+ tm.assert_numpy_array_equal(result, expected)
+
+
+def test_to_native_types():
+ index = pd.date_range(freq="1D", periods=3, start="2017-01-01")
+
+ # First, with no arguments.
+ expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype=object)
+
+ result = index._format_native_types()
+ tm.assert_numpy_array_equal(result, expected)
+
+ # No NaN values, so na_rep has no effect
+ result = index._format_native_types(na_rep="pandas")
tm.assert_numpy_array_equal(result, expected)
# Make sure date formatting works
expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype=object)
- result = index.to_native_types(date_format="%m-%Y-%d")
+ result = index._format_native_types(date_format="%m-%Y-%d")
tm.assert_numpy_array_equal(result, expected)
# NULL object handling should work
index = DatetimeIndex(["2017-01-01", pd.NaT, "2017-01-03"])
expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object)
- result = index.to_native_types()
+ result = index._format_native_types()
tm.assert_numpy_array_equal(result, expected)
expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object)
- result = index.to_native_types(na_rep="pandas")
+ result = index._format_native_types(na_rep="pandas")
tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/datetimes/test_indexing.py b/pandas/tests/indexes/datetimes/test_indexing.py
index b1faaa2115f55..539d9cb8f06a7 100644
--- a/pandas/tests/indexes/datetimes/test_indexing.py
+++ b/pandas/tests/indexes/datetimes/test_indexing.py
@@ -6,7 +6,7 @@
from pandas.errors import InvalidIndexError
import pandas as pd
-from pandas import DatetimeIndex, Index, Timestamp, date_range, notna
+from pandas import DatetimeIndex, Index, Timestamp, bdate_range, date_range, notna
import pandas._testing as tm
from pandas.tseries.offsets import BDay, CDay
@@ -95,7 +95,7 @@ def test_dti_business_getitem(self):
def test_dti_business_getitem_matplotlib_hackaround(self):
rng = pd.bdate_range(START, END)
- with tm.assert_produces_warning(DeprecationWarning):
+ with tm.assert_produces_warning(FutureWarning):
# GH#30588 multi-dimensional indexing deprecated
values = rng[:, None]
expected = rng.values[:, None]
@@ -122,7 +122,7 @@ def test_dti_custom_getitem(self):
def test_dti_custom_getitem_matplotlib_hackaround(self):
rng = pd.bdate_range(START, END, freq="C")
- with tm.assert_produces_warning(DeprecationWarning):
+ with tm.assert_produces_warning(FutureWarning):
# GH#30588 multi-dimensional indexing deprecated
values = rng[:, None]
expected = rng.values[:, None]
@@ -471,6 +471,16 @@ def test_get_loc(self):
with pytest.raises(NotImplementedError, match=msg):
idx.get_loc(time(12, 30), method="pad")
+ def test_get_loc_time_nat(self):
+ # GH#35114
+ # Case where key's total microseconds happens to match iNaT % 1e6 // 1000
+ tic = time(minute=12, second=43, microsecond=145224)
+ dti = pd.DatetimeIndex([pd.NaT])
+
+ loc = dti.get_loc(tic)
+ expected = np.array([], dtype=np.intp)
+ tm.assert_numpy_array_equal(loc, expected)
+
def test_get_loc_tz_aware(self):
# https://github.com/pandas-dev/pandas/issues/32140
dti = pd.date_range(
@@ -655,3 +665,43 @@ def test_get_value(self):
with tm.assert_produces_warning(FutureWarning):
result = dti.get_value(ser, key.to_datetime64())
assert result == 7
+
+
+class TestGetSliceBounds:
+ @pytest.mark.parametrize("box", [date, datetime, Timestamp])
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)])
+ def test_get_slice_bounds_datetime_within(
+ self, box, kind, side, expected, tz_aware_fixture
+ ):
+ # GH 35690
+ index = bdate_range("2000-01-03", "2000-02-11").tz_localize(tz_aware_fixture)
+ result = index.get_slice_bound(
+ box(year=2000, month=1, day=7), kind=kind, side=side
+ )
+ assert result == expected
+
+ @pytest.mark.parametrize("box", [date, datetime, Timestamp])
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ @pytest.mark.parametrize("side", ["left", "right"])
+ @pytest.mark.parametrize("year, expected", [(1999, 0), (2020, 30)])
+ def test_get_slice_bounds_datetime_outside(
+ self, box, kind, side, year, expected, tz_aware_fixture
+ ):
+ # GH 35690
+ index = bdate_range("2000-01-03", "2000-02-11").tz_localize(tz_aware_fixture)
+ result = index.get_slice_bound(
+ box(year=year, month=1, day=7), kind=kind, side=side
+ )
+ assert result == expected
+
+ @pytest.mark.parametrize("box", [date, datetime, Timestamp])
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ def test_slice_datetime_locs(self, box, kind, tz_aware_fixture):
+ # GH 34077
+ index = DatetimeIndex(["2010-01-01", "2010-01-03"]).tz_localize(
+ tz_aware_fixture
+ )
+ result = index.slice_locs(box(2010, 1, 1), box(2010, 1, 2))
+ expected = (0, 1)
+ assert result == expected
diff --git a/pandas/tests/indexes/datetimes/test_setops.py b/pandas/tests/indexes/datetimes/test_setops.py
index 6670b079ddd29..102c8f97a8a6b 100644
--- a/pandas/tests/indexes/datetimes/test_setops.py
+++ b/pandas/tests/indexes/datetimes/test_setops.py
@@ -46,10 +46,8 @@ def test_union3(self, sort, box):
first = everything[:5]
second = everything[5:]
- # GH 10149
- expected = (
- first.astype("O").union(pd.Index(second.values, dtype="O")).astype("O")
- )
+ # GH 10149 support listlike inputs other than Index objects
+ expected = first.union(second, sort=sort)
case = box(second.values)
result = first.union(case, sort=sort)
tm.assert_index_equal(result, expected)
@@ -470,6 +468,13 @@ def test_intersection_bug(self):
tm.assert_index_equal(result, b)
assert result.freq == b.freq
+ def test_intersection_list(self):
+ # GH#35876
+ values = [pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")]
+ idx = pd.DatetimeIndex(values, name="a")
+ res = idx.intersection(values)
+ tm.assert_index_equal(res, idx)
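+ # the plain list carries no name, so the result keeps idx's name "a"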
+
def test_month_range_union_tz_pytz(self, sort):
from pytz import timezone
diff --git a/pandas/tests/indexes/datetimes/test_timezones.py b/pandas/tests/indexes/datetimes/test_timezones.py
index ea68e8759c123..233835bb4b5f7 100644
--- a/pandas/tests/indexes/datetimes/test_timezones.py
+++ b/pandas/tests/indexes/datetimes/test_timezones.py
@@ -799,7 +799,7 @@ def test_dti_from_tzaware_datetime(self, tz):
@pytest.mark.parametrize("tzstr", ["US/Eastern", "dateutil/US/Eastern"])
def test_dti_tz_constructors(self, tzstr):
- """ Test different DatetimeIndex constructions with timezone
+ """Test different DatetimeIndex constructions with timezone
Follow-up of GH#4229
"""
arr = ["11/10/2005 08:00:00", "11/10/2005 09:00:00"]
diff --git a/pandas/tests/indexes/interval/test_base.py b/pandas/tests/indexes/interval/test_base.py
index 891640234d26e..c316655fbda8a 100644
--- a/pandas/tests/indexes/interval/test_base.py
+++ b/pandas/tests/indexes/interval/test_base.py
@@ -84,5 +84,5 @@ def test_getitem_2d_deprecated(self):
# GH#30588 multi-dim indexing is deprecated, but raising is also acceptable
idx = self.create_index()
with pytest.raises(ValueError, match="multi-dimensional indexing not allowed"):
- with tm.assert_produces_warning(DeprecationWarning, check_stacklevel=False):
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
idx[:, None]
diff --git a/pandas/tests/indexes/interval/test_formats.py b/pandas/tests/indexes/interval/test_formats.py
index 7acf5c1e0906c..0e8d7d1ba5aba 100644
--- a/pandas/tests/indexes/interval/test_formats.py
+++ b/pandas/tests/indexes/interval/test_formats.py
@@ -73,6 +73,6 @@ def test_repr_missing(self, constructor, expected):
def test_to_native_types(self, tuples, closed, expected_data):
# GH 28210
index = IntervalIndex.from_tuples(tuples, closed=closed)
- result = index.to_native_types()
+ result = index._format_native_types()
expected = np.array(expected_data)
tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/interval/test_interval.py b/pandas/tests/indexes/interval/test_interval.py
index 2755b186f3eae..b81f0f27e60ad 100644
--- a/pandas/tests/indexes/interval/test_interval.py
+++ b/pandas/tests/indexes/interval/test_interval.py
@@ -191,24 +191,34 @@ def test_insert(self, data):
tm.assert_index_equal(result, expected)
# invalid type
- msg = "can only insert Interval objects and NA into an IntervalIndex"
+ msg = "can only insert Interval objects and NA into an IntervalArray"
with pytest.raises(ValueError, match=msg):
data.insert(1, "foo")
# invalid closed
- msg = "inserted item must be closed on the same side as the index"
+ msg = "'value.closed' is 'left', expected 'right'."
for closed in {"left", "right", "both", "neither"} - {item.closed}:
+ msg = f"'value.closed' is '{closed}', expected '{item.closed}'."
with pytest.raises(ValueError, match=msg):
bad_item = Interval(item.left, item.right, closed=closed)
data.insert(1, bad_item)
# GH 18295 (test missing)
na_idx = IntervalIndex([np.nan], closed=data.closed)
- for na in (np.nan, pd.NaT, None):
+ for na in [np.nan, None, pd.NA]:
expected = data[:1].append(na_idx).append(data[1:])
result = data.insert(1, na)
tm.assert_index_equal(result, expected)
+ if data.left.dtype.kind not in ["m", "M"]:
+ # trying to insert pd.NaT into a numeric-dtyped IntervalIndex should raise
+ msg = "can only insert Interval objects and NA into an IntervalArray"
+ with pytest.raises(ValueError, match=msg):
+ result = data.insert(1, pd.NaT)
+ else:
+ result = data.insert(1, pd.NaT)
+ tm.assert_index_equal(result, expected)
+
def test_is_unique_interval(self, closed):
"""
Interval specific tests for is_unique in addition to base class tests
@@ -618,7 +628,7 @@ def test_sort_values(self, closed):
expected = IntervalIndex([Interval(0, 1), Interval(1, 2), np.nan])
tm.assert_index_equal(result, expected)
- result = index.sort_values(ascending=False)
+ result = index.sort_values(ascending=False, na_position="first")
expected = IntervalIndex([np.nan, Interval(1, 2), Interval(0, 1)])
tm.assert_index_equal(result, expected)
@@ -874,6 +884,13 @@ def test_get_value_non_scalar_errors(self, key):
with tm.assert_produces_warning(FutureWarning):
idx.get_value(s, key)
+ @pytest.mark.parametrize("closed", ["left", "right", "both"])
+ def test_pickle_round_trip_closed(self, closed):
+ # https://github.com/pandas-dev/pandas/issues/35658
+ idx = IntervalIndex.from_tuples([(1, 2), (2, 3)], closed=closed)
+ result = tm.round_trip_pickle(idx)
+ tm.assert_index_equal(result, idx)
+
def test_dir():
# GH#27571 dir(interval_index) should not raise
diff --git a/pandas/tests/indexes/interval/test_interval_tree.py b/pandas/tests/indexes/interval/test_interval_tree.py
index 476ec1dd10b4b..ab6eac482211d 100644
--- a/pandas/tests/indexes/interval/test_interval_tree.py
+++ b/pandas/tests/indexes/interval/test_interval_tree.py
@@ -4,8 +4,8 @@
import pytest
from pandas._libs.interval import IntervalTree
+from pandas.compat import IS64
-from pandas import compat
import pandas._testing as tm
@@ -14,9 +14,7 @@ def skipif_32bit(param):
Skip parameters in a parametrize on 32bit systems. Specifically used
here to skip leaf_size parameters related to GH 23440.
"""
- marks = pytest.mark.skipif(
- compat.is_platform_32bit(), reason="GH 23440: int type mismatch on 32bit"
- )
+ marks = pytest.mark.skipif(not IS64, reason="GH 23440: int type mismatch on 32bit")
return pytest.param(param, marks=marks)
@@ -181,7 +179,7 @@ def test_is_overlapping_trivial(self, closed, left, right):
tree = IntervalTree(left, right, closed=closed)
assert tree.is_overlapping is False
- @pytest.mark.skipif(compat.is_platform_32bit(), reason="GH 23440")
+ @pytest.mark.skipif(not IS64, reason="GH 23440")
def test_construction_overflow(self):
# GH 25485
left, right = np.arange(101, dtype="int64"), [np.iinfo(np.int64).max] * 101
diff --git a/pandas/tests/indexes/multi/test_analytics.py b/pandas/tests/indexes/multi/test_analytics.py
index 9e4e73e793bac..d661a56311e6c 100644
--- a/pandas/tests/indexes/multi/test_analytics.py
+++ b/pandas/tests/indexes/multi/test_analytics.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p17
+from pandas.compat.numpy import np_version_under1p17
import pandas as pd
from pandas import Index, MultiIndex, date_range, period_range
@@ -240,7 +240,7 @@ def test_numpy_ufuncs(idx, func):
# test ufuncs of numpy. see:
# https://numpy.org/doc/stable/reference/ufuncs.html
- if _np_version_under1p17:
+ if np_version_under1p17:
expected_exception = AttributeError
msg = f"'tuple' object has no attribute '{func.__name__}'"
else:
diff --git a/pandas/tests/indexes/multi/test_compat.py b/pandas/tests/indexes/multi/test_compat.py
index d1f66af4a8e83..72b5ed0edaa78 100644
--- a/pandas/tests/indexes/multi/test_compat.py
+++ b/pandas/tests/indexes/multi/test_compat.py
@@ -68,23 +68,33 @@ def test_inplace_mutation_resets_values():
mi1 = MultiIndex(levels=levels, codes=codes)
mi2 = MultiIndex(levels=levels2, codes=codes)
+
+ # instantiating MultiIndex should not access/cache ._values
+ assert "_values" not in mi1._cache
+ assert "_values" not in mi2._cache
+
vals = mi1.values.copy()
vals2 = mi2.values.copy()
- assert mi1._tuples is not None
+ # accessing .values should cache ._values
+ assert mi1._values is mi1._cache["_values"]
+ assert mi1.values is mi1._cache["_values"]
+ assert isinstance(mi1._cache["_values"], np.ndarray)
# Make sure level setting works
new_vals = mi1.set_levels(levels2).values
tm.assert_almost_equal(vals2, new_vals)
- # Non-inplace doesn't kill _tuples [implementation detail]
- tm.assert_almost_equal(mi1._tuples, vals)
+ # Non-inplace doesn't drop _values from _cache [implementation detail]
+ tm.assert_almost_equal(mi1._cache["_values"], vals)
# ...and values is still same too
tm.assert_almost_equal(mi1.values, vals)
- # Inplace should kill _tuples
- mi1.set_levels(levels2, inplace=True)
+ # Inplace should drop _values from _cache
+ with tm.assert_produces_warning(FutureWarning):
+ mi1.set_levels(levels2, inplace=True)
+ assert "_values" not in mi1._cache
tm.assert_almost_equal(mi1.values, vals2)
# Make sure label setting works too
@@ -94,17 +104,24 @@ def test_inplace_mutation_resets_values():
# Must be 1d array of tuples
assert exp_values.shape == (6,)
- new_values = mi2.set_codes(codes2).values
+
+ new_mi = mi2.set_codes(codes2)
+ assert "_values" not in new_mi._cache
+ new_values = new_mi.values
+ assert "_values" in new_mi._cache
# Not inplace shouldn't change
- tm.assert_almost_equal(mi2._tuples, vals2)
+ tm.assert_almost_equal(mi2._cache["_values"], vals2)
# Should have correct values
tm.assert_almost_equal(exp_values, new_values)
- # ...and again setting inplace should kill _tuples, etc
- mi2.set_codes(codes2, inplace=True)
+ # ...and again setting inplace should drop _values from _cache, etc
+ with tm.assert_produces_warning(FutureWarning):
+ mi2.set_codes(codes2, inplace=True)
+ assert "_values" not in mi2._cache
tm.assert_almost_equal(mi2.values, new_values)
+ assert "_values" in mi2._cache
def test_ndarray_compat_properties(idx, compat_props):
diff --git a/pandas/tests/indexes/multi/test_constructors.py b/pandas/tests/indexes/multi/test_constructors.py
index 1157c7f8bb962..16af884c89e9e 100644
--- a/pandas/tests/indexes/multi/test_constructors.py
+++ b/pandas/tests/indexes/multi/test_constructors.py
@@ -741,18 +741,18 @@ def test_raise_invalid_sortorder():
with pytest.raises(ValueError, match=r".* sortorder 2 with lexsort_depth 1.*"):
MultiIndex(
- levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2,
+ levels=levels, codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 2, 1]], sortorder=2
)
with pytest.raises(ValueError, match=r".* sortorder 1 with lexsort_depth 0.*"):
MultiIndex(
- levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1,
+ levels=levels, codes=[[0, 0, 1, 0, 1, 1], [0, 1, 0, 2, 2, 1]], sortorder=1
)
def test_datetimeindex():
idx1 = pd.DatetimeIndex(
- ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo",
+ ["2013-04-01 9:00", "2013-04-02 9:00", "2013-04-03 9:00"] * 2, tz="Asia/Tokyo"
)
idx2 = pd.date_range("2010/01/01", periods=6, freq="M", tz="US/Eastern")
idx = MultiIndex.from_arrays([idx1, idx2])
diff --git a/pandas/tests/indexes/multi/test_duplicates.py b/pandas/tests/indexes/multi/test_duplicates.py
index e48731b9c8099..aa2f37dad152c 100644
--- a/pandas/tests/indexes/multi/test_duplicates.py
+++ b/pandas/tests/indexes/multi/test_duplicates.py
@@ -91,7 +91,8 @@ def test_duplicate_multiindex_codes():
mi = MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
msg = r"Level values must be unique: \[[AB', ]+\] on level 0"
with pytest.raises(ValueError, match=msg):
- mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ mi.set_levels([["A", "B", "A", "A", "B"], [2, 1, 3, -2, 5]], inplace=True)
@pytest.mark.parametrize("names", [["a", "b", "a"], [1, 1, 2], [1, "a", 1]])
@@ -240,6 +241,7 @@ def test_duplicated(idx_dup, keep, expected):
tm.assert_numpy_array_equal(result, expected)
+@pytest.mark.arm_slow
def test_duplicated_large(keep):
# GH 9125
n, k = 200, 5000
diff --git a/pandas/tests/indexes/multi/test_equivalence.py b/pandas/tests/indexes/multi/test_equivalence.py
index 063ede028add7..b48f09457b96c 100644
--- a/pandas/tests/indexes/multi/test_equivalence.py
+++ b/pandas/tests/indexes/multi/test_equivalence.py
@@ -192,10 +192,12 @@ def test_is_():
mi4 = mi3.view()
# GH 17464 - Remove duplicate MultiIndex levels
- mi4.set_levels([list(range(10)), list(range(10))], inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ mi4.set_levels([list(range(10)), list(range(10))], inplace=True)
assert not mi4.is_(mi3)
mi5 = mi.view()
- mi5.set_levels(mi5.levels, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ mi5.set_levels(mi5.levels, inplace=True)
assert not mi5.is_(mi)
diff --git a/pandas/tests/indexes/multi/test_get_set.py b/pandas/tests/indexes/multi/test_get_set.py
index 8a3deca0236e4..b9132f429905d 100644
--- a/pandas/tests/indexes/multi/test_get_set.py
+++ b/pandas/tests/indexes/multi/test_get_set.py
@@ -93,7 +93,8 @@ def test_set_levels(idx):
# level changing [w/ mutation]
ind2 = idx.copy()
- inplace_return = ind2.set_levels(new_levels, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_levels(new_levels, inplace=True)
assert inplace_return is None
assert_matching(ind2.levels, new_levels)
@@ -113,20 +114,23 @@ def test_set_levels(idx):
# level changing specific level [w/ mutation]
ind2 = idx.copy()
- inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_levels(new_levels[0], level=0, inplace=True)
assert inplace_return is None
assert_matching(ind2.levels, [new_levels[0], levels[1]])
assert_matching(idx.levels, levels)
ind2 = idx.copy()
- inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_levels(new_levels[1], level=1, inplace=True)
assert inplace_return is None
assert_matching(ind2.levels, [levels[0], new_levels[1]])
assert_matching(idx.levels, levels)
# level changing multiple levels [w/ mutation]
ind2 = idx.copy()
- inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_levels(new_levels, level=[0, 1], inplace=True)
assert inplace_return is None
assert_matching(ind2.levels, new_levels)
assert_matching(idx.levels, levels)
@@ -136,19 +140,23 @@ def test_set_levels(idx):
original_index = idx.copy()
for inplace in [True, False]:
with pytest.raises(ValueError, match="^On"):
- idx.set_levels(["c"], level=0, inplace=inplace)
+ with tm.assert_produces_warning(FutureWarning):
+ idx.set_levels(["c"], level=0, inplace=inplace)
assert_matching(idx.levels, original_index.levels, check_dtype=True)
with pytest.raises(ValueError, match="^On"):
- idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace)
+ with tm.assert_produces_warning(FutureWarning):
+ idx.set_codes([0, 1, 2, 3, 4, 5], level=0, inplace=inplace)
assert_matching(idx.codes, original_index.codes, check_dtype=True)
with pytest.raises(TypeError, match="^Levels"):
- idx.set_levels("c", level=0, inplace=inplace)
+ with tm.assert_produces_warning(FutureWarning):
+ idx.set_levels("c", level=0, inplace=inplace)
assert_matching(idx.levels, original_index.levels, check_dtype=True)
with pytest.raises(TypeError, match="^Codes"):
- idx.set_codes(1, level=0, inplace=inplace)
+ with tm.assert_produces_warning(FutureWarning):
+ idx.set_codes(1, level=0, inplace=inplace)
assert_matching(idx.codes, original_index.codes, check_dtype=True)
@@ -168,7 +176,8 @@ def test_set_codes(idx):
# changing label w/ mutation
ind2 = idx.copy()
- inplace_return = ind2.set_codes(new_codes, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_codes(new_codes, inplace=True)
assert inplace_return is None
assert_matching(ind2.codes, new_codes)
@@ -188,20 +197,23 @@ def test_set_codes(idx):
# label changing specific level w/ mutation
ind2 = idx.copy()
- inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_codes(new_codes[0], level=0, inplace=True)
assert inplace_return is None
assert_matching(ind2.codes, [new_codes[0], codes[1]])
assert_matching(idx.codes, codes)
ind2 = idx.copy()
- inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_codes(new_codes[1], level=1, inplace=True)
assert inplace_return is None
assert_matching(ind2.codes, [codes[0], new_codes[1]])
assert_matching(idx.codes, codes)
# codes changing multiple levels [w/ mutation]
ind2 = idx.copy()
- inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ inplace_return = ind2.set_codes(new_codes, level=[0, 1], inplace=True)
assert inplace_return is None
assert_matching(ind2.codes, new_codes)
assert_matching(idx.codes, codes)
@@ -217,7 +229,8 @@ def test_set_codes(idx):
# [w/ mutation]
result = ind.copy()
- result.set_codes(codes=new_codes, level=1, inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ result.set_codes(codes=new_codes, level=1, inplace=True)
assert result.equals(expected)
@@ -329,3 +342,19 @@ def test_set_levels_with_iterable():
[expected_sizes, colors], names=["size", "color"]
)
tm.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("inplace", [True, False])
+def test_set_codes_inplace_deprecated(idx, inplace):
+ new_codes = idx.codes[1][::-1]
+
+ with tm.assert_produces_warning(FutureWarning):
+ idx.set_codes(codes=new_codes, level=1, inplace=inplace)
+
+
+@pytest.mark.parametrize("inplace", [True, False])
+def test_set_levels_inplace_deprecated(idx, inplace):
+ new_level = idx.levels[1].copy()
+
+ with tm.assert_produces_warning(FutureWarning):
+ idx.set_levels(levels=new_level, level=1, inplace=inplace)
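+
+
+# With inplace deprecated, the supported pattern is reassignment:
+# idx = idx.set_levels(new_level, level=1)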
diff --git a/pandas/tests/indexes/multi/test_integrity.py b/pandas/tests/indexes/multi/test_integrity.py
index fd150bb4d57a2..6a353fe1ad6e7 100644
--- a/pandas/tests/indexes/multi/test_integrity.py
+++ b/pandas/tests/indexes/multi/test_integrity.py
@@ -118,6 +118,7 @@ def test_consistency():
assert index.is_unique is False
+@pytest.mark.arm_slow
def test_hash_collisions():
# non-smoke test that we don't get hash collisions
@@ -220,7 +221,8 @@ def test_metadata_immutable(idx):
def test_level_setting_resets_attributes():
ind = pd.MultiIndex.from_arrays([["A", "A", "B", "B", "B"], [1, 2, 1, 2, 3]])
assert ind.is_monotonic
- ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ ind.set_levels([["A", "B"], [1, 3, 2]], inplace=True)
# if this fails, probably didn't reset the cache correctly.
assert not ind.is_monotonic
diff --git a/pandas/tests/indexes/multi/test_isin.py b/pandas/tests/indexes/multi/test_isin.py
index 122263e6ec198..b369b9a50954e 100644
--- a/pandas/tests/indexes/multi/test_isin.py
+++ b/pandas/tests/indexes/multi/test_isin.py
@@ -78,7 +78,7 @@ def test_isin_level_kwarg():
@pytest.mark.parametrize(
"labels,expected,level",
[
- ([("b", np.nan)], np.array([False, False, True]), None,),
+ ([("b", np.nan)], np.array([False, False, True]), None),
([np.nan, "a"], np.array([True, True, False]), 0),
(["d", np.nan], np.array([False, True, True]), 1),
],
diff --git a/pandas/tests/indexes/multi/test_names.py b/pandas/tests/indexes/multi/test_names.py
index 479b5ef0211a0..f38da7ad2ae1c 100644
--- a/pandas/tests/indexes/multi/test_names.py
+++ b/pandas/tests/indexes/multi/test_names.py
@@ -75,6 +75,13 @@ def test_copy_names():
assert multi_idx.names == ["MyName1", "MyName2"]
assert multi_idx3.names == ["NewName1", "NewName2"]
+ # gh-35592
+ with pytest.raises(ValueError, match="Length of new names must be 2, got 1"):
+ multi_idx.copy(names=["mario"])
+
+ with pytest.raises(TypeError, match="MultiIndex.name must be a hashable type"):
+ multi_idx.copy(names=[["mario"], ["luigi"]])
+
def test_names(idx, index_names):
diff --git a/pandas/tests/indexes/multi/test_setops.py b/pandas/tests/indexes/multi/test_setops.py
index d7427ee622977..6d4928547cad1 100644
--- a/pandas/tests/indexes/multi/test_setops.py
+++ b/pandas/tests/indexes/multi/test_setops.py
@@ -37,6 +37,7 @@ def test_intersection_base(idx, sort, klass):
first.intersection([1, 2, 3], sort=sort)
+@pytest.mark.arm_slow
@pytest.mark.parametrize("klass", [MultiIndex, np.array, Series, list])
def test_union_base(idx, sort, klass):
first = idx[::-1]
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index 473e370c76f8b..508bd2f566507 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -228,6 +228,12 @@ def test_take_fill_value_ints(self, klass):
class TestContains:
+ @pytest.mark.parametrize("klass", [Float64Index, Int64Index, UInt64Index])
+ def test_contains_none(self, klass):
+ # GH#35788 should return False, not raise TypeError
+ index = klass([0, 1, 2, 3, 4])
+ assert None not in index
+
def test_contains_float64_nans(self):
index = Float64Index([1.0, 2.0, np.nan])
assert np.nan in index
diff --git a/pandas/tests/indexes/period/test_formats.py b/pandas/tests/indexes/period/test_formats.py
index 5db373a9f07ae..150a797169c14 100644
--- a/pandas/tests/indexes/period/test_formats.py
+++ b/pandas/tests/indexes/period/test_formats.py
@@ -12,35 +12,29 @@ def test_to_native_types():
# First, with no arguments.
expected = np.array(["2017-01-01", "2017-01-02", "2017-01-03"], dtype="=U10")
- result = index.to_native_types()
+ result = index._format_native_types()
tm.assert_numpy_array_equal(result, expected)
# No NaN values, so na_rep has no effect
- result = index.to_native_types(na_rep="pandas")
- tm.assert_numpy_array_equal(result, expected)
-
- # Make sure slicing works
- expected = np.array(["2017-01-01", "2017-01-03"], dtype="=U10")
-
- result = index.to_native_types([0, 2])
+ result = index._format_native_types(na_rep="pandas")
tm.assert_numpy_array_equal(result, expected)
# Make sure date formatting works
expected = np.array(["01-2017-01", "01-2017-02", "01-2017-03"], dtype="=U10")
- result = index.to_native_types(date_format="%m-%Y-%d")
+ result = index._format_native_types(date_format="%m-%Y-%d")
tm.assert_numpy_array_equal(result, expected)
# NULL object handling should work
index = PeriodIndex(["2017-01-01", pd.NaT, "2017-01-03"], freq="D")
expected = np.array(["2017-01-01", "NaT", "2017-01-03"], dtype=object)
- result = index.to_native_types()
+ result = index._format_native_types()
tm.assert_numpy_array_equal(result, expected)
expected = np.array(["2017-01-01", "pandas", "2017-01-03"], dtype=object)
- result = index.to_native_types(na_rep="pandas")
+ result = index._format_native_types(na_rep="pandas")
tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/period/test_indexing.py b/pandas/tests/indexes/period/test_indexing.py
index b61d1d903f89a..f42499147cdbb 100644
--- a/pandas/tests/indexes/period/test_indexing.py
+++ b/pandas/tests/indexes/period/test_indexing.py
@@ -157,6 +157,7 @@ def test_getitem_list_periods(self):
exp = ts.iloc[[1]]
tm.assert_series_equal(ts[[Period("2012-01-02", freq="D")]], exp)
+ @pytest.mark.arm_slow
def test_getitem_seconds(self):
# GH#6716
didx = date_range(start="2013/01/01 09:00:00", freq="S", periods=4000)
@@ -359,6 +360,22 @@ def test_get_loc2(self):
],
)
+ def test_get_loc_invalid_string_raises_keyerror(self):
+ # GH#34240
+ pi = pd.period_range("2000", periods=3, name="A")
+ with pytest.raises(KeyError, match="A"):
+ pi.get_loc("A")
+
+ ser = pd.Series([1, 2, 3], index=pi)
+ with pytest.raises(KeyError, match="A"):
+ ser.loc["A"]
+
+ with pytest.raises(KeyError, match="A"):
+ ser["A"]
+
+ assert "A" not in ser
+ assert "A" not in pi
+
class TestGetIndexer:
def test_get_indexer(self):
diff --git a/pandas/tests/indexes/period/test_ops.py b/pandas/tests/indexes/period/test_ops.py
index e7dd76584d780..d1b34c315b682 100644
--- a/pandas/tests/indexes/period/test_ops.py
+++ b/pandas/tests/indexes/period/test_ops.py
@@ -174,9 +174,6 @@ def _check_freq(index, expected_index):
ordered, indexer = idx.sort_values(return_indexer=True, ascending=False)
tm.assert_index_equal(ordered, expected[::-1])
-
- exp = np.array([2, 1, 3, 4, 0])
- tm.assert_numpy_array_equal(indexer, exp, check_dtype=False)
_check_freq(ordered, idx)
pidx = PeriodIndex(["2011", "2013", "NaT", "2011"], name="pidx", freq="D")
@@ -333,3 +330,16 @@ def test_freq_setter_deprecated(self):
# warning for setter
with pytest.raises(AttributeError, match="can't set attribute"):
idx.freq = pd.offsets.Day()
+
+
+@pytest.mark.xfail(reason="Datetime-like sort_values currently unstable (GH 35922)")
+def test_order_stability_compat():
+ # GH 35584. The new implementation of Index.sort_values is stable when
+ # sorting in descending order; datetime-like sort_values is not yet
+ # stable. The xfail should be removed once the implementations'
+ # behavior is synchronized (xref GH 35922)
+ pidx = PeriodIndex(["2011", "2013", "2015", "2012", "2011"], name="pidx", freq="A")
+ iidx = Index([2011, 2013, 2015, 2012, 2011], name="idx")
+ ordered1, indexer1 = pidx.sort_values(return_indexer=True, ascending=False)
+ ordered2, indexer2 = iidx.sort_values(return_indexer=True, ascending=False)
+ tm.assert_numpy_array_equal(indexer1, indexer2)
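+ # with stable descending sorts, the two equal "2011" entries keep their
+ # original relative order, giving indexer [2, 1, 3, 0, 4] for both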
diff --git a/pandas/tests/indexes/period/test_period.py b/pandas/tests/indexes/period/test_period.py
index 15a88ab3819ce..085d41aaa5b76 100644
--- a/pandas/tests/indexes/period/test_period.py
+++ b/pandas/tests/indexes/period/test_period.py
@@ -536,6 +536,12 @@ def test_contains_raise_error_if_period_index_is_in_multi_index(self, msg, key):
with pytest.raises(KeyError, match=msg):
df.loc[key]
+ def test_format_empty(self):
+ # GH35712
+ empty_idx = self._holder([], freq="A")
+ assert empty_idx.format() == []
+ assert empty_idx.format(name=True) == [""]
+
def test_maybe_convert_timedelta():
pi = PeriodIndex(["2000", "2001"], freq="D")
diff --git a/pandas/tests/indexes/period/test_searchsorted.py b/pandas/tests/indexes/period/test_searchsorted.py
index f5a2583bf2e10..f2950b9f6065c 100644
--- a/pandas/tests/indexes/period/test_searchsorted.py
+++ b/pandas/tests/indexes/period/test_searchsorted.py
@@ -2,6 +2,7 @@
import pytest
from pandas._libs.tslibs import IncompatibleFrequency
+from pandas.compat.numpy import np_version_under1p18
from pandas import NaT, Period, PeriodIndex, Series, array
import pandas._testing as tm
@@ -21,7 +22,13 @@ def test_searchsorted(self, freq):
p2 = Period("2014-01-04", freq=freq)
assert pidx.searchsorted(p2) == 3
- assert pidx.searchsorted(NaT) == 0
+ if np_version_under1p18:
+ # GH#36254
+ # Following numpy convention, NaT goes at the beginning
+ # (unlike NaN which goes at the end)
+ assert pidx.searchsorted(NaT) == 0
+ else:
+ assert pidx.searchsorted(NaT) == 5
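+ # from numpy 1.18 on, NaT sorts to the end like NaN, hence position 5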
msg = "Input has different freq=H from PeriodArray"
with pytest.raises(IncompatibleFrequency, match=msg):
diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py
index 5b6f9cb358b7d..899c8cbc0425d 100644
--- a/pandas/tests/indexes/ranges/test_range.py
+++ b/pandas/tests/indexes/ranges/test_range.py
@@ -100,10 +100,14 @@ def test_insert(self):
# GH 18295 (test missing)
expected = Float64Index([0, np.nan, 1, 2, 3, 4])
- for na in (np.nan, pd.NaT, None):
+ for na in [np.nan, None, pd.NA]:
result = RangeIndex(5).insert(1, na)
tm.assert_index_equal(result, expected)
+ result = RangeIndex(5).insert(1, pd.NaT)
+ expected = pd.Index([0, pd.NaT, 1, 2, 3, 4], dtype=object)
+ tm.assert_index_equal(result, expected)
+
def test_delete(self):
idx = RangeIndex(5, name="Foo")
@@ -137,53 +141,68 @@ def test_dtype(self):
index = self.create_index()
assert index.dtype == np.int64
- def test_cached_data(self):
- # GH 26565, GH26617
- # Calling RangeIndex._data caches an int64 array of the same length at
- # self._cached_data. This test checks whether _cached_data has been set
+ def test_cache(self):
+ # GH 26565, GH26617, GH35432
+ # This test checks whether _cache has been set.
+ # Calling RangeIndex._cache["_data"] creates an int64 array of the same length
+ # as the RangeIndex and stores it in _cache.
idx = RangeIndex(0, 100, 10)
- assert idx._cached_data is None
+ assert idx._cache == {}
repr(idx)
- assert idx._cached_data is None
+ assert idx._cache == {}
str(idx)
- assert idx._cached_data is None
+ assert idx._cache == {}
idx.get_loc(20)
- assert idx._cached_data is None
+ assert idx._cache == {}
- 90 in idx
- assert idx._cached_data is None
+ 90 in idx # True
+ assert idx._cache == {}
- 91 in idx
- assert idx._cached_data is None
+ 91 in idx # False
+ assert idx._cache == {}
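+ # membership is answered from the range arithmetic itself, so no int64
+ # array needs to be materialized or cached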
idx.all()
- assert idx._cached_data is None
+ assert idx._cache == {}
idx.any()
- assert idx._cached_data is None
+ assert idx._cache == {}
+
+ for _ in idx:
+ pass
+ assert idx._cache == {}
+
+ idx.format()
+ assert idx._cache == {}
df = pd.DataFrame({"a": range(10)}, index=idx)
+ str(df)
+ assert idx._cache == {}
+
df.loc[50]
- assert idx._cached_data is None
+ assert idx._cache == {}
with pytest.raises(KeyError, match="51"):
df.loc[51]
- assert idx._cached_data is None
+ assert idx._cache == {}
df.loc[10:50]
- assert idx._cached_data is None
+ assert idx._cache == {}
df.iloc[5:10]
- assert idx._cached_data is None
+ assert idx._cache == {}
- # actually calling idx._data
+ # idx._cache should contain a _data entry after calling idx._data
+ idx._data
assert isinstance(idx._data, np.ndarray)
- assert isinstance(idx._cached_data, np.ndarray)
+ assert idx._data is idx._data # check cached value is reused
+ assert len(idx._cache) == 4
+ expected = np.arange(0, 100, 10, dtype="int64")
+ tm.assert_numpy_array_equal(idx._cache["_data"], expected)
def test_is_monotonic(self):
index = RangeIndex(0, 20, 2)
@@ -506,3 +525,9 @@ def test_engineless_lookup(self):
idx.get_loc("a")
assert "_engine" not in idx._cache
+
+ def test_format_empty(self):
+ # GH35712
+ empty_idx = self._holder(0)
+ assert empty_idx.format() == []
+ assert empty_idx.format(name=True) == [""]
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 099c7ced5e2ce..f811bd579aaaa 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -56,17 +56,12 @@ def test_can_hold_identifiers(self):
@pytest.mark.parametrize("index", ["datetime"], indirect=True)
def test_new_axis(self, index):
- with tm.assert_produces_warning(DeprecationWarning):
+ with tm.assert_produces_warning(FutureWarning):
# GH#30588 multi-dimensional indexing deprecated
new_index = index[None, :]
assert new_index.ndim == 2
assert isinstance(new_index, np.ndarray)
- @pytest.mark.parametrize("index", ["int", "uint", "float"], indirect=True)
- def test_copy_and_deepcopy(self, index):
- new_copy2 = index.copy(dtype=int)
- assert new_copy2.dtype.kind == "i"
-
def test_constructor_regular(self, index):
tm.assert_contains_all(index, index)
@@ -1171,8 +1166,11 @@ def test_summary_bug(self):
assert "~:{range}:0" in result
assert "{other}%s" in result
- def test_format(self, index):
- self._check_method_works(Index.format, index)
+ def test_format_different_scalar_lengths(self):
+ # GH35439
+ idx = Index(["aaaaaaaaa", "b"])
+ expected = ["aaaaaaaaa", "b"]
+ assert idx.format() == expected
def test_format_bug(self):
# GH 14626
@@ -1362,7 +1360,7 @@ def test_get_indexer_strings_raises(self):
def test_get_indexer_numeric_index_boolean_target(self, idx_class):
# GH 16877
- numeric_index = idx_class(RangeIndex((4)))
+ numeric_index = idx_class(RangeIndex(4))
result = numeric_index.get_indexer([True, False, True])
expected = np.array([-1, -1, -1], dtype=np.intp)
tm.assert_numpy_array_equal(result, expected)
@@ -1511,23 +1509,24 @@ def test_slice_locs_na_raises(self):
@pytest.mark.parametrize(
"in_slice,expected",
[
+ # error: Slice index must be an integer or None
(pd.IndexSlice[::-1], "yxdcb"),
- (pd.IndexSlice["b":"y":-1], ""), # type: ignore
- (pd.IndexSlice["b"::-1], "b"), # type: ignore
- (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore
- (pd.IndexSlice[:"y":-1], "y"), # type: ignore
- (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore
- (pd.IndexSlice["y"::-4], "yb"), # type: ignore
+ (pd.IndexSlice["b":"y":-1], ""), # type: ignore[misc]
+ (pd.IndexSlice["b"::-1], "b"), # type: ignore[misc]
+ (pd.IndexSlice[:"b":-1], "yxdcb"), # type: ignore[misc]
+ (pd.IndexSlice[:"y":-1], "y"), # type: ignore[misc]
+ (pd.IndexSlice["y"::-1], "yxdcb"), # type: ignore[misc]
+ (pd.IndexSlice["y"::-4], "yb"), # type: ignore[misc]
# absent labels
- (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore
- (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore
- (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore
- (pd.IndexSlice["z"::-3], "yc"), # type: ignore
- (pd.IndexSlice["m"::-1], "dcb"), # type: ignore
- (pd.IndexSlice[:"m":-1], "yx"), # type: ignore
- (pd.IndexSlice["a":"a":-1], ""), # type: ignore
- (pd.IndexSlice["z":"z":-1], ""), # type: ignore
- (pd.IndexSlice["m":"m":-1], ""), # type: ignore
+ (pd.IndexSlice[:"a":-1], "yxdcb"), # type: ignore[misc]
+ (pd.IndexSlice[:"a":-2], "ydb"), # type: ignore[misc]
+ (pd.IndexSlice["z"::-1], "yxdcb"), # type: ignore[misc]
+ (pd.IndexSlice["z"::-3], "yc"), # type: ignore[misc]
+ (pd.IndexSlice["m"::-1], "dcb"), # type: ignore[misc]
+ (pd.IndexSlice[:"m":-1], "yx"), # type: ignore[misc]
+ (pd.IndexSlice["a":"a":-1], ""), # type: ignore[misc]
+ (pd.IndexSlice["z":"z":-1], ""), # type: ignore[misc]
+ (pd.IndexSlice["m":"m":-1], ""), # type: ignore[misc]
],
)
def test_slice_locs_negative_step(self, in_slice, expected):
@@ -2427,7 +2426,7 @@ def test_index_with_tuple_bool(self):
# TODO: remove tupleize_cols=False once correct behaviour is restored
# TODO: also this op right now produces FutureWarning from numpy
idx = Index([("a", "b"), ("b", "c"), ("c", "a")], tupleize_cols=False)
- result = idx == ("c", "a",)
+ result = idx == ("c", "a")
expected = np.array([False, False, True])
tm.assert_numpy_array_equal(result, expected)
@@ -2531,7 +2530,7 @@ def test_shape_of_invalid_index():
# that the returned shape is consistent with this underlying array for
# compat with matplotlib (see https://github.com/pandas-dev/pandas/issues/27775)
idx = pd.Index([0, 1, 2, 3])
- with tm.assert_produces_warning(DeprecationWarning):
+ with tm.assert_produces_warning(FutureWarning):
# GH#30588 multi-dimensional indexing deprecated
assert idx[:, None].shape == (4, 1)
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
index 02a173eb4958d..675ae388a28a4 100644
--- a/pandas/tests/indexes/test_common.py
+++ b/pandas/tests/indexes/test_common.py
@@ -13,7 +13,14 @@
from pandas.core.dtypes.common import is_period_dtype, needs_i8_conversion
import pandas as pd
-from pandas import CategoricalIndex, MultiIndex, RangeIndex
+from pandas import (
+ CategoricalIndex,
+ DatetimeIndex,
+ MultiIndex,
+ PeriodIndex,
+ RangeIndex,
+ TimedeltaIndex,
+)
import pandas._testing as tm
@@ -374,8 +381,7 @@ def test_has_duplicates(self, index):
"dtype",
["int64", "uint64", "float64", "category", "datetime64[ns]", "timedelta64[ns]"],
)
- @pytest.mark.parametrize("copy", [True, False])
- def test_astype_preserves_name(self, index, dtype, copy):
+ def test_astype_preserves_name(self, index, dtype):
# https://github.com/pandas-dev/pandas/issues/32013
if isinstance(index, MultiIndex):
index.names = ["idx" + str(i) for i in range(index.nlevels)]
@@ -384,10 +390,7 @@ def test_astype_preserves_name(self, index, dtype, copy):
try:
# Some of these conversions cannot succeed so we use a try / except
- if copy:
- result = index.copy(dtype=dtype)
- else:
- result = index.astype(dtype)
+ result = index.astype(dtype)
except (ValueError, TypeError, NotImplementedError, SystemError):
return
@@ -395,3 +398,44 @@ def test_astype_preserves_name(self, index, dtype, copy):
assert result.names == index.names
else:
assert result.name == index.name
+
+
+@pytest.mark.parametrize("na_position", [None, "middle"])
+def test_sort_values_invalid_na_position(index_with_missing, na_position):
+ if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+ # datetime-like indices will get na_position kwarg as part of
+ # synchronizing duplicate-sorting behavior, because we currently expect
+ # them, other indices, and Series to sort differently (xref 35922)
+ pytest.xfail("sort_values does not support na_position kwarg")
+ elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)):
+ pytest.xfail("missing value sorting order not defined for index type")
+
+ if na_position not in ["first", "last"]:
+ with pytest.raises(ValueError, match=f"invalid na_position: {na_position}"):
+ index_with_missing.sort_values(na_position=na_position)
+
+
+@pytest.mark.parametrize("na_position", ["first", "last"])
+def test_sort_values_with_missing(index_with_missing, na_position):
+ # GH 35584. Test that sort_values works with missing values,
+ # sort non-missing and place missing according to na_position
+
+ if isinstance(index_with_missing, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
+ # datetime-like indices will gain the na_position kwarg as part of
+ # synchronizing duplicate-sorting behavior: datetime-like indices,
+ # other indices, and Series currently sort differently (xref 35922)
+ pytest.xfail("sort_values does not support na_position kwarg")
+ elif isinstance(index_with_missing, (CategoricalIndex, MultiIndex)):
+ pytest.xfail("missing value sorting order not defined for index type")
+
+ missing_count = np.sum(index_with_missing.isna())
+ not_na_vals = index_with_missing[index_with_missing.notna()].values
+ sorted_values = np.sort(not_na_vals)
+ if na_position == "first":
+ sorted_values = np.concatenate([[None] * missing_count, sorted_values])
+ else:
+ sorted_values = np.concatenate([sorted_values, [None] * missing_count])
+ expected = type(index_with_missing)(sorted_values)
+
+ result = index_with_missing.sort_values(na_position=na_position)
+ tm.assert_index_equal(result, expected)
diff --git a/pandas/tests/indexes/test_numeric.py b/pandas/tests/indexes/test_numeric.py
index 33de0800658f2..7fa7a571d2571 100644
--- a/pandas/tests/indexes/test_numeric.py
+++ b/pandas/tests/indexes/test_numeric.py
@@ -21,6 +21,13 @@ def test_can_hold_identifiers(self):
key = idx[0]
assert idx._can_hold_identifiers_and_holds_name(key) is False
+ def test_format(self):
+ # GH35439
+ idx = self.create_index()
+ max_width = max(len(str(x)) for x in idx)
+ expected = [str(x).ljust(max_width) for x in idx]
+ assert idx.format() == expected
+
def test_numeric_compat(self):
pass # override Base method
@@ -77,10 +84,14 @@ def test_index_groupby(self):
expected = {ex_keys[0]: idx[[0, 5]], ex_keys[1]: idx[[1, 4]]}
tm.assert_dict_equal(idx.groupby(to_groupby), expected)
- def test_insert(self, nulls_fixture):
+ def test_insert_na(self, nulls_fixture):
# GH 18295 (test missing)
index = self.create_index()
- expected = Float64Index([index[0], np.nan] + list(index[1:]))
+
+ if nulls_fixture is pd.NaT:
+ expected = Index([index[0], pd.NaT] + list(index[1:]), dtype=object)
+ else:
+ expected = Float64Index([index[0], np.nan] + list(index[1:]))
result = index.insert(1, nulls_fixture)
tm.assert_index_equal(result, expected)
@@ -239,6 +250,19 @@ def test_equals_numeric(self):
i2 = Float64Index([1.0, np.nan])
assert i.equals(i2)
+ @pytest.mark.parametrize(
+ "other",
+ (
+ Int64Index([1, 2]),
+ Index([1.0, 2.0], dtype=object),
+ Index([1, 2], dtype=object),
+ ),
+ )
+ def test_equals_numeric_other_index_type(self, other):
+ i = Float64Index([1.0, 2.0])
+ assert i.equals(other)
+ assert other.equals(i)
+
@pytest.mark.parametrize(
"vals",
[
@@ -374,7 +398,7 @@ def test_identical(self):
same_values_different_type = Index(i, dtype=object)
assert not i.identical(same_values_different_type)
- i = index.copy(dtype=object)
+ i = index.astype(dtype=object)
i = i.rename("foo")
same_values = Index(i, dtype=object)
assert same_values.identical(i)
@@ -382,7 +406,7 @@ def test_identical(self):
assert not i.identical(index)
assert Index(same_values, name="foo", dtype=object).identical(i)
- assert not index.copy(dtype=object).identical(index.copy(dtype=self._dtype))
+ assert not index.astype(dtype=object).identical(index.astype(dtype=self._dtype))
def test_union_noncomparable(self):
# corner case, non-Int64Index
@@ -611,7 +635,11 @@ def test_range_float_union_dtype():
tm.assert_index_equal(result, expected)
-def test_uint_index_does_not_convert_to_float64():
+@pytest.mark.parametrize(
+ "box",
+ [list, lambda x: np.array(x, dtype=object), lambda x: pd.Index(x, dtype=object)],
+)
+def test_uint_index_does_not_convert_to_float64(box):
# https://github.com/pandas-dev/pandas/issues/28279
# https://github.com/pandas-dev/pandas/issues/28023
series = pd.Series(
@@ -626,7 +654,7 @@ def test_uint_index_does_not_convert_to_float64():
],
)
- result = series.loc[[7606741985629028552, 17876870360202815256]]
+ result = series.loc[box([7606741985629028552, 17876870360202815256])]
expected = UInt64Index(
[7606741985629028552, 17876870360202815256, 17876870360202815256],
@@ -635,3 +663,44 @@ def test_uint_index_does_not_convert_to_float64():
tm.assert_index_equal(result.index, expected)
tm.assert_equal(result, series[:3])
+
+
+def test_float64_index_equals():
+ # https://github.com/pandas-dev/pandas/issues/35217
+ float_index = pd.Index([1.0, 2, 3])
+ string_index = pd.Index(["1", "2", "3"])
+
+ result = float_index.equals(string_index)
+ assert result is False
+
+ result = string_index.equals(float_index)
+ assert result is False
+
+
+def test_float64_index_difference():
+ # https://github.com/pandas-dev/pandas/issues/35217
+ float_index = pd.Index([1.0, 2, 3])
+ string_index = pd.Index(["1", "2", "3"])
+
+ result = float_index.difference(string_index)
+ tm.assert_index_equal(result, float_index)
+
+ result = string_index.difference(float_index)
+ tm.assert_index_equal(result, string_index)
+
+
+class TestGetSliceBounds:
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ @pytest.mark.parametrize("side, expected", [("left", 4), ("right", 5)])
+ def test_get_slice_bounds_within(self, kind, side, expected):
+ index = Index(range(6))
+ result = index.get_slice_bound(4, kind=kind, side=side)
+ assert result == expected
+
+ @pytest.mark.parametrize("kind", ["getitem", "loc", None])
+ @pytest.mark.parametrize("side", ["left", "right"])
+ @pytest.mark.parametrize("bound, expected", [(-1, 0), (10, 6)])
+ def test_get_slice_bounds_outside(self, kind, side, expected, bound):
+ index = Index(range(6))
+ result = index.get_slice_bound(bound, kind=kind, side=side)
+ assert result == expected
diff --git a/pandas/tests/indexes/test_numpy_compat.py b/pandas/tests/indexes/test_numpy_compat.py
index 043539c173427..b71417b2a625d 100644
--- a/pandas/tests/indexes/test_numpy_compat.py
+++ b/pandas/tests/indexes/test_numpy_compat.py
@@ -1,6 +1,8 @@
import numpy as np
import pytest
+from pandas.compat.numpy import np_version_under1p17, np_version_under1p18
+
from pandas import (
DatetimeIndex,
Float64Index,
@@ -9,8 +11,6 @@
PeriodIndex,
TimedeltaIndex,
UInt64Index,
- _np_version_under1p17,
- _np_version_under1p18,
)
import pandas._testing as tm
from pandas.core.indexes.datetimelike import DatetimeIndexOpsMixin
@@ -83,12 +83,12 @@ def test_numpy_ufuncs_other(index, func):
if func in [np.isfinite, np.isnan, np.isinf]:
pytest.xfail(reason="__array_ufunc__ is not defined")
- if not _np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]:
+ if not np_version_under1p18 and func in [np.isfinite, np.isinf, np.isnan]:
# numpy 1.18(dev) changed isinf and isnan to not raise on dt64/td64
result = func(index)
assert isinstance(result, np.ndarray)
- elif not _np_version_under1p17 and func in [np.isfinite]:
+ elif not np_version_under1p17 and func in [np.isfinite]:
# ok under numpy >= 1.17
# Results in bool array
result = func(index)
@@ -114,18 +114,3 @@ def test_numpy_ufuncs_other(index, func):
else:
with pytest.raises(Exception):
func(index)
-
-
-def test_elementwise_comparison_warning():
- # https://github.com/pandas-dev/pandas/issues/22698#issuecomment-458968300
- # np.array([1, 2]) == 'a' returns False, and produces a
- # FutureWarning that it'll be [False, False] in the future.
- # We just want to ensure that comes through.
- # When NumPy dev actually enforces this change, we'll need to skip
- # this test.
- idx = Index([1, 2])
- with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
- result = idx == "a"
-
- expected = np.array([False, False])
- tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/indexes/timedeltas/test_scalar_compat.py b/pandas/tests/indexes/timedeltas/test_scalar_compat.py
index 16c19b8d00380..6a2238d90b590 100644
--- a/pandas/tests/indexes/timedeltas/test_scalar_compat.py
+++ b/pandas/tests/indexes/timedeltas/test_scalar_compat.py
@@ -104,18 +104,18 @@ def test_round(self):
"L",
t1a,
TimedeltaIndex(
- ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"],
+ ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"]
),
),
(
"S",
t1a,
TimedeltaIndex(
- ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"],
+ ["-1 days +00:00:00", "-2 days +23:58:58", "-2 days +23:57:56"]
),
),
- ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),),
- ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"],),),
+ ("12T", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])),
+ ("H", t1c, TimedeltaIndex(["-1 days", "-1 days", "-1 days"])),
("d", t1c, TimedeltaIndex([-1, -1, -1], unit="D")),
]:
diff --git a/pandas/tests/indexes/timedeltas/test_searchsorted.py b/pandas/tests/indexes/timedeltas/test_searchsorted.py
index 4806a9acff96f..3cf45931cf6b7 100644
--- a/pandas/tests/indexes/timedeltas/test_searchsorted.py
+++ b/pandas/tests/indexes/timedeltas/test_searchsorted.py
@@ -17,7 +17,7 @@ def test_searchsorted_different_argument_classes(self, klass):
tm.assert_numpy_array_equal(result, expected)
@pytest.mark.parametrize(
- "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2],
+ "arg", [[1, 2], ["a", "b"], [Timestamp("2020-01-01", tz="Europe/London")] * 2]
)
def test_searchsorted_invalid_argument_dtype(self, arg):
idx = TimedeltaIndex(["1 day", "2 days", "3 days"])
diff --git a/pandas/tests/indexing/common.py b/pandas/tests/indexing/common.py
index 9cc031001f81c..656d25bec2a6b 100644
--- a/pandas/tests/indexing/common.py
+++ b/pandas/tests/indexing/common.py
@@ -144,9 +144,7 @@ def check_values(self, f, func, values=False):
tm.assert_almost_equal(result, expected)
- def check_result(
- self, method, key, typs=None, axes=None, fails=None,
- ):
+ def check_result(self, method, key, typs=None, axes=None, fails=None):
def _eq(axis, obj, key):
""" compare equal for these 2 keys """
axified = _axify(obj, key, axis)
diff --git a/pandas/tests/indexing/interval/test_interval.py b/pandas/tests/indexing/interval/test_interval.py
index 634020982b1c2..8976e87a1b75a 100644
--- a/pandas/tests/indexing/interval/test_interval.py
+++ b/pandas/tests/indexing/interval/test_interval.py
@@ -71,6 +71,7 @@ def test_non_matching(self):
with pytest.raises(KeyError, match="^$"):
s.loc[[-1, 3]]
+ @pytest.mark.arm_slow
def test_large_series(self):
s = Series(
np.arange(1000000), index=IntervalIndex.from_breaks(np.arange(1000001))
diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
index d3b13336e2a44..62c0171fe641f 100644
--- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
+++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py
@@ -49,6 +49,7 @@ def test_cache_updating():
assert result == 2
+@pytest.mark.arm_slow
def test_indexer_caching():
# GH5727
# make sure that indexers are in the _internal_names_set
diff --git a/pandas/tests/indexing/multiindex/test_datetime.py b/pandas/tests/indexing/multiindex/test_datetime.py
index 907d20cd5bd53..a49cb0bc2c43e 100644
--- a/pandas/tests/indexing/multiindex/test_datetime.py
+++ b/pandas/tests/indexing/multiindex/test_datetime.py
@@ -2,7 +2,16 @@
import numpy as np
-from pandas import Index, Period, Series, period_range
+from pandas import (
+ DataFrame,
+ Index,
+ MultiIndex,
+ Period,
+ Series,
+ period_range,
+ to_datetime,
+)
+import pandas._testing as tm
def test_multiindex_period_datetime():
@@ -20,3 +29,22 @@ def test_multiindex_period_datetime():
# try datetime as index
result = s.loc["a", datetime(2012, 1, 1)]
assert result == expected
+
+
+def test_multiindex_datetime_columns():
+ # GH35015, using datetime as column indices raises exception
+
+ mi = MultiIndex.from_tuples(
+ [(to_datetime("02/29/2020"), to_datetime("03/01/2020"))], names=["a", "b"]
+ )
+
+ df = DataFrame([], columns=mi)
+
+ expected_df = DataFrame(
+ [],
+ columns=MultiIndex.from_arrays(
+ [[to_datetime("02/29/2020")], [to_datetime("03/01/2020")]], names=["a", "b"]
+ ),
+ )
+
+ tm.assert_frame_equal(df, expected_df)
diff --git a/pandas/tests/indexing/multiindex/test_indexing_slow.py b/pandas/tests/indexing/multiindex/test_indexing_slow.py
index ea4453b8dd6eb..d8e56661b7d61 100644
--- a/pandas/tests/indexing/multiindex/test_indexing_slow.py
+++ b/pandas/tests/indexing/multiindex/test_indexing_slow.py
@@ -15,7 +15,7 @@ def test_multiindex_get_loc(): # GH7724, GH2646
with warnings.catch_warnings(record=True):
# test indexing into a multi-index before & past the lexsort depth
- from numpy.random import randint, choice, randn
+ from numpy.random import choice, randint, randn
cols = ["jim", "joe", "jolie", "joline", "jolia"]
@@ -34,12 +34,15 @@ def validate(mi, df, key):
right = df[mask].copy()
if i + 1 != len(key): # partial key
- right.drop(cols[: i + 1], axis=1, inplace=True)
- right.set_index(cols[i + 1 : -1], inplace=True)
+ return_value = right.drop(cols[: i + 1], axis=1, inplace=True)
+ assert return_value is None
+ return_value = right.set_index(cols[i + 1 : -1], inplace=True)
+ assert return_value is None
tm.assert_frame_equal(mi.loc[key[: i + 1]], right)
else: # full key
- right.set_index(cols[:-1], inplace=True)
+ return_value = right.set_index(cols[:-1], inplace=True)
+ assert return_value is None
if len(right) == 1: # single hit
right = Series(
right["jolia"].values, name=right.index[0], index=["jolia"]
diff --git a/pandas/tests/indexing/multiindex/test_ix.py b/pandas/tests/indexing/multiindex/test_ix.py
index 01b0b392d52a3..abf989324e4a5 100644
--- a/pandas/tests/indexing/multiindex/test_ix.py
+++ b/pandas/tests/indexing/multiindex/test_ix.py
@@ -35,7 +35,8 @@ def test_loc_general(self):
tm.assert_frame_equal(df.loc[key], df.iloc[2:])
# this is ok
- df.sort_index(inplace=True)
+ return_value = df.sort_index(inplace=True)
+ assert return_value is None
res = df.loc[key]
# col has float dtype, result should be Float64Index
diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py
index f0cbdbe8d0564..63983f45d7832 100644
--- a/pandas/tests/indexing/multiindex/test_loc.py
+++ b/pandas/tests/indexing/multiindex/test_loc.py
@@ -491,3 +491,22 @@ def test_loc_datetime_mask_slicing():
),
)
tm.assert_series_equal(result, expected)
+
+
+def test_loc_with_mi_indexer():
+ # https://github.com/pandas-dev/pandas/issues/35351
+ df = DataFrame(
+ data=[["a", 1], ["a", 0], ["b", 1], ["c", 2]],
+ index=MultiIndex.from_tuples(
+ [(0, 1), (1, 0), (1, 1), (1, 1)], names=["index", "date"]
+ ),
+ columns=["author", "price"],
+ )
+ idx = MultiIndex.from_tuples([(0, 1), (1, 1)], names=["index", "date"])
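+ # (1, 1) occurs twice in df, so the selection returns three rows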
+ result = df.loc[idx, :]
+ expected = DataFrame(
+ [["a", 1], ["b", 1], ["c", 2]],
+ index=MultiIndex.from_tuples([(0, 1), (1, 1), (1, 1)], names=["index", "date"]),
+ columns=["author", "price"],
+ )
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/indexing/multiindex/test_multiindex.py b/pandas/tests/indexing/multiindex/test_multiindex.py
index 5e5fcd3db88d8..4565d79c632de 100644
--- a/pandas/tests/indexing/multiindex/test_multiindex.py
+++ b/pandas/tests/indexing/multiindex/test_multiindex.py
@@ -1,4 +1,5 @@
import numpy as np
+import pytest
import pandas._libs.index as _index
from pandas.errors import PerformanceWarning
@@ -83,3 +84,10 @@ def test_nested_tuples_duplicates(self):
df3 = df.copy(deep=True)
df3.loc[[(dti[0], "a")], "c2"] = 1.0
tm.assert_frame_equal(df3, expected)
+
+ def test_multiindex_get_loc_list_raises(self):
+ # https://github.com/pandas-dev/pandas/issues/35878
+ idx = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
+ msg = "unhashable type"
+ with pytest.raises(TypeError, match=msg):
+ idx.get_loc([])
diff --git a/pandas/tests/indexing/multiindex/test_slice.py b/pandas/tests/indexing/multiindex/test_slice.py
index 532bb4f2e6dac..ec0391a2ccc26 100644
--- a/pandas/tests/indexing/multiindex/test_slice.py
+++ b/pandas/tests/indexing/multiindex/test_slice.py
@@ -6,7 +6,7 @@
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, Timestamp
import pandas._testing as tm
-from pandas.core.indexing import _non_reducing_slice
+from pandas.core.indexing import non_reducing_slice
from pandas.tests.indexing.common import _mklbl
@@ -739,7 +739,7 @@ def test_non_reducing_slice_on_multiindex(self):
df = pd.DataFrame(dic, index=[0, 1])
idx = pd.IndexSlice
slice_ = idx[:, idx["b", "d"]]
- tslice_ = _non_reducing_slice(slice_)
+ tslice_ = non_reducing_slice(slice_)
result = df.loc[tslice_]
expected = pd.DataFrame({("b", "d"): [4, 1]})
diff --git a/pandas/tests/indexing/multiindex/test_sorted.py b/pandas/tests/indexing/multiindex/test_sorted.py
index fdeb3ce95b0bb..bafe5068e1418 100644
--- a/pandas/tests/indexing/multiindex/test_sorted.py
+++ b/pandas/tests/indexing/multiindex/test_sorted.py
@@ -43,8 +43,14 @@ def test_frame_getitem_not_sorted2(self, key):
df2 = df.set_index(["col1", "col2"])
df2_original = df2.copy()
- df2.index.set_levels(["b", "d", "a"], level="col1", inplace=True)
- df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True)
+ with tm.assert_produces_warning(FutureWarning):
+ return_value = df2.index.set_levels(
+ ["b", "d", "a"], level="col1", inplace=True
+ )
+ assert return_value is None
+ with tm.assert_produces_warning(FutureWarning):
+ return_value = df2.index.set_codes([0, 1, 0, 2], level="col1", inplace=True)
+ assert return_value is None
assert not df2.index.is_lexsorted()
assert not df2.index.is_monotonic
diff --git a/pandas/tests/indexing/multiindex/test_xs.py b/pandas/tests/indexing/multiindex/test_xs.py
index ff748d755c063..91be1d913001b 100644
--- a/pandas/tests/indexing/multiindex/test_xs.py
+++ b/pandas/tests/indexing/multiindex/test_xs.py
@@ -1,7 +1,7 @@
import numpy as np
import pytest
-from pandas import DataFrame, Index, MultiIndex, Series, concat, date_range
+from pandas import DataFrame, Index, IndexSlice, MultiIndex, Series, concat, date_range
import pandas._testing as tm
import pandas.core.common as com
@@ -220,6 +220,27 @@ def test_xs_level_series_slice_not_implemented(
s[2000, 3:4]
+def test_xs_IndexSlice_argument_not_implemented():
+ # GH 35301
+
+ index = MultiIndex(
+ levels=[[("foo", "bar", 0), ("foo", "baz", 0), ("foo", "qux", 0)], [0, 1]],
+ codes=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]],
+ )
+
+ series = Series(np.random.randn(6), index=index)
+ frame = DataFrame(np.random.randn(6, 4), index=index)
+
+ msg = (
+ "Expected label or tuple of labels, got "
+ r"\(\('foo', 'qux', 0\), slice\(None, None, None\)\)"
+ )
+ with pytest.raises(TypeError, match=msg):
+ frame.xs(IndexSlice[("foo", "qux", 0), :])
+ with pytest.raises(TypeError, match=msg):
+ series.xs(IndexSlice[("foo", "qux", 0), :])
+
+
def test_series_getitem_multiindex_xs():
# GH6258
dt = list(date_range("20130903", periods=3))
@@ -237,9 +258,11 @@ def test_series_getitem_multiindex_xs_by_label():
[("a", "one"), ("a", "two"), ("b", "one"), ("b", "two")]
)
s = Series([1, 2, 3, 4], index=idx)
- s.index.set_names(["L1", "L2"], inplace=True)
+ return_value = s.index.set_names(["L1", "L2"], inplace=True)
+ assert return_value is None
expected = Series([1, 3], index=["a", "b"])
- expected.index.set_names(["L1"], inplace=True)
+ return_value = expected.index.set_names(["L1"], inplace=True)
+ assert return_value is None
result = s.xs("one", level="L2")
tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/indexing/test_callable.py b/pandas/tests/indexing/test_callable.py
index 621417eb38d94..bf51c3e5d1695 100644
--- a/pandas/tests/indexing/test_callable.py
+++ b/pandas/tests/indexing/test_callable.py
@@ -17,15 +17,11 @@ def test_frame_loc_callable(self):
res = df.loc[lambda x: x.A > 2]
tm.assert_frame_equal(res, df.loc[df.A > 2])
- res = df.loc[
- lambda x: x.A > 2,
- ] # noqa: E231
- tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231
+ res = df.loc[lambda x: x.A > 2]
+ tm.assert_frame_equal(res, df.loc[df.A > 2])
- res = df.loc[
- lambda x: x.A > 2,
- ] # noqa: E231
- tm.assert_frame_equal(res, df.loc[df.A > 2,]) # noqa: E231
+ res = df.loc[lambda x: x.A > 2]
+ tm.assert_frame_equal(res, df.loc[df.A > 2])
res = df.loc[lambda x: x.B == "b", :]
tm.assert_frame_equal(res, df.loc[df.B == "b", :])
@@ -94,10 +90,8 @@ def test_frame_loc_callable_labels(self):
res = df.loc[lambda x: ["A", "C"]]
tm.assert_frame_equal(res, df.loc[["A", "C"]])
- res = df.loc[
- lambda x: ["A", "C"],
- ] # noqa: E231
- tm.assert_frame_equal(res, df.loc[["A", "C"],]) # noqa: E231
+ res = df.loc[lambda x: ["A", "C"]]
+ tm.assert_frame_equal(res, df.loc[["A", "C"]])
res = df.loc[lambda x: ["A", "C"], :]
tm.assert_frame_equal(res, df.loc[["A", "C"], :])
diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
index fa5fe5ba5c384..66835c586e6c7 100644
--- a/pandas/tests/indexing/test_chaining_and_caching.py
+++ b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -81,6 +81,21 @@ def test_setitem_cache_updating(self):
tm.assert_frame_equal(out, expected)
tm.assert_series_equal(out["A"], expected["A"])
+ def test_altering_series_clears_parent_cache(self):
+ # GH #33675
+ df = pd.DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"])
+ ser = df["A"]
+
+ assert "A" in df._item_cache
+
+ # Adding a new entry to ser swaps in a new array, so "A" needs to
+ # be removed from df._item_cache
+ ser["c"] = 5
+ assert len(ser) == 3
+ assert "A" not in df._item_cache
+ assert df["A"] is not ser
+ assert len(df["A"]) == 2
+
class TestChaining:
def test_setitem_chained_setfault(self):
@@ -117,6 +132,7 @@ def test_setitem_chained_setfault(self):
result = df.head()
tm.assert_frame_equal(result, expected)
+ @pytest.mark.arm_slow
def test_detect_chained_assignment(self):
pd.set_option("chained_assignment", "raise")
diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py
index 69d4065234d93..865ecb129cdfa 100644
--- a/pandas/tests/indexing/test_check_indexer.py
+++ b/pandas/tests/indexing/test_check_indexer.py
@@ -32,7 +32,7 @@ def test_valid_input(indexer, expected):
@pytest.mark.parametrize(
- "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")],
+ "indexer", [[True, False, None], pd.array([True, False, None], dtype="boolean")]
)
def test_boolean_na_returns_indexer(indexer):
# https://github.com/pandas-dev/pandas/issues/31503
@@ -61,7 +61,7 @@ def test_bool_raise_length(indexer):
@pytest.mark.parametrize(
- "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")],
+ "indexer", [[0, 1, None], pd.array([0, 1, pd.NA], dtype="Int64")]
)
def test_int_raise_missing_values(indexer):
array = np.array([1, 2, 3])
@@ -89,9 +89,7 @@ def test_raise_invalid_array_dtypes(indexer):
check_array_indexer(array, indexer)
-@pytest.mark.parametrize(
- "indexer", [None, Ellipsis, slice(0, 3), (None,)],
-)
+@pytest.mark.parametrize("indexer", [None, Ellipsis, slice(0, 3), (None,)])
def test_pass_through_non_array_likes(indexer):
array = np.array([1, 2, 3])
diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
index 1512c88a68778..752ecd47fe089 100644
--- a/pandas/tests/indexing/test_coercion.py
+++ b/pandas/tests/indexing/test_coercion.py
@@ -5,7 +5,7 @@
import numpy as np
import pytest
-import pandas.compat as compat
+from pandas.compat import IS64, is_platform_windows
import pandas as pd
import pandas._testing as tm
@@ -87,7 +87,7 @@ def _assert_setitem_series_conversion(
# tm.assert_series_equal(temp, expected_series)
@pytest.mark.parametrize(
- "val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, object)],
+ "val,exp_dtype", [(1, object), (1.1, object), (1 + 1j, object), (True, object)]
)
def test_setitem_series_object(self, val, exp_dtype):
obj = pd.Series(list("abcd"))
@@ -1041,7 +1041,7 @@ def test_replace_series(self, how, to_key, from_key):
from_key == "complex128" and to_key in ("int64", "float64")
):
- if compat.is_platform_32bit() or compat.is_platform_windows():
+ if not IS64 or is_platform_windows():
pytest.skip(f"32-bit platform buggy: {from_key} -> {to_key}")
# Expected: do not downcast by replacement
diff --git a/pandas/tests/indexing/test_floats.py b/pandas/tests/indexing/test_floats.py
index 18b9898e7d800..c48e0a129e161 100644
--- a/pandas/tests/indexing/test_floats.py
+++ b/pandas/tests/indexing/test_floats.py
@@ -181,9 +181,7 @@ def test_scalar_with_mixed(self):
expected = 3
assert result == expected
- @pytest.mark.parametrize(
- "index_func", [tm.makeIntIndex, tm.makeRangeIndex],
- )
+ @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex])
@pytest.mark.parametrize("klass", [Series, DataFrame])
def test_scalar_integer(self, index_func, klass):
@@ -405,7 +403,7 @@ def test_slice_integer(self):
@pytest.mark.parametrize("l", [slice(2, 4.0), slice(2.0, 4), slice(2.0, 4.0)])
def test_integer_positional_indexing(self, l):
- """ make sure that we are raising on positional indexing
+ """make sure that we are raising on positional indexing
w.r.t. an integer index
"""
s = Series(range(2, 6), index=range(2, 6))
@@ -425,9 +423,7 @@ def test_integer_positional_indexing(self, l):
with pytest.raises(TypeError, match=msg):
s.iloc[l]
- @pytest.mark.parametrize(
- "index_func", [tm.makeIntIndex, tm.makeRangeIndex],
- )
+ @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex])
def test_slice_integer_frame_getitem(self, index_func):
# similar to above, but on the getitem dim (of a DataFrame)
@@ -486,9 +482,7 @@ def test_slice_integer_frame_getitem(self, index_func):
s[l]
@pytest.mark.parametrize("l", [slice(3.0, 4), slice(3, 4.0), slice(3.0, 4.0)])
- @pytest.mark.parametrize(
- "index_func", [tm.makeIntIndex, tm.makeRangeIndex],
- )
+ @pytest.mark.parametrize("index_func", [tm.makeIntIndex, tm.makeRangeIndex])
def test_float_slice_getitem_with_integer_index_raises(self, l, index_func):
# similar to above, but on the getitem dim (of a DataFrame)
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index c5f40102874dd..d3d455f83c41a 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -56,7 +56,7 @@ def test_is_scalar_access(self):
assert ser.iloc._is_scalar_access((1,))
df = ser.to_frame()
- assert df.iloc._is_scalar_access((1, 0,))
+ assert df.iloc._is_scalar_access((1, 0))
def test_iloc_exceeds_bounds(self):
@@ -369,6 +369,20 @@ def test_iloc_setitem_dups(self):
df.iloc[[1, 0], [0, 1]] = df.iloc[[1, 0], [0, 1]].reset_index(drop=True)
tm.assert_frame_equal(df, expected)
+ def test_iloc_setitem_frame_duplicate_columns_multiple_blocks(self):
+ # Same as the "assign back to self" check in test_iloc_setitem_dups
+ # but on a DataFrame with multiple blocks
+ df = pd.DataFrame([[0, 1], [2, 3]], columns=["B", "B"])
+
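+ # casting one of the duplicate columns to float splits the frame onto two blocks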
+ df.iloc[:, 0] = df.iloc[:, 0].astype("f8")
+ assert len(df._mgr.blocks) == 2
+ expected = df.copy()
+
+ # assign back to self
+ df.iloc[[0, 1], [0, 1]] = df.iloc[[0, 1], [0, 1]]
+
+ tm.assert_frame_equal(df, expected)
+
# TODO: GH#27620 this test used to compare iloc against ix; check if this
# is redundant with another test comparing iloc against loc
def test_iloc_getitem_frame(self):
@@ -694,6 +708,7 @@ def test_series_indexing_zerodim_np_array(self):
result = s.iloc[np.array(0)]
assert result == 1
+ @pytest.mark.xfail(reason="https://github.com/pandas-dev/pandas/issues/33457")
def test_iloc_setitem_categorical_updates_inplace(self):
# Mixed dtype ensures we go through take_split_path in setitem_with_indexer
cat = pd.Categorical(["A", "B", "C"])
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index ced70069dd955..0cc61cd7df389 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -12,7 +12,7 @@
import pandas as pd
from pandas import DataFrame, Index, NaT, Series
import pandas._testing as tm
-from pandas.core.indexing import _maybe_numeric_slice, _non_reducing_slice
+from pandas.core.indexing import maybe_numeric_slice, non_reducing_slice
from pandas.tests.indexing.common import _mklbl
# ------------------------------------------------------------------------
@@ -745,7 +745,7 @@ def run_tests(df, rhs, right):
# make frames multi-type & re-run tests
for frame in [df, rhs, right]:
frame["joe"] = frame["joe"].astype("float64")
- frame["jolie"] = frame["jolie"].map("@{0}".format)
+ frame["jolie"] = frame["jolie"].map("@{}".format)
run_tests(df, rhs, right)
@@ -822,7 +822,7 @@ def test_range_in_series_indexing(self, size):
def test_non_reducing_slice(self, slc):
df = DataFrame([[0, 1], [2, 3]])
- tslice_ = _non_reducing_slice(slc)
+ tslice_ = non_reducing_slice(slc)
assert isinstance(df.loc[tslice_], DataFrame)
def test_list_slice(self):
@@ -831,18 +831,18 @@ def test_list_slice(self):
df = DataFrame({"A": [1, 2], "B": [3, 4]}, index=["A", "B"])
expected = pd.IndexSlice[:, ["A"]]
for subset in slices:
- result = _non_reducing_slice(subset)
+ result = non_reducing_slice(subset)
tm.assert_frame_equal(df.loc[result], df.loc[expected])
def test_maybe_numeric_slice(self):
df = DataFrame({"A": [1, 2], "B": ["c", "d"], "C": [True, False]})
- result = _maybe_numeric_slice(df, slice_=None)
+ result = maybe_numeric_slice(df, slice_=None)
expected = pd.IndexSlice[:, ["A"]]
assert result == expected
- result = _maybe_numeric_slice(df, None, include_bool=True)
+ result = maybe_numeric_slice(df, None, include_bool=True)
expected = pd.IndexSlice[:, ["A", "C"]]
- result = _maybe_numeric_slice(df, [1])
+ result = maybe_numeric_slice(df, [1])
expected = [1]
assert result == expected
@@ -1004,7 +1004,7 @@ def test_extension_array_cross_section():
def test_extension_array_cross_section_converts():
# all numeric columns -> numeric series
df = pd.DataFrame(
- {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"],
+ {"A": pd.array([1, 2], dtype="Int64"), "B": np.array([1, 2])}, index=["a", "b"]
)
result = df.loc["a"]
expected = pd.Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
@@ -1100,3 +1100,13 @@ def test_long_text_missing_labels_inside_loc_error_message_limited():
error_message_regex = "long_missing_label_text_0.*\\\\n.*long_missing_label_text_1"
with pytest.raises(KeyError, match=error_message_regex):
s.loc[["a", "c"] + missing_labels]
+
+
+def test_setitem_categorical():
+ # https://github.com/pandas-dev/pandas/issues/35369
+ df = pd.DataFrame({"h": pd.Series(list("mn")).astype("category")})
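+ # attribute-style assignment must keep the reordered categories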
+ df.h = df.h.cat.reorder_categories(["n", "m"])
+ expected = pd.DataFrame(
+ {"h": pd.Categorical(["m", "n"]).reorder_categories(["n", "m"])}
+ )
+ tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 47980e88f76d4..9b9bca77e17ec 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -5,6 +5,8 @@
import numpy as np
import pytest
+from pandas.compat.numpy import is_numpy_dev
+
import pandas as pd
from pandas import DataFrame, Series, Timestamp, date_range
import pandas._testing as tm
@@ -27,13 +29,11 @@ def test_loc_getitem_label_out_of_range(self):
# out of range label
self.check_result(
- "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError,
+ "loc", "f", typs=["ints", "uints", "labels", "mixed", "ts"], fails=KeyError
)
self.check_result("loc", "f", typs=["floats"], fails=KeyError)
self.check_result("loc", "f", typs=["floats"], fails=KeyError)
- self.check_result(
- "loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError,
- )
+ self.check_result("loc", 20, typs=["ints", "uints", "mixed"], fails=KeyError)
self.check_result("loc", 20, typs=["labels"], fails=KeyError)
self.check_result("loc", 20, typs=["ts"], axes=0, fails=KeyError)
self.check_result("loc", 20, typs=["floats"], axes=0, fails=KeyError)
@@ -44,26 +44,24 @@ def test_loc_getitem_label_list(self):
pass
def test_loc_getitem_label_list_with_missing(self):
+ self.check_result("loc", [0, 1, 2], typs=["empty"], fails=KeyError)
self.check_result(
- "loc", [0, 1, 2], typs=["empty"], fails=KeyError,
- )
- self.check_result(
- "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError,
+ "loc", [0, 2, 10], typs=["ints", "uints", "floats"], axes=0, fails=KeyError
)
self.check_result(
- "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError,
+ "loc", [3, 6, 7], typs=["ints", "uints", "floats"], axes=1, fails=KeyError
)
# GH 17758 - MultiIndex and missing keys
self.check_result(
- "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError,
+ "loc", [(1, 3), (1, 4), (2, 5)], typs=["multi"], axes=0, fails=KeyError
)
def test_loc_getitem_label_list_fails(self):
# fails
self.check_result(
- "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError,
+ "loc", [20, 30, 40], typs=["ints", "uints"], axes=1, fails=KeyError
)
def test_loc_getitem_label_array_like(self):
@@ -93,18 +91,14 @@ def test_loc_getitem_label_slice(self):
)
self.check_result(
- "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError,
+ "loc", slice("20130102", "20130104"), typs=["ts"], axes=1, fails=TypeError
)
- self.check_result(
- "loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError,
- )
- self.check_result(
- "loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError,
- )
+ self.check_result("loc", slice(2, 8), typs=["mixed"], axes=0, fails=TypeError)
+ self.check_result("loc", slice(2, 8), typs=["mixed"], axes=1, fails=KeyError)
self.check_result(
- "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError,
+ "loc", slice(2, 4, 2), typs=["mixed"], axes=0, fails=TypeError
)
def test_setitem_from_duplicate_axis(self):
@@ -667,8 +661,7 @@ def test_loc_setitem_with_scalar_index(self, indexer, value):
(1, ["A", "B", "C"]),
np.array([7, 8, 9], dtype=np.int64),
pd.DataFrame(
- [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]],
- columns=["A", "B", "C"],
+ [[1, 2, np.nan], [7, 8, 9], [5, 6, np.nan]], columns=["A", "B", "C"]
),
),
(
@@ -792,6 +785,7 @@ def test_loc_non_unique(self):
expected = DataFrame({"A": [2, 4, 5], "B": [4, 6, 7]}, index=[1, 1, 2])
tm.assert_frame_equal(result, expected)
+ @pytest.mark.arm_slow
def test_loc_non_unique_memory_error(self):
# GH 4280
@@ -894,6 +888,22 @@ def test_identity_slice_returns_new_object(self):
original_series[:3] = [7, 8, 9]
assert all(sliced_series[:3] == [7, 8, 9])
+ def test_loc_copy_vs_view(self):
+ # GH 15631
+ x = DataFrame(zip(range(3), range(3)), columns=["a", "b"])
+
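+ # mutating the object returned by .loc must not write back into the parent frame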
+ y = x.copy()
+ q = y.loc[:, "a"]
+ q += 2
+
+ tm.assert_frame_equal(x, y)
+
+ z = x.copy()
+ q = z.loc[x.index, "a"]
+ q += 2
+
+ tm.assert_frame_equal(x, z)
+
def test_loc_uint64(self):
# GH20722
# Test whether loc accept uint64 max value as index.
@@ -929,6 +939,7 @@ def test_loc_setitem_empty_append(self):
df.loc[0, "x"] = expected.loc[0, "x"]
tm.assert_frame_equal(df, expected)
+ @pytest.mark.xfail(is_numpy_dev, reason="gh-35481")
def test_loc_setitem_empty_append_raises(self):
# GH6173, various appends to an empty dataframe
diff --git a/pandas/tests/indexing/test_partial.py b/pandas/tests/indexing/test_partial.py
index 350f86b4e9fd0..7afbbc2b9ab2b 100644
--- a/pandas/tests/indexing/test_partial.py
+++ b/pandas/tests/indexing/test_partial.py
@@ -660,3 +660,15 @@ def test_indexing_timeseries_regression(self):
expected = Series(rng, index=rng)
tm.assert_series_equal(result, expected)
+
+ def test_index_name_empty(self):
+ # GH 31368
+ df = pd.DataFrame({}, index=pd.RangeIndex(0, name="df_index"))
+ series = pd.Series(1.23, index=pd.RangeIndex(4, name="series_index"))
+
+ df["series"] = series
+ expected = pd.DataFrame(
+ {"series": [1.23] * 4}, index=pd.RangeIndex(4, name="df_index")
+ )
+
+ tm.assert_frame_equal(df, expected)
diff --git a/pandas/tests/internals/test_internals.py b/pandas/tests/internals/test_internals.py
index 5fd44d7cd74a9..1d73d1e35728b 100644
--- a/pandas/tests/internals/test_internals.py
+++ b/pandas/tests/internals/test_internals.py
@@ -377,7 +377,7 @@ def test_copy(self, mgr):
for blk, cp_blk in zip(mgr.blocks, cp.blocks):
# view assertion
- assert cp_blk.equals(blk)
+ tm.assert_equal(cp_blk.values, blk.values)
if isinstance(blk.values, np.ndarray):
assert cp_blk.values.base is blk.values.base
else:
@@ -389,7 +389,7 @@ def test_copy(self, mgr):
# copy assertion we either have a None for a base or in case of
# some blocks it is an array (e.g. datetimetz), but was copied
- assert cp_blk.equals(blk)
+ tm.assert_equal(cp_blk.values, blk.values)
if not isinstance(cp_blk.values, np.ndarray):
assert cp_blk.values._data.base is not blk.values._data.base
else:
@@ -892,16 +892,16 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
fill_value,
)
assert_reindex_indexer_is_ok(
- mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value,
+ mgr, ax, mgr.axes[ax][::-1], np.arange(mgr.shape[ax]), fill_value
)
assert_reindex_indexer_is_ok(
- mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value,
+ mgr, ax, mgr.axes[ax], np.arange(mgr.shape[ax])[::-1], fill_value
)
assert_reindex_indexer_is_ok(
mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 0, 0], fill_value
)
assert_reindex_indexer_is_ok(
- mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value,
+ mgr, ax, pd.Index(["foo", "bar", "baz"]), [-1, 0, -1], fill_value
)
assert_reindex_indexer_is_ok(
mgr,
@@ -913,7 +913,7 @@ def assert_reindex_indexer_is_ok(mgr, axis, new_labels, indexer, fill_value):
if mgr.shape[ax] >= 3:
assert_reindex_indexer_is_ok(
- mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value,
+ mgr, ax, pd.Index(["foo", "bar", "baz"]), [0, 1, 2], fill_value
)
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index fcee25c258efa..193baa8c3ed74 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -1,4 +1,7 @@
import os
+import shlex
+import subprocess
+import time
import pytest
@@ -31,10 +34,65 @@ def feather_file(datapath):
@pytest.fixture
-def s3_resource(tips_file, jsonl_file, feather_file):
+def s3so(worker_id):
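+ # each pytest-xdist worker talks to its own moto server; "master" (no xdist) maps to port 5555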
+ worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
+ return dict(client_kwargs={"endpoint_url": f"http://127.0.0.1:555{worker_id}/"})
+
+
+@pytest.fixture(scope="session")
+def s3_base(worker_id):
"""
Fixture for mocking S3 interaction.
+ Sets up moto server in separate process
+ """
+ pytest.importorskip("s3fs")
+ pytest.importorskip("boto3")
+ requests = pytest.importorskip("requests")
+
+ with tm.ensure_safe_environment_variables():
+ # temporary workaround as moto fails for botocore >= 1.11 otherwise,
+ # see https://github.com/spulec/moto/issues/1924 & 1952
+ os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
+ os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
+
+ pytest.importorskip("moto", minversion="1.3.14")
+ pytest.importorskip("flask") # server mode needs flask too
+
+ # Launching moto in server mode, i.e., as a separate process
+ # with an S3 endpoint on localhost
+
+ worker_id = "5" if worker_id == "master" else worker_id.lstrip("gw")
+ endpoint_port = f"555{worker_id}"
+ endpoint_uri = f"http://127.0.0.1:{endpoint_port}/"
+
+ # pipe to null to avoid logging in terminal
+ proc = subprocess.Popen(
+ shlex.split(f"moto_server s3 -p {endpoint_port}"), stdout=subprocess.DEVNULL
+ )
+
+ timeout = 5
+ while timeout > 0:
+ try:
+ # OK to go once server is accepting connections
+ r = requests.get(endpoint_uri)
+ if r.ok:
+ break
+ except Exception:
+ pass
+ timeout -= 0.1
+ time.sleep(0.1)
+ yield endpoint_uri
+
+ proc.terminate()
+ proc.wait()
+
+
+@pytest.fixture()
+def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
+ """
+ Sets up S3 bucket with contents
+
The primary bucket name is "pandas-test". The following datasets
are loaded.
@@ -46,45 +104,58 @@ def s3_resource(tips_file, jsonl_file, feather_file):
A private bucket "cant_get_it" is also created. The boto3 s3 resource
is yielded by the fixture.
"""
- s3fs = pytest.importorskip("s3fs")
- boto3 = pytest.importorskip("boto3")
-
- with tm.ensure_safe_environment_variables():
- # temporary workaround as moto fails for botocore >= 1.11 otherwise,
- # see https://github.com/spulec/moto/issues/1924 & 1952
- os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
- os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
-
- moto = pytest.importorskip("moto")
-
- test_s3_files = [
- ("tips#1.csv", tips_file),
- ("tips.csv", tips_file),
- ("tips.csv.gz", tips_file + ".gz"),
- ("tips.csv.bz2", tips_file + ".bz2"),
- ("items.jsonl", jsonl_file),
- ("simple_dataset.feather", feather_file),
- ]
-
- def add_tips_files(bucket_name):
- for s3_key, file_name in test_s3_files:
- with open(file_name, "rb") as f:
- conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f)
-
- try:
- s3 = moto.mock_s3()
- s3.start()
-
- # see gh-16135
- bucket = "pandas-test"
- conn = boto3.resource("s3", region_name="us-east-1")
-
- conn.create_bucket(Bucket=bucket)
- add_tips_files(bucket)
-
- conn.create_bucket(Bucket="cant_get_it", ACL="private")
- add_tips_files("cant_get_it")
- s3fs.S3FileSystem.clear_instance_cache()
- yield conn
- finally:
- s3.stop()
+ import boto3
+ import s3fs
+
+ test_s3_files = [
+ ("tips#1.csv", tips_file),
+ ("tips.csv", tips_file),
+ ("tips.csv.gz", tips_file + ".gz"),
+ ("tips.csv.bz2", tips_file + ".bz2"),
+ ("items.jsonl", jsonl_file),
+ ("simple_dataset.feather", feather_file),
+ ]
+
+ def add_tips_files(bucket_name):
+ for s3_key, file_name in test_s3_files:
+ with open(file_name, "rb") as f:
+ cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)
+
+ bucket = "pandas-test"
+ conn = boto3.resource("s3", endpoint_url=s3_base)
+ cli = boto3.client("s3", endpoint_url=s3_base)
+
+ try:
+ cli.create_bucket(Bucket=bucket)
+ except: # noqa
+ # OK if bucket already exists
+ pass
+ try:
+ cli.create_bucket(Bucket="cant_get_it", ACL="private")
+ except: # noqa
+ # OK if bucket already exists
+ pass
+ timeout = 2
+ while not cli.list_buckets()["Buckets"] and timeout > 0:
+ time.sleep(0.1)
+ timeout -= 0.1
+
+ add_tips_files(bucket)
+ add_tips_files("cant_get_it")
+ s3fs.S3FileSystem.clear_instance_cache()
+ yield conn
+
+ s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": s3_base})
+
+ try:
+ s3.rm(bucket, recursive=True)
+ except: # noqa
+ pass
+ try:
+ s3.rm("cant_get_it", recursive=True)
+ except: # noqa
+ pass
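+ # wait briefly for the buckets to disappear so the next test starts clean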
+ timeout = 2
+ while cli.list_buckets()["Buckets"] and timeout > 0:
+ time.sleep(0.1)
+ timeout -= 0.1
diff --git a/pandas/tests/io/data/excel/gh-35802.ods b/pandas/tests/io/data/excel/gh-35802.ods
new file mode 100755
index 0000000000000..f3ad061f1d995
Binary files /dev/null and b/pandas/tests/io/data/excel/gh-35802.ods differ
diff --git a/pandas/tests/io/data/excel/gh-36122.ods b/pandas/tests/io/data/excel/gh-36122.ods
new file mode 100755
index 0000000000000..3dfdaf976da45
Binary files /dev/null and b/pandas/tests/io/data/excel/gh-36122.ods differ
diff --git a/pandas/tests/io/data/excel/test_datetime_mi.ods b/pandas/tests/io/data/excel/test_datetime_mi.ods
new file mode 100644
index 0000000000000..c37c35060c650
Binary files /dev/null and b/pandas/tests/io/data/excel/test_datetime_mi.ods differ
diff --git a/pandas/tests/io/data/excel/test_datetime_mi.xls b/pandas/tests/io/data/excel/test_datetime_mi.xls
new file mode 100644
index 0000000000000..aeade05855919
Binary files /dev/null and b/pandas/tests/io/data/excel/test_datetime_mi.xls differ
diff --git a/pandas/tests/io/data/excel/test_datetime_mi.xlsb b/pandas/tests/io/data/excel/test_datetime_mi.xlsb
new file mode 100644
index 0000000000000..0984c020a4c54
Binary files /dev/null and b/pandas/tests/io/data/excel/test_datetime_mi.xlsb differ
diff --git a/pandas/tests/io/data/excel/test_datetime_mi.xlsm b/pandas/tests/io/data/excel/test_datetime_mi.xlsm
new file mode 100644
index 0000000000000..55fb88912afb9
Binary files /dev/null and b/pandas/tests/io/data/excel/test_datetime_mi.xlsm differ
diff --git a/pandas/tests/io/data/excel/test_datetime_mi.xlsx b/pandas/tests/io/data/excel/test_datetime_mi.xlsx
new file mode 100644
index 0000000000000..0ffee0a8b79a3
Binary files /dev/null and b/pandas/tests/io/data/excel/test_datetime_mi.xlsx differ
diff --git a/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle b/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle
new file mode 100644
index 0000000000000..f8df9afff6565
Binary files /dev/null and b/pandas/tests/io/data/legacy_pickle/1.1.0/1.1.0_x86_64_darwin_3.8.5.pickle differ
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 955db982f8300..4bdcc5b327fa7 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -1,9 +1,7 @@
-import contextlib
from datetime import datetime, time
from functools import partial
import os
from urllib.error import URLError
-import warnings
import numpy as np
import pytest
@@ -14,22 +12,6 @@
from pandas import DataFrame, Index, MultiIndex, Series
import pandas._testing as tm
-
-@contextlib.contextmanager
-def ignore_xlrd_time_clock_warning():
- """
- Context manager to ignore warnings raised by the xlrd library,
- regarding the deprecation of `time.clock` in Python 3.7.
- """
- with warnings.catch_warnings():
- warnings.filterwarnings(
- action="ignore",
- message="time.clock has been deprecated",
- category=DeprecationWarning,
- )
- yield
-
-
read_ext_params = [".xls", ".xlsx", ".xlsm", ".xlsb", ".ods"]
engine_params = [
# Add any engines to test here
@@ -134,21 +116,19 @@ def test_usecols_int(self, read_ext, df_ref):
# usecols as int
msg = "Passing an integer for `usecols`"
with pytest.raises(ValueError, match=msg):
- with ignore_xlrd_time_clock_warning():
- pd.read_excel(
- "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
- )
+ pd.read_excel(
+ "test1" + read_ext, sheet_name="Sheet1", index_col=0, usecols=3
+ )
# usecols as int
with pytest.raises(ValueError, match=msg):
- with ignore_xlrd_time_clock_warning():
- pd.read_excel(
- "test1" + read_ext,
- sheet_name="Sheet2",
- skiprows=[1],
- index_col=0,
- usecols=3,
- )
+ pd.read_excel(
+ "test1" + read_ext,
+ sheet_name="Sheet2",
+ skiprows=[1],
+ index_col=0,
+ usecols=3,
+ )
def test_usecols_list(self, read_ext, df_ref):
if pd.read_excel.keywords["engine"] == "pyxlsb":
@@ -519,6 +499,23 @@ def test_reader_spaces(self, read_ext):
)
tm.assert_frame_equal(actual, expected)
+ # gh-36122, gh-35802
+ @pytest.mark.parametrize(
+ "basename,expected",
+ [
+ ("gh-35802", DataFrame({"COLUMN": ["Test (1)"]})),
+ ("gh-36122", DataFrame(columns=["got 2nd sa"])),
+ ],
+ )
+ def test_read_excel_ods_nested_xml(self, read_ext, basename, expected):
+ # see gh-35802
+ engine = pd.read_excel.keywords["engine"]
+ if engine != "odf":
+ pytest.skip(f"Skipped for engine: {engine}")
+
+ actual = pd.read_excel(basename + read_ext)
+ tm.assert_frame_equal(actual, expected)
+
def test_reading_all_sheets(self, read_ext):
# Test reading all sheet names by setting sheet_name to None,
# Ensure a dict is returned.
@@ -597,8 +594,7 @@ def test_sheet_name(self, read_ext, df_ref):
df1 = pd.read_excel(
filename + read_ext, sheet_name=sheet_name, index_col=0
) # doc
- with ignore_xlrd_time_clock_warning():
- df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
+ df2 = pd.read_excel(filename + read_ext, index_col=0, sheet_name=sheet_name)
tm.assert_frame_equal(df1, df_ref, check_names=False)
tm.assert_frame_equal(df2, df_ref, check_names=False)
@@ -627,13 +623,14 @@ def test_read_from_http_url(self, read_ext):
tm.assert_frame_equal(url_table, local_table)
@td.skip_if_not_us_locale
- def test_read_from_s3_url(self, read_ext, s3_resource):
+ def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
# Bucket "pandas-test" created in tests/io/conftest.py
with open("test1" + read_ext, "rb") as f:
s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
url = "s3://pandas-test/test1" + read_ext
- url_table = pd.read_excel(url)
+
+ url_table = pd.read_excel(url, storage_options=s3so)
local_table = pd.read_excel("test1" + read_ext)
tm.assert_frame_equal(url_table, local_table)
@@ -897,7 +894,7 @@ def test_read_excel_bool_header_arg(self, read_ext):
with pytest.raises(TypeError, match=msg):
pd.read_excel("test1" + read_ext, header=arg)
- def test_read_excel_skiprows_list(self, read_ext):
+ def test_read_excel_skiprows(self, read_ext):
# GH 4903
if pd.read_excel.keywords["engine"] == "pyxlsb":
pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
@@ -923,6 +920,31 @@ def test_read_excel_skiprows_list(self, read_ext):
)
tm.assert_frame_equal(actual, expected)
+ # GH36435
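+ # skiprows also accepts a callable evaluated per row index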
+ actual = pd.read_excel(
+ "testskiprows" + read_ext,
+ sheet_name="skiprows_list",
+ skiprows=lambda x: x in [0, 2],
+ )
+ tm.assert_frame_equal(actual, expected)
+
+ actual = pd.read_excel(
+ "testskiprows" + read_ext,
+ sheet_name="skiprows_list",
+ skiprows=3,
+ names=["a", "b", "c", "d"],
+ )
+ expected = DataFrame(
+ [
+ # [1, 2.5, pd.Timestamp("2015-01-01"), True],
+ [2, 3.5, pd.Timestamp("2015-01-02"), False],
+ [3, 4.5, pd.Timestamp("2015-01-03"), False],
+ [4, 5.5, pd.Timestamp("2015-01-04"), True],
+ ],
+ columns=["a", "b", "c", "d"],
+ )
+ tm.assert_frame_equal(actual, expected)
+
def test_read_excel_nrows(self, read_ext):
# GH 16645
num_rows_to_pull = 5
@@ -968,6 +990,19 @@ def test_deprecated_kwargs(self, read_ext):
pd.read_excel("test1" + read_ext)
+ def test_no_header_with_list_index_col(self, read_ext):
+ # GH 31783
+ file_name = "testmultiindex" + read_ext
+ data = [("B", "B"), ("key", "val"), (3, 4), (3, 4)]
+ idx = pd.MultiIndex.from_tuples(
+ [("A", "A"), ("key", "val"), (1, 2), (1, 2)], names=(0, 1)
+ )
+ expected = pd.DataFrame(data, index=idx, columns=(2, 3))
+ result = pd.read_excel(
+ file_name, sheet_name="index_col_none", index_col=[0, 1], header=None
+ )
+ tm.assert_frame_equal(expected, result)
+
class TestExcelFileRead:
@pytest.fixture(autouse=True)
@@ -1143,3 +1178,22 @@ def test_header_with_index_col(self, engine, filename):
filename, sheet_name="Sheet1", index_col=0, header=[0, 1]
)
tm.assert_frame_equal(expected, result)
+
+ def test_read_datetime_multiindex(self, engine, read_ext):
+ # GH 34748
+ if engine == "pyxlsb":
+ pytest.xfail("Sheets containing datetimes not supported by pyxlsb")
+
+ f = "test_datetime_mi" + read_ext
+ with pd.ExcelFile(f) as excel:
+ actual = pd.read_excel(excel, header=[0, 1], index_col=0, engine=engine)
+ expected_column_index = pd.MultiIndex.from_tuples(
+ [(pd.to_datetime("02/29/2020"), pd.to_datetime("03/01/2020"))],
+ names=[
+ pd.to_datetime("02/29/2020").to_pydatetime(),
+ pd.to_datetime("03/01/2020").to_pydatetime(),
+ ],
+ )
+ expected = pd.DataFrame([], columns=expected_column_index)
+
+ tm.assert_frame_equal(expected, actual)
diff --git a/pandas/tests/io/formats/test_css.py b/pandas/tests/io/formats/test_css.py
index 9383f86e335fa..785904fafd31a 100644
--- a/pandas/tests/io/formats/test_css.py
+++ b/pandas/tests/io/formats/test_css.py
@@ -99,11 +99,11 @@ def test_css_side_shorthands(shorthand, expansions):
top, right, bottom, left = expansions
assert_resolves(
- f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"},
+ f"{shorthand}: 1pt", {top: "1pt", right: "1pt", bottom: "1pt", left: "1pt"}
)
assert_resolves(
- f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"},
+ f"{shorthand}: 1pt 4pt", {top: "1pt", right: "4pt", bottom: "1pt", left: "4pt"}
)
assert_resolves(
@@ -189,9 +189,7 @@ def test_css_absolute_font_size(size, relative_to, resolved):
inherited = None
else:
inherited = {"font-size": relative_to}
- assert_resolves(
- f"font-size: {size}", {"font-size": resolved}, inherited=inherited,
- )
+ assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)
@pytest.mark.parametrize(
@@ -225,6 +223,4 @@ def test_css_relative_font_size(size, relative_to, resolved):
inherited = None
else:
inherited = {"font-size": relative_to}
- assert_resolves(
- f"font-size: {size}", {"font-size": resolved}, inherited=inherited,
- )
+ assert_resolves(f"font-size: {size}", {"font-size": resolved}, inherited=inherited)
diff --git a/pandas/tests/io/formats/test_format.py b/pandas/tests/io/formats/test_format.py
index 3c40a2ae8d6b8..cce0783a3c867 100644
--- a/pandas/tests/io/formats/test_format.py
+++ b/pandas/tests/io/formats/test_format.py
@@ -18,7 +18,8 @@
import pytest
import pytz
-from pandas.compat import is_platform_32bit, is_platform_windows
+from pandas.compat import IS64, is_platform_windows
+import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
@@ -40,7 +41,7 @@
import pandas.io.formats.format as fmt
import pandas.io.formats.printing as printing
-use_32bit_repr = is_platform_windows() or is_platform_32bit()
+use_32bit_repr = is_platform_windows() or not IS64
@pytest.fixture(params=["string", "pathlike", "buffer"])
@@ -225,7 +226,7 @@ def test_repr_truncation(self):
r = repr(df)
r = r[r.find("\n") + 1 :]
- adj = fmt._get_adjustment()
+ adj = fmt.get_adjustment()
for line, value in zip(r.split("\n"), df["B"]):
if adj.len(value) + 1 > max_len:
@@ -647,7 +648,7 @@ def test_to_string_unicode_columns(self, float_frame):
assert isinstance(result, str)
def test_to_string_utf8_columns(self):
- n = "\u05d0".encode("utf-8")
+ n = "\u05d0".encode()
with option_context("display.max_rows", 1):
df = DataFrame([1, 2], columns=[n])
@@ -1545,11 +1546,11 @@ def test_to_string_no_index(self):
df_s = df.to_string(index=False)
# Leading space is expected for positive numbers.
- expected = " x y z\n 11 33 AAA\n 22 -44 "
+ expected = " x y z\n11 33 AAA\n22 -44 "
assert df_s == expected
df_s = df[["y", "x", "z"]].to_string(index=False)
- expected = " y x z\n 33 11 AAA\n-44 22 "
+ expected = " y x z\n 33 11 AAA\n-44 22 "
assert df_s == expected
def test_to_string_line_width_no_index(self):
@@ -1564,7 +1565,7 @@ def test_to_string_line_width_no_index(self):
df = DataFrame({"x": [11, 22, 33], "y": [4, 5, 6]})
df_s = df.to_string(line_width=1, index=False)
- expected = " x \\\n 11 \n 22 \n 33 \n\n y \n 4 \n 5 \n 6 "
+ expected = " x \\\n11 \n22 \n33 \n\n y \n 4 \n 5 \n 6 "
assert df_s == expected
@@ -2141,6 +2142,15 @@ def test_dict_entries(self):
assert "'a': 1" in val
assert "'b': 2" in val
+ def test_categorical_columns(self):
+ # GH35439
+ data = [[4, 2], [3, 2], [4, 3]]
+ cols = ["aaaaaaaaa", "b"]
+ df = pd.DataFrame(data, columns=cols)
+ df_cat_cols = pd.DataFrame(data, columns=pd.CategoricalIndex(cols))
+
+ assert df.to_string() == df_cat_cols.to_string()
+
def test_period(self):
# GH 12615
df = pd.DataFrame(
@@ -2259,7 +2269,7 @@ def test_to_string_without_index(self):
# GH 11729 Test index=False option
s = Series([1, 2, 3, 4])
result = s.to_string(index=False)
- expected = " 1\n" + " 2\n" + " 3\n" + " 4"
+ expected = "1\n" + "2\n" + "3\n" + "4"
assert result == expected
def test_unicode_name_in_footer(self):
@@ -2910,6 +2920,15 @@ def test_format(self):
assert result[0] == " 12.0"
assert result[1] == " 0.0"
+ def test_output_display_precision_trailing_zeroes(self):
+ # Issue #20359: trimming zeros while there is no decimal point
+
+ # Happens when display precision is set to zero
+ with pd.option_context("display.precision", 0):
+ s = pd.Series([840.0, 4200.0])
+ expected_output = "0 840\n1 4200\ndtype: float64"
+ assert str(s) == expected_output
+
def test_output_significant_digits(self):
# Issue #9764
@@ -3320,6 +3339,7 @@ def test_format_percentiles_integer_idx():
assert result == expected
+@td.check_file_leaks
def test_repr_html_ipython_config(ip):
code = textwrap.dedent(
"""\
@@ -3371,3 +3391,37 @@ def test_filepath_or_buffer_bad_arg_raises(float_frame, method):
msg = "buf is not a file name and it has no write method"
with pytest.raises(TypeError, match=msg):
getattr(float_frame, method)(buf=object())
+
+
+@pytest.mark.parametrize(
+ "input_array, expected",
+ [
+ ("a", "a"),
+ (["a", "b"], "a\nb"),
+ ([1, "a"], "1\na"),
+ (1, "1"),
+ ([0, -1], " 0\n-1"),
+ (1.0, "1.0"),
+ ([" a", " b"], " a\n b"),
+ ([".1", "1"], ".1\n 1"),
+ (["10", "-10"], " 10\n-10"),
+ ],
+)
+def test_format_remove_leading_space_series(input_array, expected):
+ # GH: 24980
+ s = pd.Series(input_array).to_string(index=False)
+ assert s == expected
+
+
+@pytest.mark.parametrize(
+ "input_array, expected",
+ [
+ ({"A": ["a"]}, "A\na"),
+ ({"A": ["a", "b"], "B": ["c", "dd"]}, "A B\na c\nb dd"),
+ ({"A": ["a", 1], "B": ["aa", 1]}, "A B\na aa\n1 1"),
+ ],
+)
+def test_format_remove_leading_space_dataframe(input_array, expected):
+ # GH: 24980
+ df = pd.DataFrame(input_array).to_string(index=False)
+ assert df == expected
diff --git a/pandas/tests/io/formats/test_info.py b/pandas/tests/io/formats/test_info.py
index 877bd1650ae60..7000daeb9b575 100644
--- a/pandas/tests/io/formats/test_info.py
+++ b/pandas/tests/io/formats/test_info.py
@@ -299,7 +299,7 @@ def test_info_memory_usage():
DataFrame(1, index=["a"], columns=["A"]).memory_usage(index=True)
DataFrame(1, index=["a"], columns=["A"]).index.nbytes
df = DataFrame(
- data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"],
+ data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
)
df.index.nbytes
df.memory_usage(index=True)
@@ -336,7 +336,7 @@ def test_info_memory_usage_deep_pypy():
@pytest.mark.skipif(PYPY, reason="PyPy getsizeof() fails by design")
def test_usage_via_getsizeof():
df = DataFrame(
- data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"],
+ data=1, index=MultiIndex.from_product([["a"], range(1000)]), columns=["A"]
)
mem = df.memory_usage(deep=True).sum()
# sys.getsizeof will call the .memory_usage with
@@ -359,16 +359,14 @@ def test_info_memory_usage_qualified():
buf = StringIO()
df = DataFrame(
- 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)]),
+ 1, columns=list("ab"), index=MultiIndex.from_product([range(3), range(3)])
)
df.info(buf=buf)
assert "+" not in buf.getvalue()
buf = StringIO()
df = DataFrame(
- 1,
- columns=list("ab"),
- index=MultiIndex.from_product([range(3), ["foo", "bar"]]),
+ 1, columns=list("ab"), index=MultiIndex.from_product([range(3), ["foo", "bar"]])
)
df.info(buf=buf)
assert "+" in buf.getvalue()
@@ -384,7 +382,7 @@ def memory_usage(f):
N = 100
M = len(uppercase)
index = MultiIndex.from_product(
- [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"],
+ [list(uppercase), date_range("20160101", periods=N)], names=["id", "date"]
)
df = DataFrame({"value": np.random.randn(N * M)}, index=index)
diff --git a/pandas/tests/io/formats/test_style.py b/pandas/tests/io/formats/test_style.py
index ec4614538004c..8d66a16fc2b7a 100644
--- a/pandas/tests/io/formats/test_style.py
+++ b/pandas/tests/io/formats/test_style.py
@@ -405,9 +405,10 @@ def f(x):
result = self.df.style.where(f, style1)._compute().ctx
expected = {
- (r, c): [style1 if f(self.df.loc[row, col]) else ""]
+ (r, c): [style1]
for r, row in enumerate(self.df.index)
for c, col in enumerate(self.df.columns)
+ if f(self.df.loc[row, col])
}
assert result == expected
@@ -966,7 +967,6 @@ def test_bar_align_mid_nans(self):
"transparent 25.0%, #d65f5f 25.0%, "
"#d65f5f 50.0%, transparent 50.0%)",
],
- (1, 0): [""],
(0, 1): [
"width: 10em",
" height: 80%",
@@ -994,7 +994,6 @@ def test_bar_align_zero_nans(self):
"transparent 50.0%, #d65f5f 50.0%, "
"#d65f5f 75.0%, transparent 75.0%)",
],
- (1, 0): [""],
(0, 1): [
"width: 10em",
" height: 80%",
@@ -1091,7 +1090,7 @@ def test_format_with_bad_na_rep(self):
def test_highlight_null(self, null_color="red"):
df = pd.DataFrame({"A": [0, np.nan]})
result = df.style.highlight_null()._compute().ctx
- expected = {(0, 0): [""], (1, 0): ["background-color: red"]}
+ expected = {(1, 0): ["background-color: red"]}
assert result == expected
def test_highlight_null_subset(self):
@@ -1104,9 +1103,7 @@ def test_highlight_null_subset(self):
.ctx
)
expected = {
- (0, 0): [""],
(1, 0): ["background-color: red"],
- (0, 1): [""],
(1, 1): ["background-color: green"],
}
assert result == expected
@@ -1219,8 +1216,6 @@ def test_highlight_max(self):
expected = {
(1, 0): ["background-color: yellow"],
(1, 1): ["background-color: yellow"],
- (0, 1): [""],
- (0, 0): [""],
}
assert result == expected
@@ -1228,8 +1223,6 @@ def test_highlight_max(self):
expected = {
(0, 1): ["background-color: yellow"],
(1, 1): ["background-color: yellow"],
- (0, 0): [""],
- (1, 0): [""],
}
assert result == expected
@@ -1689,6 +1682,62 @@ def f(a, b, styler):
result = styler.pipe((f, "styler"), a=1, b=2)
assert result == (1, 2, styler)
+ def test_no_cell_ids(self):
+ # GH 35588
+ # GH 35663
+ df = pd.DataFrame(data=[[0]])
+ styler = Styler(df, uuid="_", cell_ids=False)
+ styler.render()
+ s = styler.render() # render twice to ensure ctx is not updated
+ assert s.find('<td  class="data row0 col0" >') != -1
+
+ @pytest.mark.parametrize(
+ "classes",
+ [
+ DataFrame(
+ data=[["", "test-class"], [np.nan, None]],
+ columns=["A", "B"],
+ index=["a", "b"],
+ ),
+ DataFrame(data=[["test-class"]], columns=["B"], index=["a"]),
+ DataFrame(data=[["test-class", "unused"]], columns=["B", "C"], index=["a"]),
+ ],
+ )
+ def test_set_data_classes(self, classes):
+ # GH 36159
+ df = DataFrame(data=[[0, 1], [2, 3]], columns=["A", "B"], index=["a", "b"])
+ s = Styler(df, uuid="_", cell_ids=False).set_td_classes(classes).render()
+ assert '<td  class="data row0 col0" >0</td>' in s
+ assert '<td  class="data row0 col1 test-class" >1</td>' in s
+ assert '<td  class="data row1 col0" >2</td>' in s
+ assert '<td  class="data row1 col1" >3</td>' in s
+
+ def test_colspan_w3(self):
+ # GH 36223
+ df = pd.DataFrame(data=[[1, 2]], columns=[["l0", "l0"], ["l1a", "l1b"]])
+ s = Styler(df, uuid="_", cell_ids=False)
+ assert '<th class="col_heading level0 col0" colspan="2">l0</th>' in s.render()
+
+ @pytest.mark.parametrize("len_", [1, 5, 32, 33, 100])
+ def test_uuid_len(self, len_):
+ # GH 36345
+ df = pd.DataFrame(data=[["A"]])
+ s = Styler(df, uuid_len=len_, cell_ids=False).render()
+ strt = s.find('id="T_')
+ end = s[strt + 6 :].find('"')
+ if len_ > 32:
+ assert end == 32 + 1
+ else:
+ assert end == len_ + 1
+
+ @pytest.mark.parametrize("len_", [-2, "bad", None])
+ def test_uuid_len_raises(self, len_):
+ # GH 36345
+ df = pd.DataFrame(data=[["A"]])
+ msg = "``uuid_len`` must be an integer in range \\[0, 32\\]."
+ with pytest.raises(TypeError, match=msg):
+ Styler(df, uuid_len=len_, cell_ids=False).render()
+
@td.skip_if_no_mpl
class TestStylerMatplotlibDep:
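
The removed ctx entries above reflect a behavior change in Styler: cells with no styles are no longer recorded as empty strings, so only styled cells appear in the computed context. A minimal sketch of the new shape, assuming a pandas build with this change (the private _compute/ctx attributes are used here only because the tests above rely on them):

import numpy as np
import pandas as pd

df = pd.DataFrame({"A": [0, np.nan]})
ctx = df.style.highlight_null()._compute().ctx
# only the styled (null) cell is present; no (0, 0): [""] placeholder
print(ctx)  # {(1, 0): ['background-color: red']}
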
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
index 4c86e3a16b135..e2ceb95d77053 100644
--- a/pandas/tests/io/formats/test_to_csv.py
+++ b/pandas/tests/io/formats/test_to_csv.py
@@ -11,10 +11,6 @@
class TestToCSV:
- @pytest.mark.xfail(
- (3, 6, 5) > sys.version_info,
- reason=("Python csv library bug (see https://bugs.python.org/issue32255)"),
- )
def test_to_csv_with_single_column(self):
# see gh-18676, https://bugs.python.org/issue32255
#
@@ -30,7 +26,7 @@ def test_to_csv_with_single_column(self):
"""
with tm.ensure_clean("test.csv") as path:
df1.to_csv(path, header=None, index=None)
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected1
df2 = DataFrame([1, None])
@@ -40,7 +36,7 @@ def test_to_csv_with_single_column(self):
"""
with tm.ensure_clean("test.csv") as path:
df2.to_csv(path, header=None, index=None)
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected2
def test_to_csv_default_encoding(self):
@@ -62,7 +58,7 @@ def test_to_csv_quotechar(self):
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1) # 1=QUOTE_ALL
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected
expected = """\
@@ -73,7 +69,7 @@ def test_to_csv_quotechar(self):
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, quotechar="$")
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected
with tm.ensure_clean("test.csv") as path:
@@ -90,7 +86,7 @@ def test_to_csv_doublequote(self):
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected
from _csv import Error
@@ -109,7 +105,7 @@ def test_to_csv_escapechar(self):
with tm.ensure_clean("test.csv") as path: # QUOTE_ALL
df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected
df = DataFrame({"col": ["a,a", ",bb,"]})
@@ -121,7 +117,7 @@ def test_to_csv_escapechar(self):
with tm.ensure_clean("test.csv") as path:
df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected
def test_csv_to_string(self):
@@ -346,7 +342,7 @@ def test_to_csv_string_array_ascii(self):
"""
with tm.ensure_clean("str_test.csv") as path:
df.to_csv(path, encoding="ascii")
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected_ascii
def test_to_csv_string_array_utf8(self):
@@ -360,7 +356,7 @@ def test_to_csv_string_array_utf8(self):
"""
with tm.ensure_clean("unicode_test.csv") as path:
df.to_csv(path, encoding="utf-8")
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected_utf8
def test_to_csv_string_with_lf(self):
@@ -471,7 +467,7 @@ def test_to_csv_write_to_open_file(self):
with open(path, "w") as f:
f.write("manual header\n")
df.to_csv(f, header=None, index=None)
- with open(path, "r") as f:
+ with open(path) as f:
assert f.read() == expected
def test_to_csv_write_to_open_file_with_newline_py3(self):
@@ -607,3 +603,39 @@ def test_to_csv_errors(self, errors):
ser.to_csv(path, errors=errors)
# No use in reading back the data as it is not the same anymore
# due to the error handling
+
+ def test_to_csv_binary_handle(self):
+ """
+ Binary file objects should work if 'mode' contains a 'b'.
+
+ GH 35058 and GH 19827
+ """
+ df = tm.makeDataFrame()
+ with tm.ensure_clean() as path:
+ with open(path, mode="w+b") as handle:
+ df.to_csv(handle, mode="w+b")
+ tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
+ def test_to_csv_encoding_binary_handle(self):
+ """
+ Binary file objects should honor a specified encoding.
+
+ GH 23854 and GH 13068 with binary handles
+ """
+ # example from GH 23854
+ content = "a, b, 🐟".encode("utf-8-sig")
+ buffer = io.BytesIO(content)
+ df = pd.read_csv(buffer, encoding="utf-8-sig")
+
+ buffer = io.BytesIO()
+ df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False)
+ buffer.seek(0) # tests whether file handle wasn't closed
+ assert buffer.getvalue().startswith(content)
+
+ # example from GH 13068
+ with tm.ensure_clean() as path:
+ with open(path, "w+b") as handle:
+ pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig")
+
+ handle.seek(0)
+ assert handle.read().startswith(b'\xef\xbb\xbf""')
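
A minimal sketch of the binary-handle support the two new tests exercise, assuming a pandas version that accepts binary file objects in to_csv when mode contains "b":

import io
import pandas as pd

df = pd.DataFrame({"a": [1], "b": [2]})
buffer = io.BytesIO()
df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False)
buffer.seek(0)  # the handle is left open for the caller
assert buffer.getvalue().startswith(b"\xef\xbb\xbf")  # UTF-8 BOM was written
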
diff --git a/pandas/tests/io/formats/test_to_html.py b/pandas/tests/io/formats/test_to_html.py
index e85fd398964d0..7acdbfd462874 100644
--- a/pandas/tests/io/formats/test_to_html.py
+++ b/pandas/tests/io/formats/test_to_html.py
@@ -137,7 +137,7 @@ def test_to_html_encoding(float_frame, tmp_path):
# GH 28663
path = tmp_path / "test.html"
float_frame.to_html(path, encoding="gbk")
- with open(str(path), "r", encoding="gbk") as f:
+ with open(str(path), encoding="gbk") as f:
assert float_frame.to_html() == f.read()
diff --git a/pandas/tests/io/formats/test_to_latex.py b/pandas/tests/io/formats/test_to_latex.py
index 509e5bcb33304..8df8796d236a5 100644
--- a/pandas/tests/io/formats/test_to_latex.py
+++ b/pandas/tests/io/formats/test_to_latex.py
@@ -7,13 +7,21 @@
from pandas import DataFrame, Series
import pandas._testing as tm
+from pandas.io.formats.format import DataFrameFormatter
+from pandas.io.formats.latex import (
+ RegularTableBuilder,
+ RowBodyIterator,
+ RowHeaderIterator,
+ RowStringConverter,
+)
+
class TestToLatex:
def test_to_latex_filename(self, float_frame):
with tm.ensure_clean("test.tex") as path:
float_frame.to_latex(path)
- with open(path, "r") as f:
+ with open(path) as f:
assert float_frame.to_latex() == f.read()
# test with utf-8 and encoding option (GH 7061)
@@ -50,16 +58,26 @@ def test_to_latex(self, float_frame):
withoutindex_result = df.to_latex(index=False)
withoutindex_expected = r"""\begin{tabular}{rl}
\toprule
- a & b \\
+ a & b \\
\midrule
- 1 & b1 \\
- 2 & b2 \\
+ 1 & b1 \\
+ 2 & b2 \\
\bottomrule
\end{tabular}
"""
assert withoutindex_result == withoutindex_expected
+ @pytest.mark.parametrize(
+ "bad_column_format",
+ [5, 1.2, ["l", "r"], ("r", "c"), {"r", "c", "l"}, dict(a="r", b="l")],
+ )
+ def test_to_latex_bad_column_format(self, bad_column_format):
+ df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+ msg = r"column_format must be str or unicode"
+ with pytest.raises(ValueError, match=msg):
+ df.to_latex(column_format=bad_column_format)
+
def test_to_latex_format(self, float_frame):
# GH Bug #9402
float_frame.to_latex(column_format="ccc")
@@ -393,6 +411,11 @@ def test_to_latex_longtable(self):
df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
withindex_result = df.to_latex(longtable=True)
withindex_expected = r"""\begin{longtable}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+\endfirsthead
+
\toprule
{} & a & b \\
\midrule
@@ -413,7 +436,12 @@ def test_to_latex_longtable(self):
withoutindex_result = df.to_latex(index=False, longtable=True)
withoutindex_expected = r"""\begin{longtable}{rl}
\toprule
- a & b \\
+ a & b \\
+\midrule
+\endfirsthead
+
+\toprule
+ a & b \\
\midrule
\endhead
\midrule
@@ -423,8 +451,8 @@ def test_to_latex_longtable(self):
\bottomrule
\endlastfoot
- 1 & b1 \\
- 2 & b2 \\
+ 1 & b1 \\
+ 2 & b2 \\
\end{longtable}
"""
@@ -507,6 +535,9 @@ def test_to_latex_longtable_caption_label(self):
df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+ # test when no caption and no label is provided
+ # is performed by test_to_latex_longtable()
+
# test when only the caption is provided
result_c = df.to_latex(longtable=True, caption=the_caption)
@@ -515,6 +546,11 @@ def test_to_latex_longtable_caption_label(self):
\toprule
{} & a & b \\
\midrule
+\endfirsthead
+\caption[]{a table in a \texttt{longtable} environment} \\
+\toprule
+{} & a & b \\
+\midrule
\endhead
\midrule
\multicolumn{3}{r}{{Continued on next page}} \\
@@ -534,6 +570,11 @@ def test_to_latex_longtable_caption_label(self):
expected_l = r"""\begin{longtable}{lrl}
\label{tab:longtable}\\
+\toprule
+{} & a & b \\
+\midrule
+\endfirsthead
+
\toprule
{} & a & b \\
\midrule
@@ -555,7 +596,13 @@ def test_to_latex_longtable_caption_label(self):
result_cl = df.to_latex(longtable=True, caption=the_caption, label=the_label)
expected_cl = r"""\begin{longtable}{lrl}
-\caption{a table in a \texttt{longtable} environment}\label{tab:longtable}\\
+\caption{a table in a \texttt{longtable} environment}
+\label{tab:longtable}\\
+\toprule
+{} & a & b \\
+\midrule
+\endfirsthead
+\caption[]{a table in a \texttt{longtable} environment} \\
\toprule
{} & a & b \\
\midrule
@@ -573,6 +620,59 @@ def test_to_latex_longtable_caption_label(self):
"""
assert result_cl == expected_cl
+ def test_to_latex_position(self):
+ the_position = "h"
+
+ df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+ # test when only the position is provided
+ result_p = df.to_latex(position=the_position)
+
+ expected_p = r"""\begin{table}[h]
+\centering
+\begin{tabular}{lrl}
+\toprule
+{} & a & b \\
+\midrule
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\bottomrule
+\end{tabular}
+\end{table}
+"""
+ assert result_p == expected_p
+
+ def test_to_latex_longtable_position(self):
+ the_position = "t"
+
+ df = DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+ # test when only the position is provided
+ result_p = df.to_latex(longtable=True, position=the_position)
+
+ expected_p = r"""\begin{longtable}[t]{lrl}
+\toprule
+{} & a & b \\
+\midrule
+\endfirsthead
+
+\toprule
+{} & a & b \\
+\midrule
+\endhead
+\midrule
+\multicolumn{3}{r}{{Continued on next page}} \\
+\midrule
+\endfoot
+
+\bottomrule
+\endlastfoot
+0 & 1 & b1 \\
+1 & 2 & b2 \\
+\end{longtable}
+"""
+ assert result_p == expected_p
+
def test_to_latex_escape_special_chars(self):
special_characters = ["&", "%", "$", "#", "_", "{", "}", "~", "^", "\\"]
df = DataFrame(data=special_characters)
@@ -614,8 +714,8 @@ def test_to_latex_no_header(self):
withoutindex_result = df.to_latex(index=False, header=False)
withoutindex_expected = r"""\begin{tabular}{rl}
\toprule
- 1 & b1 \\
- 2 & b2 \\
+1 & b1 \\
+2 & b2 \\
\bottomrule
\end{tabular}
"""
@@ -641,10 +741,10 @@ def test_to_latex_specified_header(self):
withoutindex_result = df.to_latex(header=["AA", "BB"], index=False)
withoutindex_expected = r"""\begin{tabular}{rl}
\toprule
-AA & BB \\
+AA & BB \\
\midrule
- 1 & b1 \\
- 2 & b2 \\
+ 1 & b1 \\
+ 2 & b2 \\
\bottomrule
\end{tabular}
"""
@@ -881,3 +981,87 @@ def test_to_latex_multindex_header(self):
\end{tabular}
"""
assert observed == expected
+
+
+class TestTableBuilder:
+ @pytest.fixture
+ def dataframe(self):
+ return DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
+
+ @pytest.fixture
+ def table_builder(self, dataframe):
+ return RegularTableBuilder(formatter=DataFrameFormatter(dataframe))
+
+ def test_create_row_iterator(self, table_builder):
+ iterator = table_builder._create_row_iterator(over="header")
+ assert isinstance(iterator, RowHeaderIterator)
+
+ def test_create_body_iterator(self, table_builder):
+ iterator = table_builder._create_row_iterator(over="body")
+ assert isinstance(iterator, RowBodyIterator)
+
+ def test_create_body_wrong_kwarg_raises(self, table_builder):
+ with pytest.raises(ValueError, match="must be either 'header' or 'body'"):
+ table_builder._create_row_iterator(over="SOMETHING BAD")
+
+
+class TestRowStringConverter:
+ @pytest.mark.parametrize(
+ "row_num, expected",
+ [
+ (0, r"{} & Design & ratio & xy \\"),
+ (1, r"0 & 1 & 4 & 10 \\"),
+ (2, r"1 & 2 & 5 & 11 \\"),
+ ],
+ )
+ def test_get_strrow_normal_without_escape(self, row_num, expected):
+ df = DataFrame({r"Design": [1, 2, 3], r"ratio": [4, 5, 6], r"xy": [10, 11, 12]})
+ row_string_converter = RowStringConverter(
+ formatter=DataFrameFormatter(df, escape=True),
+ )
+ assert row_string_converter.get_strrow(row_num=row_num) == expected
+
+ @pytest.mark.parametrize(
+ "row_num, expected",
+ [
+ (0, r"{} & Design \# & ratio, \% & x\&y \\"),
+ (1, r"0 & 1 & 4 & 10 \\"),
+ (2, r"1 & 2 & 5 & 11 \\"),
+ ],
+ )
+ def test_get_strrow_normal_with_escape(self, row_num, expected):
+ df = DataFrame(
+ {r"Design #": [1, 2, 3], r"ratio, %": [4, 5, 6], r"x&y": [10, 11, 12]}
+ )
+ row_string_converter = RowStringConverter(
+ formatter=DataFrameFormatter(df, escape=True),
+ )
+ assert row_string_converter.get_strrow(row_num=row_num) == expected
+
+ @pytest.mark.parametrize(
+ "row_num, expected",
+ [
+ (0, r"{} & \multicolumn{2}{r}{c1} & \multicolumn{2}{r}{c2} & c3 \\"),
+ (1, r"{} & 0 & 1 & 0 & 1 & 0 \\"),
+ (2, r"0 & 0 & 5 & 0 & 5 & 0 \\"),
+ ],
+ )
+ def test_get_strrow_multindex_multicolumn(self, row_num, expected):
+ df = DataFrame(
+ {
+ ("c1", 0): {x: x for x in range(5)},
+ ("c1", 1): {x: x + 5 for x in range(5)},
+ ("c2", 0): {x: x for x in range(5)},
+ ("c2", 1): {x: x + 5 for x in range(5)},
+ ("c3", 0): {x: x for x in range(5)},
+ }
+ )
+
+ row_string_converter = RowStringConverter(
+ formatter=DataFrameFormatter(df),
+ multicolumn=True,
+ multicolumn_format="r",
+ multirow=True,
+ )
+
+ assert row_string_converter.get_strrow(row_num=row_num) == expected
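
A short sketch of the position keyword the new tests cover; the value is emitted as the optional placement argument of the table or longtable environment (assuming a pandas version with these to_latex changes):

import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": ["b1", "b2"]})
tex = df.to_latex(longtable=True, caption="demo", label="tab:demo", position="t")
print(tex.splitlines()[0])  # \begin{longtable}[t]{lrl}
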
diff --git a/pandas/tests/io/formats/test_to_markdown.py b/pandas/tests/io/formats/test_to_markdown.py
index 8893e4294353f..5223b313fef4f 100644
--- a/pandas/tests/io/formats/test_to_markdown.py
+++ b/pandas/tests/io/formats/test_to_markdown.py
@@ -3,6 +3,7 @@
import pytest
import pandas as pd
+import pandas._testing as tm
pytest.importorskip("tabulate")
@@ -53,3 +54,37 @@ def test_no_buf(capsys):
assert (
result == "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
)
+
+
+@pytest.mark.parametrize("index", [True, False, None])
+@pytest.mark.parametrize("showindex", [True, False, None])
+def test_index(index, showindex):
+ # GH 32667
+ kwargs = {}
+ if index is not None:
+ kwargs["index"] = index
+ if showindex is not None:
+ kwargs["showindex"] = showindex
+
+ df = pd.DataFrame([1, 2, 3])
+ yes_index_result = (
+ "| | 0 |\n|---:|----:|\n| 0 | 1 |\n| 1 | 2 |\n| 2 | 3 |"
+ )
+ no_index_result = "| 0 |\n|----:|\n| 1 |\n| 2 |\n| 3 |"
+
+ warning = FutureWarning if "showindex" in kwargs else None
+ with tm.assert_produces_warning(warning):
+ result = df.to_markdown(**kwargs)
+
+ if "showindex" in kwargs:
+ # give showindex higher priority if specified
+ if showindex:
+ expected = yes_index_result
+ else:
+ expected = no_index_result
+ else:
+ if index in [True, None]:
+ expected = yes_index_result
+ else:
+ expected = no_index_result
+ assert result == expected
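
The parametrized test encodes the intended precedence: the deprecated tabulate passthrough showindex, when given, overrides index and triggers a FutureWarning. A minimal sketch (requires the optional tabulate dependency):

import pandas as pd

df = pd.DataFrame([1, 2, 3])
# index=False drops the index column from the rendered markdown table
print(df.to_markdown(index=False))
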
diff --git a/pandas/tests/io/generate_legacy_storage_files.py b/pandas/tests/io/generate_legacy_storage_files.py
index e64103bd2cde8..61e1fc019faac 100644
--- a/pandas/tests/io/generate_legacy_storage_files.py
+++ b/pandas/tests/io/generate_legacy_storage_files.py
@@ -6,10 +6,10 @@
in ~/pandas
. activate pandas_0.20.3
-cd ~/
+cd ~/pandas/pandas
-$ python pandas/pandas/tests/io/generate_legacy_storage_files.py \
- pandas/pandas/tests/io/data/legacy_pickle/0.20.3/ pickle
+$ python -m tests.io.generate_legacy_storage_files \
+ tests/io/data/legacy_pickle/0.20.3/ pickle
This script generates a storage file for the current arch, system,
and python version
@@ -328,7 +328,7 @@ def write_legacy_pickles(output_dir):
pth = f"{platform_name()}.pickle"
fh = open(os.path.join(output_dir, pth), "wb")
- pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL)
+ pickle.dump(create_pickle_data(), fh, pickle.DEFAULT_PROTOCOL)
fh.close()
print(f"created pickle file: {pth}")
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 182c21ed1d416..a41af9886c617 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -34,7 +34,7 @@ def test_read_zipped_json(datapath):
@td.skip_if_not_us_locale
-def test_with_s3_url(compression, s3_resource):
+def test_with_s3_url(compression, s3_resource, s3so):
# Bucket "pandas-test" created in tests/io/conftest.py
df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')
@@ -44,7 +44,9 @@ def test_with_s3_url(compression, s3_resource):
with open(path, "rb") as f:
s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)
- roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression)
+ roundtripped_df = pd.read_json(
+ "s3://pandas-test/test-1", compression=compression, storage_options=s3so
+ )
tm.assert_frame_equal(df, roundtripped_df)
diff --git a/pandas/tests/io/json/test_json_table_schema.py b/pandas/tests/io/json/test_json_table_schema.py
index df64af6ac2265..8f1ed193b100f 100644
--- a/pandas/tests/io/json/test_json_table_schema.py
+++ b/pandas/tests/io/json/test_json_table_schema.py
@@ -1,6 +1,7 @@
"""Tests for Table Schema integration."""
from collections import OrderedDict
import json
+import sys
import numpy as np
import pytest
@@ -255,6 +256,9 @@ def test_read_json_from_to_json_results(self):
tm.assert_frame_equal(result1, df)
tm.assert_frame_equal(result2, df)
+ @pytest.mark.filterwarnings(
+ "ignore:an integer is required (got type float)*:DeprecationWarning"
+ )
def test_to_json(self):
df = self.df.copy()
df.index.name = "idx"
@@ -431,6 +435,9 @@ def test_to_json_categorical_index(self):
assert result == expected
+ @pytest.mark.filterwarnings(
+ "ignore:an integer is required (got type float)*:DeprecationWarning"
+ )
def test_date_format_raises(self):
with pytest.raises(ValueError):
self.df.to_json(orient="table", date_format="epoch")
@@ -671,6 +678,7 @@ class TestTableOrientReader:
{"bools": [True, False, False, True]},
],
)
+ @pytest.mark.skipif(sys.version_info[:3] == (3, 7, 0), reason="GH-35309")
def test_read_json_table_orient(self, index_nm, vals, recwarn):
df = DataFrame(vals, index=pd.Index(range(4), name=index_nm))
out = df.to_json(orient="table")
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 10f49b9b81528..13152f01abb04 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -9,11 +9,11 @@
import numpy as np
import pytest
-from pandas.compat import is_platform_32bit, is_platform_windows
+from pandas.compat import IS64, is_platform_windows
import pandas.util._test_decorators as td
import pandas as pd
-from pandas import DataFrame, DatetimeIndex, Series, Timestamp, read_json
+from pandas import DataFrame, DatetimeIndex, Series, Timestamp, compat, read_json
import pandas._testing as tm
_seriesd = tm.getSeriesData()
@@ -35,6 +35,9 @@ def assert_json_roundtrip_equal(result, expected, orient):
tm.assert_frame_equal(result, expected)
+@pytest.mark.filterwarnings(
+ "ignore:an integer is required (got type float)*:DeprecationWarning"
+)
@pytest.mark.filterwarnings("ignore:the 'numpy' keyword is deprecated:FutureWarning")
class TestPandasContainer:
@pytest.fixture(autouse=True)
@@ -151,7 +154,7 @@ def test_roundtrip_intframe(self, orient, convert_axes, numpy, dtype, int_frame)
expected = int_frame
if (
numpy
- and (is_platform_32bit() or is_platform_windows())
+ and (not IS64 or is_platform_windows())
and not dtype
and orient != "split"
):
@@ -358,9 +361,7 @@ def test_frame_infinity(self, orient, inf, dtype):
result = read_json(df.to_json(), dtype=dtype)
assert np.isnan(result.iloc[0, 2])
- @pytest.mark.skipif(
- is_platform_32bit(), reason="not compliant on 32-bit, xref #15865"
- )
+ @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
@pytest.mark.parametrize(
"value,precision,expected_val",
[
@@ -744,11 +745,7 @@ def test_reconstruction_index(self):
def test_path(self, float_frame, int_frame, datetime_frame):
with tm.ensure_clean("test.json") as path:
- for df in [
- float_frame,
- int_frame,
- datetime_frame,
- ]:
+ for df in [float_frame, int_frame, datetime_frame]:
df.to_json(path)
read_json(path)
@@ -1210,10 +1207,12 @@ def test_read_inline_jsonl(self):
tm.assert_frame_equal(result, expected)
@td.skip_if_not_us_locale
- def test_read_s3_jsonl(self, s3_resource):
+ def test_read_s3_jsonl(self, s3_resource, s3so):
# GH17200
- result = read_json("s3n://pandas-test/items.jsonl", lines=True)
+ result = read_json(
+ "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
+ )
expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
tm.assert_frame_equal(result, expected)
@@ -1250,23 +1249,32 @@ def test_to_json_large_numbers(self, bigNum):
json = series.to_json()
expected = '{"articleId":' + str(bigNum) + "}"
assert json == expected
- # GH 20599
+
+ df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
+ json = df.to_json()
+ expected = '{"0":{"articleId":' + str(bigNum) + "}}"
+ assert json == expected
+
+ @pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+ @pytest.mark.skipif(not compat.IS64, reason="GH-35279")
+ def test_read_json_large_numbers(self, bigNum):
+ # GH20599
+
+ series = Series(bigNum, dtype=object, index=["articleId"])
+ json = '{"articleId":' + str(bigNum) + "}"
with pytest.raises(ValueError):
json = StringIO(json)
result = read_json(json)
tm.assert_series_equal(series, result)
df = DataFrame(bigNum, dtype=object, index=["articleId"], columns=[0])
- json = df.to_json()
- expected = '{"0":{"articleId":' + str(bigNum) + "}}"
- assert json == expected
- # GH 20599
+ json = '{"0":{"articleId":' + str(bigNum) + "}}"
with pytest.raises(ValueError):
json = StringIO(json)
result = read_json(json)
tm.assert_frame_equal(df, result)
- def test_read_json_large_numbers(self):
+ def test_read_json_large_numbers2(self):
# GH18842
json = '{"articleId": "1404366058080022500245"}'
json = StringIO(json)
@@ -1688,13 +1696,13 @@ def test_json_multiindex(self, dataframe, expected):
result = series.to_json(orient="index")
assert result == expected
- def test_to_s3(self, s3_resource):
+ def test_to_s3(self, s3_resource, s3so):
import time
# GH 28375
mock_bucket_name, target_file = "pandas-test", "test.json"
df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
- df.to_json(f"s3://{mock_bucket_name}/{target_file}")
+ df.to_json(f"s3://{mock_bucket_name}/{target_file}", storage_options=s3so)
timeout = 5
while True:
if target_file in (
diff --git a/pandas/tests/io/json/test_ujson.py b/pandas/tests/io/json/test_ujson.py
index 952c583040360..086c0b7ba08b2 100644
--- a/pandas/tests/io/json/test_ujson.py
+++ b/pandas/tests/io/json/test_ujson.py
@@ -15,7 +15,7 @@
import pandas._libs.json as ujson
from pandas._libs.tslib import Timestamp
-import pandas.compat as compat
+from pandas.compat import IS64, is_platform_windows
from pandas import DataFrame, DatetimeIndex, Index, NaT, Series, Timedelta, date_range
import pandas._testing as tm
@@ -53,7 +53,7 @@ def get_int32_compat_dtype(numpy, orient):
# See GH#32527
dtype = np.int64
if not ((numpy is None or orient == "index") or (numpy is True and orient is None)):
- if compat.is_platform_windows():
+ if is_platform_windows():
dtype = np.int32
else:
dtype = np.intp
@@ -62,9 +62,7 @@ def get_int32_compat_dtype(numpy, orient):
class TestUltraJSONTests:
- @pytest.mark.skipif(
- compat.is_platform_32bit(), reason="not compliant on 32-bit, xref #15865"
- )
+ @pytest.mark.skipif(not IS64, reason="not compliant on 32-bit, xref #15865")
def test_encode_decimal(self):
sut = decimal.Decimal("1337.1337")
encoded = ujson.encode(sut, double_precision=15)
@@ -561,6 +559,7 @@ def test_encode_long_conversion(self):
assert long_input == ujson.decode(output)
@pytest.mark.parametrize("bigNum", [sys.maxsize + 1, -(sys.maxsize + 2)])
+ @pytest.mark.xfail(not IS64, reason="GH-35288")
def test_dumps_ints_larger_than_maxsize(self, bigNum):
# GH34395
bigNum = sys.maxsize + 1
@@ -592,14 +591,14 @@ def test_decode_number_with_32bit_sign_bit(self, val):
def test_encode_big_escape(self):
# Make sure no Exception is raised.
for _ in range(10):
- base = "\u00e5".encode("utf-8")
+ base = "\u00e5".encode()
escape_input = base * 1024 * 1024 * 2
ujson.encode(escape_input)
def test_decode_big_escape(self):
# Make sure no Exception is raised.
for _ in range(10):
- base = "\u00e5".encode("utf-8")
+ base = "\u00e5".encode()
quote = b'"'
escape_input = quote + (base * 1024 * 1024 * 2) + quote
@@ -702,7 +701,7 @@ def test_int_array(self, any_int_dtype):
tm.assert_numpy_array_equal(arr_input, arr_output)
def test_int_max(self, any_int_dtype):
- if any_int_dtype in ("int64", "uint64") and compat.is_platform_32bit():
+ if any_int_dtype in ("int64", "uint64") and not IS64:
pytest.skip("Cannot test 64-bit integer on 32-bit platform")
klass = np.dtype(any_int_dtype).type
diff --git a/pandas/tests/io/parser/test_c_parser_only.py b/pandas/tests/io/parser/test_c_parser_only.py
index d76d01904731a..ae63b6af3a8b6 100644
--- a/pandas/tests/io/parser/test_c_parser_only.py
+++ b/pandas/tests/io/parser/test_c_parser_only.py
@@ -160,7 +160,9 @@ def test_precise_conversion(c_parser_only):
# 25 decimal digits of precision
text = f"a\n{num:.25}"
- normal_val = float(parser.read_csv(StringIO(text))["a"][0])
+ normal_val = float(
+ parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
+ )
precise_val = float(
parser.read_csv(StringIO(text), float_precision="high")["a"][0]
)
@@ -575,7 +577,7 @@ def test_file_handles_mmap(c_parser_only, csv1):
# Don't close user provided file handles.
parser = c_parser_only
- with open(csv1, "r") as f:
+ with open(csv1) as f:
m = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)
parser.read_csv(m)
@@ -606,3 +608,118 @@ def test_unix_style_breaks(c_parser_only):
result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
expected = DataFrame(columns=["col_1", "col_2", "col_3"])
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
+@pytest.mark.parametrize(
+ "data,thousands,decimal",
+ [
+ (
+ """A|B|C
+1|2,334.01|5
+10|13|10.
+""",
+ ",",
+ ".",
+ ),
+ (
+ """A|B|C
+1|2.334,01|5
+10|13|10,
+""",
+ ".",
+ ",",
+ ),
+ ],
+)
+def test_1000_sep_with_decimal(
+ c_parser_only, data, thousands, decimal, float_precision
+):
+ parser = c_parser_only
+ expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})
+
+ result = parser.read_csv(
+ StringIO(data),
+ sep="|",
+ thousands=thousands,
+ decimal=decimal,
+ float_precision=float_precision,
+ )
+ tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
+@pytest.mark.parametrize(
+ "value,expected",
+ [
+ ("-1,0", -1.0),
+ ("-1,2e0", -1.2),
+ ("-1e0", -1.0),
+ ("+1e0", 1.0),
+ ("+1e+0", 1.0),
+ ("+1e-1", 0.1),
+ ("+,1e1", 1.0),
+ ("+1,e0", 1.0),
+ ("-,1e1", -1.0),
+ ("-1,e0", -1.0),
+ ("0,1", 0.1),
+ ("1,", 1.0),
+ (",1", 0.1),
+ ("-,1", -0.1),
+ ("1_,", 1.0),
+ ("1_234,56", 1234.56),
+ ("1_234,56e0", 1234.56),
+ # negative cases; must not parse as float
+ ("_", "_"),
+ ("-_", "-_"),
+ ("-_1", "-_1"),
+ ("-_1e0", "-_1e0"),
+ ("_1", "_1"),
+ ("_1,", "_1,"),
+ ("_1,_", "_1,_"),
+ ("_1e0", "_1e0"),
+ ("1,2e_1", "1,2e_1"),
+ ("1,2e1_0", "1,2e1_0"),
+ ("1,_2", "1,_2"),
+ (",1__2", ",1__2"),
+ (",1e", ",1e"),
+ ("-,1e", "-,1e"),
+ ("1_000,000_000", "1_000,000_000"),
+ ("1,e1_2", "1,e1_2"),
+ ],
+)
+def test_1000_sep_decimal_float_precision(
+ c_parser_only, value, expected, float_precision
+):
+ # test decimal and thousands separator handling across 'float_precision'
+ # parsers
+ parser = c_parser_only
+ df = parser.read_csv(
+ StringIO(value),
+ sep="|",
+ thousands="_",
+ decimal=",",
+ header=None,
+ float_precision=float_precision,
+ )
+ val = df.iloc[0, 0]
+ assert val == expected
+
+
+def test_float_precision_options(c_parser_only):
+ # GH 17154, 36228
+ parser = c_parser_only
+ s = "foo\n243.164\n"
+ df = parser.read_csv(StringIO(s))
+ df2 = parser.read_csv(StringIO(s), float_precision="high")
+
+ tm.assert_frame_equal(df, df2)
+
+ df3 = parser.read_csv(StringIO(s), float_precision="legacy")
+
+ assert not df.iloc[0, 0] == df3.iloc[0, 0]
+
+ msg = "Unrecognized float_precision option: junk"
+
+ with pytest.raises(ValueError, match=msg):
+ parser.read_csv(StringIO(s), float_precision="junk")
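
A sketch enumerating the float_precision choices the new tests cover; per test_float_precision_options above, None and "high" agree while "legacy" may differ in the last digits, and any other value raises ValueError:

from io import StringIO
import pandas as pd

data = "a\n243.164\n"
for fp in (None, "high", "legacy", "round_trip"):
    val = pd.read_csv(StringIO(data), float_precision=fp)["a"][0]
    print(fp, repr(val))
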
diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py
index 12e73bae40eac..6bbc9bc9e1788 100644
--- a/pandas/tests/io/parser/test_common.py
+++ b/pandas/tests/io/parser/test_common.py
@@ -18,7 +18,7 @@
from pandas.errors import DtypeWarning, EmptyDataError, ParserError
import pandas.util._test_decorators as td
-from pandas import DataFrame, Index, MultiIndex, Series, compat, concat
+from pandas import DataFrame, Index, MultiIndex, Series, compat, concat, option_context
import pandas._testing as tm
from pandas.io.parsers import CParserWrapper, TextFileReader, TextParser
@@ -1138,6 +1138,7 @@ def test_parse_integers_above_fp_precision(all_parsers):
tm.assert_frame_equal(result, expected)
+@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False)
def test_chunks_have_consistent_numerical_type(all_parsers):
parser = all_parsers
integers = [str(i) for i in range(499999)]
@@ -1151,6 +1152,7 @@ def test_chunks_have_consistent_numerical_type(all_parsers):
assert result.a.dtype == float
+@pytest.mark.xfail(reason="ResourceWarning #35660", strict=False)
def test_warn_if_chunks_have_mismatched_type(all_parsers):
warning_type = None
parser = all_parsers
@@ -1724,7 +1726,7 @@ def test_iteration_open_handle(all_parsers):
with open(path, "w") as f:
f.write("AAA\nBBB\nCCC\nDDD\nEEE\nFFF\nGGG")
- with open(path, "r") as f:
+ with open(path) as f:
for line in f:
if "CCC" in line:
break
@@ -1834,6 +1836,7 @@ def test_raise_on_no_columns(all_parsers, nrows):
parser.read_csv(StringIO(data))
+@td.check_file_leaks
def test_memory_map(all_parsers, csv_dir_path):
mmap_file = os.path.join(csv_dir_path, "test_mmap.csv")
parser = all_parsers
@@ -2125,6 +2128,16 @@ def test_first_row_bom(all_parsers):
tm.assert_frame_equal(result, expected)
+def test_first_row_bom_unquoted(all_parsers):
+ # see gh-36343
+ parser = all_parsers
+ data = """\ufeffHead1 Head2 Head3"""
+
+ result = parser.read_csv(StringIO(data), delimiter="\t")
+ expected = DataFrame(columns=["Head1", "Head2", "Head3"])
+ tm.assert_frame_equal(result, expected)
+
+
def test_integer_precision(all_parsers):
# Gh 7072
s = """1,1;0;0;0;1;1;3844;3844;3844;1;1;1;1;1;1;0;0;1;1;0;0,,,4321583677327450765
@@ -2179,3 +2192,13 @@ def test_read_csv_names_not_accepting_sets(all_parsers):
parser = all_parsers
with pytest.raises(ValueError, match="Names should be an ordered collection."):
parser.read_csv(StringIO(data), names=set("QAZ"))
+
+
+def test_read_csv_with_use_inf_as_na(all_parsers):
+ # https://github.com/pandas-dev/pandas/issues/35493
+ parser = all_parsers
+ data = "1.0\nNaN\n3.0"
+ with option_context("use_inf_as_na", True):
+ result = parser.read_csv(StringIO(data), header=None)
+ expected = DataFrame([1.0, np.nan, 3.0])
+ tm.assert_frame_equal(result, expected)
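
The regression test above pins read_csv behavior under the use_inf_as_na option; a minimal reproduction:

from io import StringIO
import pandas as pd

with pd.option_context("use_inf_as_na", True):
    df = pd.read_csv(StringIO("1.0\nNaN\n3.0"), header=None)
print(df)  # parses to 1.0, NaN, 3.0 instead of raising
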
diff --git a/pandas/tests/io/parser/test_dtypes.py b/pandas/tests/io/parser/test_dtypes.py
index 6298d1e5498f3..6ac310e3b2227 100644
--- a/pandas/tests/io/parser/test_dtypes.py
+++ b/pandas/tests/io/parser/test_dtypes.py
@@ -561,9 +561,13 @@ def test_boolean_dtype(all_parsers):
"True",
"TRUE",
"true",
+ "1",
+ "1.0",
"False",
"FALSE",
"false",
+ "0",
+ "0.0",
"NaN",
"nan",
"NA",
@@ -576,7 +580,23 @@ def test_boolean_dtype(all_parsers):
expected = pd.DataFrame(
{
"a": pd.array(
- [True, True, True, False, False, False, None, None, None, None, None],
+ [
+ True,
+ True,
+ True,
+ True,
+ True,
+ False,
+ False,
+ False,
+ False,
+ False,
+ None,
+ None,
+ None,
+ None,
+ None,
+ ],
dtype="boolean",
)
}
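
With the expanded expectations above, the nullable boolean parser also maps numeric literals to booleans; a quick sketch, assuming a pandas version with this change:

from io import StringIO
import pandas as pd

data = "a\ntrue\n1\n0.0\nNA\n"
ser = pd.read_csv(StringIO(data), dtype="boolean")["a"]
print(ser.tolist())  # [True, True, False, <NA>]
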
diff --git a/pandas/tests/io/parser/test_encoding.py b/pandas/tests/io/parser/test_encoding.py
index de7b3bed034c7..f23b498c7388a 100644
--- a/pandas/tests/io/parser/test_encoding.py
+++ b/pandas/tests/io/parser/test_encoding.py
@@ -27,7 +27,7 @@ def test_bytes_io_input(all_parsers):
def test_read_csv_unicode(all_parsers):
parser = all_parsers
- data = BytesIO("\u0141aski, Jan;1".encode("utf-8"))
+ data = BytesIO("\u0141aski, Jan;1".encode())
result = parser.read_csv(data, sep=";", encoding="utf-8", header=None)
expected = DataFrame([["\u0141aski, Jan", 1]])
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index 509ae89909699..b8b03cbd14a1d 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -46,6 +46,21 @@ def check_compressed_urls(salaries_table, compression, extension, mode, engine):
tm.assert_frame_equal(url_table, salaries_table)
+@tm.network("https://raw.githubusercontent.com/", check_before_test=True)
+def test_url_encoding_csv():
+ """
+ read_csv should honor the requested encoding for URLs.
+
+ GH 10424
+ """
+ path = (
+ "https://raw.githubusercontent.com/pandas-dev/pandas/master/"
+ + "pandas/tests/io/parser/data/unicode_series.csv"
+ )
+ df = read_csv(path, encoding="latin-1", header=None)
+ assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)"
+
+
@pytest.fixture
def tips_df(datapath):
"""DataFrame with the tips dataset."""
@@ -56,50 +71,62 @@ def tips_df(datapath):
@td.skip_if_not_us_locale()
class TestS3:
@td.skip_if_no("s3fs")
- def test_parse_public_s3_bucket(self, tips_df):
+ def test_parse_public_s3_bucket(self, tips_df, s3so):
# more of an integration test due to the not-public contents portion
# can probably mock this though.
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
- df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp)
+ df = read_csv(
+ "s3://pandas-test/tips.csv" + ext,
+ compression=comp,
+ storage_options=s3so,
+ )
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
# Read public file from bucket with not-public contents
- df = read_csv("s3://cant_get_it/tips.csv")
+ df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
- def test_parse_public_s3n_bucket(self, tips_df):
+ def test_parse_public_s3n_bucket(self, tips_df, s3so):
# Read from AWS s3 as "s3n" URL
- df = read_csv("s3n://pandas-test/tips.csv", nrows=10)
+ df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
- def test_parse_public_s3a_bucket(self, tips_df):
+ def test_parse_public_s3a_bucket(self, tips_df, s3so):
# Read from AWS s3 as "s3a" URL
- df = read_csv("s3a://pandas-test/tips.csv", nrows=10)
+ df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
- def test_parse_public_s3_bucket_nrows(self, tips_df):
+ def test_parse_public_s3_bucket_nrows(self, tips_df, s3so):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
- df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp)
+ df = read_csv(
+ "s3://pandas-test/tips.csv" + ext,
+ nrows=10,
+ compression=comp,
+ storage_options=s3so,
+ )
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
- def test_parse_public_s3_bucket_chunked(self, tips_df):
+ def test_parse_public_s3_bucket_chunked(self, tips_df, s3so):
# Read with a chunksize
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df_reader = read_csv(
- "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp
+ "s3://pandas-test/tips.csv" + ext,
+ chunksize=chunksize,
+ compression=comp,
+ storage_options=s3so,
)
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
@@ -111,7 +138,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df):
true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
tm.assert_frame_equal(true_df, df)
- def test_parse_public_s3_bucket_chunked_python(self, tips_df):
+ def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so):
# Read with a chunksize using the Python parser
chunksize = 5
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
@@ -120,6 +147,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df):
chunksize=chunksize,
compression=comp,
engine="python",
+ storage_options=s3so,
)
assert df_reader.chunksize == chunksize
for i_chunk in [0, 1, 2]:
@@ -130,46 +158,53 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df):
true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
tm.assert_frame_equal(true_df, df)
- def test_parse_public_s3_bucket_python(self, tips_df):
+ def test_parse_public_s3_bucket_python(self, tips_df, s3so):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
- "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp
+ "s3://pandas-test/tips.csv" + ext,
+ engine="python",
+ compression=comp,
+ storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
- def test_infer_s3_compression(self, tips_df):
+ def test_infer_s3_compression(self, tips_df, s3so):
for ext in ["", ".gz", ".bz2"]:
df = read_csv(
- "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer"
+ "s3://pandas-test/tips.csv" + ext,
+ engine="python",
+ compression="infer",
+ storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(df, tips_df)
- def test_parse_public_s3_bucket_nrows_python(self, tips_df):
+ def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so):
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
df = read_csv(
"s3://pandas-test/tips.csv" + ext,
engine="python",
nrows=10,
compression=comp,
+ storage_options=s3so,
)
assert isinstance(df, DataFrame)
assert not df.empty
tm.assert_frame_equal(tips_df.iloc[:10], df)
- def test_read_s3_fails(self):
+ def test_read_s3_fails(self, s3so):
with pytest.raises(IOError):
- read_csv("s3://nyqpug/asdf.csv")
+ read_csv("s3://nyqpug/asdf.csv", storage_options=s3so)
# Receive a permission error when trying to read a private bucket.
# It's irrelevant here that this isn't actually a table.
with pytest.raises(IOError):
read_csv("s3://cant_get_it/file.csv")
- def test_write_s3_csv_fails(self, tips_df):
+ def test_write_s3_csv_fails(self, tips_df, s3so):
# GH 32486
# Attempting to write to an invalid S3 path should raise
import botocore
@@ -180,10 +215,12 @@ def test_write_s3_csv_fails(self, tips_df):
error = (FileNotFoundError, botocore.exceptions.ClientError)
with pytest.raises(error, match="The specified bucket does not exist"):
- tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv")
+ tips_df.to_csv(
+ "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so
+ )
@td.skip_if_no("pyarrow")
- def test_write_s3_parquet_fails(self, tips_df):
+ def test_write_s3_parquet_fails(self, tips_df, s3so):
# GH 27679
# Attempting to write to an invalid S3 path should raise
import botocore
@@ -194,7 +231,10 @@ def test_write_s3_parquet_fails(self, tips_df):
error = (FileNotFoundError, botocore.exceptions.ClientError)
with pytest.raises(error, match="The specified bucket does not exist"):
- tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet")
+ tips_df.to_parquet(
+ "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet",
+ storage_options=s3so,
+ )
def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
# see gh-16135
@@ -210,7 +250,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file):
expected = read_csv(tips_file)
tm.assert_frame_equal(result, expected)
- def test_read_csv_chunked_download(self, s3_resource, caplog):
+ def test_read_csv_chunked_download(self, s3_resource, caplog, s3so):
# 8 MB, S3FS uses 5MB chunks
import s3fs
@@ -230,18 +270,20 @@ def test_read_csv_chunked_download(self, s3_resource, caplog):
s3fs.S3FileSystem.clear_instance_cache()
with caplog.at_level(logging.DEBUG, logger="s3fs"):
- read_csv("s3://pandas-test/large-file.csv", nrows=5)
+ read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so)
# log of fetch_range (start, stop)
assert (0, 5505024) in (x.args[-2:] for x in caplog.records)
- def test_read_s3_with_hash_in_key(self, tips_df):
+ def test_read_s3_with_hash_in_key(self, tips_df, s3so):
# GH 25945
- result = read_csv("s3://pandas-test/tips#1.csv")
+ result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so)
tm.assert_frame_equal(tips_df, result)
@td.skip_if_no("pyarrow")
- def test_read_feather_s3_file_path(self, feather_file):
+ def test_read_feather_s3_file_path(self, feather_file, s3so):
# GH 29055
expected = read_feather(feather_file)
- res = read_feather("s3://pandas-test/simple_dataset.feather")
+ res = read_feather(
+ "s3://pandas-test/simple_dataset.feather", storage_options=s3so
+ )
tm.assert_frame_equal(expected, res)
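
Every S3 test above now threads the s3so fixture through storage_options, which forwards backend configuration to the underlying fsspec filesystem. An illustrative sketch only; the bucket below is hypothetical and the call needs network access plus s3fs installed:

import pandas as pd

df = pd.read_csv(
    "s3://some-public-bucket/data.csv",  # hypothetical path
    storage_options={"anon": True},  # s3fs option for anonymous access
)
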
diff --git a/pandas/tests/io/parser/test_parse_dates.py b/pandas/tests/io/parser/test_parse_dates.py
index ed947755e3419..662659982c0b3 100644
--- a/pandas/tests/io/parser/test_parse_dates.py
+++ b/pandas/tests/io/parser/test_parse_dates.py
@@ -370,7 +370,11 @@ def test_date_col_as_index_col(all_parsers):
tm.assert_frame_equal(result, expected)
-def test_multiple_date_cols_int_cast(all_parsers):
+@pytest.mark.parametrize(
+ "date_parser, warning",
+ ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
+)
+def test_multiple_date_cols_int_cast(all_parsers, date_parser, warning):
data = (
"KORD,19990127, 19:00:00, 18:56:00, 0.8100\n"
"KORD,19990127, 20:00:00, 19:56:00, 0.0100\n"
@@ -382,13 +386,15 @@ def test_multiple_date_cols_int_cast(all_parsers):
parse_dates = {"actual": [1, 2], "nominal": [1, 3]}
parser = all_parsers
- result = parser.read_csv(
- StringIO(data),
- header=None,
- date_parser=conv.parse_date_time,
- parse_dates=parse_dates,
- prefix="X",
- )
+ with tm.assert_produces_warning(warning, check_stacklevel=False):
+ result = parser.read_csv(
+ StringIO(data),
+ header=None,
+ date_parser=date_parser,
+ parse_dates=parse_dates,
+ prefix="X",
+ )
+
expected = DataFrame(
[
[datetime(1999, 1, 27, 19, 0), datetime(1999, 1, 27, 18, 56), "KORD", 0.81],
@@ -808,7 +814,9 @@ def test_parse_dates_custom_euro_format(all_parsers, kwargs):
tm.assert_frame_equal(df, expected)
else:
msg = "got an unexpected keyword argument 'day_first'"
- with pytest.raises(TypeError, match=msg):
+ with pytest.raises(TypeError, match=msg), tm.assert_produces_warning(
+ FutureWarning
+ ):
parser.read_csv(
StringIO(data),
names=["time", "Q", "NTU"],
@@ -1166,7 +1174,11 @@ def test_parse_dates_no_convert_thousands(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)
-def test_parse_date_time_multi_level_column_name(all_parsers):
+@pytest.mark.parametrize(
+ "date_parser, warning",
+ ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
+)
+def test_parse_date_time_multi_level_column_name(all_parsers, date_parser, warning):
data = """\
D,T,A,B
date, time,a,b
@@ -1174,12 +1186,13 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
2001-01-06, 00:00:00, 1.0, 11.
"""
parser = all_parsers
- result = parser.read_csv(
- StringIO(data),
- header=[0, 1],
- parse_dates={"date_time": [0, 1]},
- date_parser=conv.parse_date_time,
- )
+ with tm.assert_produces_warning(warning, check_stacklevel=False):
+ result = parser.read_csv(
+ StringIO(data),
+ header=[0, 1],
+ parse_dates={"date_time": [0, 1]},
+ date_parser=date_parser,
+ )
expected_data = [
[datetime(2001, 1, 5, 9, 0, 0), 0.0, 10.0],
@@ -1189,6 +1202,10 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
tm.assert_frame_equal(result, expected)
+@pytest.mark.parametrize(
+ "date_parser, warning",
+ ([conv.parse_date_time, FutureWarning], [pd.to_datetime, None]),
+)
@pytest.mark.parametrize(
"data,kwargs,expected",
[
@@ -1261,9 +1278,10 @@ def test_parse_date_time_multi_level_column_name(all_parsers):
),
],
)
-def test_parse_date_time(all_parsers, data, kwargs, expected):
+def test_parse_date_time(all_parsers, data, kwargs, expected, date_parser, warning):
parser = all_parsers
- result = parser.read_csv(StringIO(data), date_parser=conv.parse_date_time, **kwargs)
+ with tm.assert_produces_warning(warning, check_stacklevel=False):
+ result = parser.read_csv(StringIO(data), date_parser=date_parser, **kwargs)
# Python can sometimes be flaky about how
# the aggregated columns are entered, so
@@ -1272,15 +1290,20 @@ def test_parse_date_time(all_parsers, data, kwargs, expected):
tm.assert_frame_equal(result, expected)
-def test_parse_date_fields(all_parsers):
+@pytest.mark.parametrize(
+ "date_parser, warning",
+ ([conv.parse_date_fields, FutureWarning], [pd.to_datetime, None]),
+)
+def test_parse_date_fields(all_parsers, date_parser, warning):
parser = all_parsers
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
- result = parser.read_csv(
- StringIO(data),
- header=0,
- parse_dates={"ymd": [0, 1, 2]},
- date_parser=conv.parse_date_fields,
- )
+ with tm.assert_produces_warning(warning, check_stacklevel=False):
+ result = parser.read_csv(
+ StringIO(data),
+ header=0,
+ parse_dates={"ymd": [0, 1, 2]},
+ date_parser=date_parser,
+ )
expected = DataFrame(
[[datetime(2001, 1, 10), 10.0], [datetime(2001, 2, 1), 11.0]],
@@ -1289,19 +1312,27 @@ def test_parse_date_fields(all_parsers):
tm.assert_frame_equal(result, expected)
-def test_parse_date_all_fields(all_parsers):
+@pytest.mark.parametrize(
+ "date_parser, warning",
+ (
+ [conv.parse_all_fields, FutureWarning],
+ [lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S"), None],
+ ),
+)
+def test_parse_date_all_fields(all_parsers, date_parser, warning):
parser = all_parsers
data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0,0.0,10.
2001,01,5,10,0,00,1.,11.
"""
- result = parser.read_csv(
- StringIO(data),
- header=0,
- date_parser=conv.parse_all_fields,
- parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
- )
+ with tm.assert_produces_warning(warning, check_stacklevel=False):
+ result = parser.read_csv(
+ StringIO(data),
+ header=0,
+ date_parser=date_parser,
+ parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
+ )
expected = DataFrame(
[
[datetime(2001, 1, 5, 10, 0, 0), 0.0, 10.0],
@@ -1312,19 +1343,27 @@ def test_parse_date_all_fields(all_parsers):
tm.assert_frame_equal(result, expected)
-def test_datetime_fractional_seconds(all_parsers):
+@pytest.mark.parametrize(
+ "date_parser, warning",
+ (
+ [conv.parse_all_fields, FutureWarning],
+ [lambda x: pd.to_datetime(x, format="%Y %m %d %H %M %S.%f"), None],
+ ),
+)
+def test_datetime_fractional_seconds(all_parsers, date_parser, warning):
parser = all_parsers
data = """\
year,month,day,hour,minute,second,a,b
2001,01,05,10,00,0.123456,0.0,10.
2001,01,5,10,0,0.500000,1.,11.
"""
- result = parser.read_csv(
- StringIO(data),
- header=0,
- date_parser=conv.parse_all_fields,
- parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
- )
+ with tm.assert_produces_warning(warning, check_stacklevel=False):
+ result = parser.read_csv(
+ StringIO(data),
+ header=0,
+ date_parser=date_parser,
+ parse_dates={"ymdHMS": [0, 1, 2, 3, 4, 5]},
+ )
expected = DataFrame(
[
[datetime(2001, 1, 5, 10, 0, 0, microsecond=123456), 0.0, 10.0],
@@ -1339,12 +1378,13 @@ def test_generic(all_parsers):
parser = all_parsers
data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
- result = parser.read_csv(
- StringIO(data),
- header=0,
- parse_dates={"ym": [0, 1]},
- date_parser=lambda y, m: date(year=int(y), month=int(m), day=1),
- )
+ with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
+ result = parser.read_csv(
+ StringIO(data),
+ header=0,
+ parse_dates={"ym": [0, 1]},
+ date_parser=lambda y, m: date(year=int(y), month=int(m), day=1),
+ )
expected = DataFrame(
[[date(2001, 1, 1), 10, 10.0], [date(2001, 2, 1), 1, 11.0]],
columns=["ym", "day", "a"],
@@ -1439,7 +1479,7 @@ def test_parse_timezone(all_parsers):
end="2018-01-04 09:05:00",
freq="1min",
tz=pytz.FixedOffset(540),
- ),
+ )
),
freq=None,
)
@@ -1553,5 +1593,5 @@ def test_missing_parse_dates_column_raises(
msg = f"Missing column provided to 'parse_dates': '{missing_cols}'"
with pytest.raises(ValueError, match=msg):
parser.read_csv(
- content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates,
+ content, sep=",", names=names, usecols=usecols, parse_dates=parse_dates
)
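
The parametrizations above pair each deprecated pandas.io.date_converters helper with a warning-free pd.to_datetime equivalent; a sketch of the replacement style:

from io import StringIO
import pandas as pd

data = "year,month,day,a\n2001,01,10,10.\n2001,02,1,11."
df = pd.read_csv(
    StringIO(data),
    header=0,
    parse_dates={"ymd": [0, 1, 2]},
    date_parser=pd.to_datetime,  # no FutureWarning, unlike parse_date_fields
)
print(df.dtypes)  # ymd is datetime64[ns]
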
diff --git a/pandas/tests/io/parser/test_read_fwf.py b/pandas/tests/io/parser/test_read_fwf.py
index e982667f06f31..5087d0e50c9ea 100644
--- a/pandas/tests/io/parser/test_read_fwf.py
+++ b/pandas/tests/io/parser/test_read_fwf.py
@@ -40,7 +40,7 @@ def test_basic():
tm.assert_frame_equal(result, expected)
-def test_colspecs():
+def test_col_specs():
data = """\
A B C D E
201158 360.242940 149.910199 11950.7
@@ -49,8 +49,8 @@ def test_colspecs():
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
- colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
- result = read_fwf(StringIO(data), colspecs=colspecs)
+ col_specs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ result = read_fwf(StringIO(data), col_specs=col_specs)
expected = DataFrame(
[
@@ -74,7 +74,7 @@ def test_widths():
2011 61 413.836124 184.375703 11916.8
2011 62 502.953953 173.237159 12468.3
"""
- result = read_fwf(StringIO(data), widths=[5, 5, 13, 13, 7])
+ result = read_fwf(StringIO(data), col_widths=[5, 5, 13, 13, 7])
expected = DataFrame(
[
@@ -104,8 +104,8 @@ def test_non_space_filler():
201161~~~~413.836124~~~184.375703~~~11916.8
201162~~~~502.953953~~~173.237159~~~12468.3
"""
- colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
- result = read_fwf(StringIO(data), colspecs=colspecs, delimiter="~")
+ col_specs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ result = read_fwf(StringIO(data), col_specs=col_specs, delimiter="~")
expected = DataFrame(
[
@@ -129,10 +129,10 @@ def test_over_specified():
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
- colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ col_specs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
with pytest.raises(ValueError, match="must specify only one of"):
- read_fwf(StringIO(data), colspecs=colspecs, widths=[6, 10, 10, 7])
+ read_fwf(StringIO(data), col_specs=col_specs, col_widths=[6, 10, 10, 7])
def test_under_specified():
@@ -145,7 +145,7 @@ def test_under_specified():
201162 502.953953 173.237159 12468.3
"""
with pytest.raises(ValueError, match="Must specify either"):
- read_fwf(StringIO(data), colspecs=None, widths=None)
+ read_fwf(StringIO(data), col_specs=None, col_widths=None)
def test_read_csv_compat():
@@ -167,20 +167,18 @@ def test_read_csv_compat():
201161 413.836124 184.375703 11916.8
201162 502.953953 173.237159 12468.3
"""
- colspecs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
- result = read_fwf(StringIO(fwf_data), colspecs=colspecs)
+ col_specs = [(0, 4), (4, 8), (8, 20), (21, 33), (34, 43)]
+ result = read_fwf(StringIO(fwf_data), col_specs=col_specs)
tm.assert_frame_equal(result, expected)
def test_bytes_io_input():
- result = read_fwf(
- BytesIO("שלום\nשלום".encode("utf8")), widths=[2, 2], encoding="utf8"
- )
+ result = read_fwf(BytesIO("שלום\nשלום".encode()), col_widths=[2, 2], encoding="utf8")
expected = DataFrame([["של", "ום"]], columns=["של", "ום"])
tm.assert_frame_equal(result, expected)
-def test_fwf_colspecs_is_list_or_tuple():
+def test_fwf_col_specs_is_list_or_tuple():
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
@@ -193,10 +191,10 @@ def test_fwf_colspecs_is_list_or_tuple():
msg = "column specifications must be a list or tuple.+"
with pytest.raises(TypeError, match=msg):
- read_fwf(StringIO(data), colspecs={"a": 1}, delimiter=",")
+ read_fwf(StringIO(data), col_specs={"a": 1}, delimiter=",")
-def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
+def test_fwf_col_specs_is_list_or_tuple_of_two_element_tuples():
data = """index,A,B,C,D
foo,2,3,4,5
bar,7,8,9,10
@@ -213,7 +211,7 @@ def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
@pytest.mark.parametrize(
- "colspecs,exp_data",
+ "col_specs,exp_data",
[
([(0, 3), (3, None)], [[123, 456], [456, 789]]),
([(None, 3), (3, 6)], [[123, 456], [456, 789]]),
@@ -221,7 +219,7 @@ def test_fwf_colspecs_is_list_or_tuple_of_two_element_tuples():
([(None, None), (3, 6)], [[123456, 456], [456789, 789]]),
],
)
-def test_fwf_colspecs_none(colspecs, exp_data):
+def test_fwf_col_specs_none(col_specs, exp_data):
# see gh-7079
data = """\
123456
@@ -229,7 +227,7 @@ def test_fwf_colspecs_none(colspecs, exp_data):
"""
expected = DataFrame(exp_data)
- result = read_fwf(StringIO(data), colspecs=colspecs, header=None)
+ result = read_fwf(StringIO(data), col_specs=col_specs, header=None)
tm.assert_frame_equal(result, expected)
@@ -242,7 +240,7 @@ def test_fwf_colspecs_none(colspecs, exp_data):
(10, [[1, 2], [123, 98]]),
],
)
-def test_fwf_colspecs_infer_nrows(infer_nrows, exp_data):
+def test_fwf_col_specs_infer_nrows(infer_nrows, exp_data):
# see gh-15138
data = """\
1 2
@@ -259,7 +257,7 @@ def test_fwf_regression():
#
# Turns out "T060" is parsable as a datetime slice!
tz_list = [1, 10, 20, 30, 60, 80, 100]
- widths = [16] + [8] * len(tz_list)
+ col_widths = [16] + [8] * len(tz_list)
names = ["SST"] + [f"T{z:03d}" for z in tz_list[1:]]
data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192
@@ -274,7 +272,7 @@ def test_fwf_regression():
index_col=0,
header=None,
names=names,
- widths=widths,
+ col_widths=col_widths,
parse_dates=True,
date_parser=lambda s: datetime.strptime(s, "%Y%j%H%M%S"),
)
@@ -305,7 +303,7 @@ def test_fwf_for_uint8():
1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" # noqa
df = read_fwf(
StringIO(data),
- colspecs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)],
+ col_specs=[(0, 17), (25, 26), (33, 37), (49, 51), (58, 62), (63, 1000)],
names=["time", "pri", "pgn", "dst", "src", "data"],
converters={
"pgn": lambda x: int(x, 16),
@@ -334,10 +332,10 @@ def test_fwf_comment(comment):
"""
data = data.replace("#", comment)
- colspecs = [(0, 3), (4, 9), (9, 25)]
+ col_specs = [(0, 3), (4, 9), (9, 25)]
expected = DataFrame([[1, 2.0, 4], [5, np.nan, 10.0]])
- result = read_fwf(StringIO(data), colspecs=colspecs, header=None, comment=comment)
+ result = read_fwf(StringIO(data), col_specs=col_specs, header=None, comment=comment)
tm.assert_almost_equal(result, expected)
@@ -349,11 +347,11 @@ def test_fwf_thousands(thousands):
"""
data = data.replace(",", thousands)
- colspecs = [(0, 3), (3, 11), (12, 16)]
+ col_specs = [(0, 3), (3, 11), (12, 16)]
expected = DataFrame([[1, 2334.0, 5], [10, 13, 10.0]])
result = read_fwf(
- StringIO(data), header=None, colspecs=colspecs, thousands=thousands
+ StringIO(data), header=None, col_specs=col_specs, thousands=thousands
)
tm.assert_almost_equal(result, expected)
@@ -383,8 +381,8 @@ def test_full_file():
2000-01-07T00:00:00 0.487094399463 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
2000-01-11T00:00:00 0.157160753327 34 foo"""
- colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
- expected = read_fwf(StringIO(test), colspecs=colspecs)
+ col_specs = ((0, 19), (21, 35), (38, 40), (42, 45))
+ expected = read_fwf(StringIO(test), col_specs=col_specs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
@@ -400,8 +398,8 @@ def test_full_file_with_missing():
2000-01-07T00:00:00 0 bar
2000-01-10T00:00:00 0.836648671666 2 baz
34"""
- colspecs = ((0, 19), (21, 35), (38, 40), (42, 45))
- expected = read_fwf(StringIO(test), colspecs=colspecs)
+ col_specs = ((0, 19), (21, 35), (38, 40), (42, 45))
+ expected = read_fwf(StringIO(test), col_specs=col_specs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
@@ -419,8 +417,8 @@ def test_full_file_with_spaces():
""".strip(
"\r\n"
)
- colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
- expected = read_fwf(StringIO(test), colspecs=colspecs)
+ col_specs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+ expected = read_fwf(StringIO(test), col_specs=col_specs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
@@ -438,8 +436,8 @@ def test_full_file_with_spaces_and_missing():
""".strip(
"\r\n"
)
- colspecs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
- expected = read_fwf(StringIO(test), colspecs=colspecs)
+ col_specs = ((0, 7), (8, 28), (30, 38), (42, 53), (56, 70))
+ expected = read_fwf(StringIO(test), col_specs=col_specs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
@@ -457,8 +455,8 @@ def test_messed_up_data():
""".strip(
"\r\n"
)
- colspecs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
- expected = read_fwf(StringIO(test), colspecs=colspecs)
+ col_specs = ((2, 10), (15, 33), (37, 45), (49, 61), (64, 79))
+ expected = read_fwf(StringIO(test), col_specs=col_specs)
result = read_fwf(StringIO(test))
tm.assert_frame_equal(result, expected)
@@ -476,8 +474,8 @@ def test_multiple_delimiters():
"\r\n"
)
delimiter = " +~.\\"
- colspecs = ((0, 4), (7, 13), (15, 19), (21, 41))
- expected = read_fwf(StringIO(test), colspecs=colspecs, delimiter=delimiter)
+ col_specs = ((0, 4), (7, 13), (15, 19), (21, 41))
+ expected = read_fwf(StringIO(test), col_specs=col_specs, delimiter=delimiter)
result = read_fwf(StringIO(test), delimiter=delimiter)
tm.assert_frame_equal(result, expected)
@@ -495,7 +493,7 @@ def test_variable_width_unicode():
kwargs = dict(header=None, encoding=encoding)
expected = read_fwf(
- BytesIO(data.encode(encoding)), colspecs=[(0, 4), (5, 9)], **kwargs
+ BytesIO(data.encode(encoding)), col_specs=[(0, 4), (5, 9)], **kwargs
)
result = read_fwf(BytesIO(data.encode(encoding)), **kwargs)
tm.assert_frame_equal(result, expected)
@@ -507,8 +505,8 @@ def test_dtype(dtype):
1 2 3.2
3 4 5.2
"""
- colspecs = [(0, 5), (5, 10), (10, None)]
- result = read_fwf(StringIO(data), colspecs=colspecs, dtype=dtype)
+ col_specs = [(0, 5), (5, 10), (10, None)]
+ result = read_fwf(StringIO(data), col_specs=col_specs, dtype=dtype)
expected = pd.DataFrame(
{"a": [1, 3], "b": [2, 4], "c": [3.2, 5.2]}, columns=["a", "b", "c"]
@@ -574,7 +572,11 @@ def test_whitespace_preservation():
a bbb
ccdd """
result = read_fwf(
- StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0], delimiter="\n\t"
+ StringIO(fwf_data),
+ col_widths=[3, 3],
+ header=header,
+ skiprows=[0],
+ delimiter="\n\t",
)
expected = read_csv(StringIO(csv_data), header=header)
tm.assert_frame_equal(result, expected)
@@ -589,7 +591,9 @@ def test_default_delimiter():
fwf_data = """
a \tbbb
cc\tdd """
- result = read_fwf(StringIO(fwf_data), widths=[3, 3], header=header, skiprows=[0])
+ result = read_fwf(
+ StringIO(fwf_data), col_widths=[3, 3], header=header, skiprows=[0]
+ )
expected = read_csv(StringIO(csv_data), header=header)
tm.assert_frame_equal(result, expected)
@@ -603,7 +607,7 @@ def test_fwf_compression(compression_only, infer):
compression = compression_only
extension = "gz" if compression == "gzip" else compression
- kwargs = dict(widths=[5, 5], names=["one", "two"])
+ kwargs = dict(col_widths=[5, 5], names=["one", "two"])
expected = read_fwf(StringIO(data), **kwargs)
data = bytes(data, encoding="utf-8")
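
The hunks above track the rename of read_fwf's keywords from colspecs/widths to col_specs/col_widths. A minimal sketch of the renamed call, assuming the rename lands exactly as these tests exercise it:

    from io import StringIO

    import pandas as pd

    data = "id8141  360.242940\nid1594  444.953632\n"
    # col_specs: half-open [start, stop) character spans, one per field
    df = pd.read_fwf(StringIO(data), col_specs=[(0, 6), (8, 20)], header=None)
    # col_widths: the shorthand; pandas derives the spans from field widths
    df = pd.read_fwf(StringIO(data), col_widths=[8, 12], header=None)
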
diff --git a/pandas/tests/io/parser/test_usecols.py b/pandas/tests/io/parser/test_usecols.py
index d4e049cc3fcc2..7e9c9866a666d 100644
--- a/pandas/tests/io/parser/test_usecols.py
+++ b/pandas/tests/io/parser/test_usecols.py
@@ -199,7 +199,7 @@ def test_usecols_with_whitespace(all_parsers):
# Column selection by index.
([0, 1], DataFrame(data=[[1000, 2000], [4000, 5000]], columns=["2", "0"])),
# Column selection by name.
- (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"]),),
+ (["0", "1"], DataFrame(data=[[2000, 3000], [5000, 6000]], columns=["0", "1"])),
],
)
def test_usecols_with_integer_like_header(all_parsers, usecols, expected):
diff --git a/pandas/tests/io/pytables/common.py b/pandas/tests/io/pytables/common.py
index aad18890de3ad..7e7a76e287d32 100644
--- a/pandas/tests/io/pytables/common.py
+++ b/pandas/tests/io/pytables/common.py
@@ -25,7 +25,7 @@ def safe_close(store):
try:
if store is not None:
store.close()
- except IOError:
+ except OSError:
pass
diff --git a/pandas/tests/io/pytables/test_store.py b/pandas/tests/io/pytables/test_store.py
index c69992471fc9b..0942c79837e7c 100644
--- a/pandas/tests/io/pytables/test_store.py
+++ b/pandas/tests/io/pytables/test_store.py
@@ -1727,6 +1727,37 @@ def col(t, column):
with pytest.raises(TypeError):
store.create_table_index("f2")
+ def test_create_table_index_data_columns_argument(self, setup_path):
+ # GH 28156
+
+ with ensure_clean_store(setup_path) as store:
+
+ with catch_warnings(record=True):
+
+ def col(t, column):
+ return getattr(store.get_storer(t).table.cols, column)
+
+ # data columns
+ df = tm.makeTimeDataFrame()
+ df["string"] = "foo"
+ df["string2"] = "bar"
+ store.append("f", df, data_columns=["string"])
+ assert col("f", "index").is_indexed is True
+ assert col("f", "string").is_indexed is True
+
+ msg = "'Cols' object has no attribute 'string2'"
+ with pytest.raises(AttributeError, match=msg):
+ col("f", "string2").is_indexed
+
+ # try to index a col which isn't a data_column
+ msg = (
+ "column string2 is not a data_column.\n"
+ "In order to read column string2 you must reload the dataframe \n"
+ "into HDFStore and include string2 with the data_columns argument."
+ )
+ with pytest.raises(AttributeError, match=msg):
+ store.create_table_index("f", columns=["string2"])
+
def test_append_hierarchical(self, setup_path):
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
diff --git a/pandas/tests/io/pytables/test_timezones.py b/pandas/tests/io/pytables/test_timezones.py
index 38d32b0bdc8a3..1c29928991cde 100644
--- a/pandas/tests/io/pytables/test_timezones.py
+++ b/pandas/tests/io/pytables/test_timezones.py
@@ -110,7 +110,7 @@ def test_append_with_timezones_dateutil(setup_path):
dti = dti._with_freq(None) # freq doesn't round-trip
# GH 4098 example
- df = DataFrame(dict(A=Series(range(3), index=dti,)))
+ df = DataFrame(dict(A=Series(range(3), index=dti)))
_maybe_remove(store, "df")
store.put("df", df)
@@ -197,7 +197,7 @@ def test_append_with_timezones_pytz(setup_path):
dti = dti._with_freq(None) # freq doesn't round-trip
# GH 4098 example
- df = DataFrame(dict(A=Series(range(3), index=dti,)))
+ df = DataFrame(dict(A=Series(range(3), index=dti)))
_maybe_remove(store, "df")
store.put("df", df)
diff --git a/pandas/tests/io/sas/data/corrupt.sas7bdat b/pandas/tests/io/sas/data/corrupt.sas7bdat
new file mode 100644
index 0000000000000..2941ffe3ecdf5
Binary files /dev/null and b/pandas/tests/io/sas/data/corrupt.sas7bdat differ
diff --git a/pandas/tests/io/sas/data/datetime.csv b/pandas/tests/io/sas/data/datetime.csv
index 6126f6d04eaf0..f0d82f7fc494e 100644
--- a/pandas/tests/io/sas/data/datetime.csv
+++ b/pandas/tests/io/sas/data/datetime.csv
@@ -1,5 +1,5 @@
Date1,Date2,DateTime,DateTimeHi,Taiw
-1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145226,1912-01-01
+1677-09-22,1677-09-22,1677-09-21 00:12:44,1677-09-21 00:12:43.145225,1912-01-01
1960-01-01,1960-01-01,1960-01-01 00:00:00,1960-01-01 00:00:00.000000,1960-01-01
2016-02-29,2016-02-29,2016-02-29 23:59:59,2016-02-29 23:59:59.123456,2016-02-29
-2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854774,2262-04-11
+2262-04-11,2262-04-11,2262-04-11 23:47:16,2262-04-11 23:47:16.854775,2262-04-11
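
The one-microsecond changes line the first and last rows of this fixture up with the nanosecond bounds of pandas Timestamps:

    import pandas as pd

    # nanosecond-resolution range representable by pd.Timestamp
    print(pd.Timestamp.min)  # 1677-09-21 00:12:43.145224193
    print(pd.Timestamp.max)  # 2262-04-11 23:47:16.854775807
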
diff --git a/pandas/tests/io/sas/test_sas7bdat.py b/pandas/tests/io/sas/test_sas7bdat.py
index 8c14f9de9f61c..9de6ca75fd4d9 100644
--- a/pandas/tests/io/sas/test_sas7bdat.py
+++ b/pandas/tests/io/sas/test_sas7bdat.py
@@ -217,6 +217,14 @@ def test_zero_variables(datapath):
pd.read_sas(fname)
+def test_corrupt_read(datapath):
+ # We don't really care about the exact failure, the important thing is
+ # that the resource should be cleaned up afterwards (BUG #35566)
+ fname = datapath("io", "sas", "data", "corrupt.sas7bdat")
+ with pytest.raises(AttributeError):
+ pd.read_sas(fname)
+
+
def round_datetime_to_ms(ts):
if isinstance(ts, datetime):
return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000)
diff --git a/pandas/tests/io/sas/test_xport.py b/pandas/tests/io/sas/test_xport.py
index 2682bafedb8f1..939edb3d8e0b4 100644
--- a/pandas/tests/io/sas/test_xport.py
+++ b/pandas/tests/io/sas/test_xport.py
@@ -3,6 +3,8 @@
import numpy as np
import pytest
+import pandas.util._test_decorators as td
+
import pandas as pd
import pandas._testing as tm
@@ -26,10 +28,12 @@ def setup_method(self, datapath):
self.dirpath = datapath("io", "sas", "data")
self.file01 = os.path.join(self.dirpath, "DEMO_G.xpt")
self.file02 = os.path.join(self.dirpath, "SSHSV1_A.xpt")
- self.file02b = open(os.path.join(self.dirpath, "SSHSV1_A.xpt"), "rb")
self.file03 = os.path.join(self.dirpath, "DRXFCD_G.xpt")
self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt")
+ with td.file_leak_context():
+ yield
+
def test1_basic(self):
# Tests with DEMO_G.xpt (all numeric file)
@@ -127,7 +131,12 @@ def test2_binary(self):
data_csv = pd.read_csv(self.file02.replace(".xpt", ".csv"))
numeric_as_float(data_csv)
- data = read_sas(self.file02b, format="xport")
+ with open(self.file02, "rb") as fd:
+ with td.file_leak_context():
+ # GH#35693 ensure that if we pass an open file, we
+ # don't incorrectly close it in read_sas
+ data = read_sas(fd, format="xport")
+
tm.assert_frame_equal(data, data_csv)
def test_multiple_types(self):
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index e2f4ae04c1f9f..ede8d61490778 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -87,7 +87,17 @@ def test_stringify_path_fspath(self):
@pytest.mark.parametrize(
"extension,expected",
- [("", None), (".gz", "gzip"), (".bz2", "bz2"), (".zip", "zip"), (".xz", "xz")],
+ [
+ ("", None),
+ (".gz", "gzip"),
+ (".bz2", "bz2"),
+ (".zip", "zip"),
+ (".xz", "xz"),
+ (".GZ", "gzip"),
+ (".BZ2", "bz2"),
+ (".ZIP", "zip"),
+ (".XZ", "xz"),
+ ],
)
@pytest.mark.parametrize("path_type", path_types)
def test_infer_compression_from_path(self, extension, expected, path_type):
@@ -95,21 +105,21 @@ def test_infer_compression_from_path(self, extension, expected, path_type):
compression = icom.infer_compression(path, compression="infer")
assert compression == expected
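
The uppercase extensions extend the parametrization to assert that compression inference is case-insensitive. Sketch of the helper under test:

    from pandas.io.common import infer_compression

    assert infer_compression("data.csv.GZ", compression="infer") == "gzip"
    assert infer_compression("data.csv.bz2", compression="infer") == "bz2"
    assert infer_compression("data.csv", compression="infer") is None
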
- def test_get_filepath_or_buffer_with_path(self):
- filename = "~/sometest"
- filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(filename)
- assert filepath_or_buffer != filename
- assert os.path.isabs(filepath_or_buffer)
- assert os.path.expanduser(filename) == filepath_or_buffer
- assert not should_close
+ @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path])
+ def test_get_filepath_or_buffer_with_path(self, path_type):
+ # ignore LocalPath: it creates strange paths: /absolute/~/sometest
+ filename = path_type("~/sometest")
+ ioargs = icom.get_filepath_or_buffer(filename)
+ assert ioargs.filepath_or_buffer != filename
+ assert os.path.isabs(ioargs.filepath_or_buffer)
+ assert os.path.expanduser(filename) == ioargs.filepath_or_buffer
+ assert not ioargs.should_close
def test_get_filepath_or_buffer_with_buffer(self):
input_buffer = StringIO()
- filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(
- input_buffer
- )
- assert filepath_or_buffer == input_buffer
- assert not should_close
+ ioargs = icom.get_filepath_or_buffer(input_buffer)
+ assert ioargs.filepath_or_buffer == input_buffer
+ assert not ioargs.should_close
def test_iterator(self):
reader = pd.read_csv(StringIO(self.data1), chunksize=1)
@@ -329,7 +339,7 @@ def test_constructor_bad_file(self, mmap_file):
with pytest.raises(err, match=msg):
icom._MMapWrapper(non_file)
- target = open(mmap_file, "r")
+ target = open(mmap_file)
target.close()
msg = "I/O operation on closed file"
@@ -337,7 +347,7 @@ def test_constructor_bad_file(self, mmap_file):
icom._MMapWrapper(target)
def test_get_attr(self, mmap_file):
- with open(mmap_file, "r") as target:
+ with open(mmap_file) as target:
wrapper = icom._MMapWrapper(target)
attrs = dir(wrapper.mmap)
@@ -350,7 +360,7 @@ def test_get_attr(self, mmap_file):
assert not hasattr(wrapper, "foo")
def test_next(self, mmap_file):
- with open(mmap_file, "r") as target:
+ with open(mmap_file) as target:
wrapper = icom._MMapWrapper(target)
lines = target.readlines()
@@ -368,6 +378,36 @@ def test_unknown_engine(self):
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")
+ def test_binary_mode(self):
+ """
+ 'encoding' shouldn't be passed to 'open' in binary mode.
+
+ GH 35058
+ """
+ with tm.ensure_clean() as path:
+ df = tm.makeDataFrame()
+ df.to_csv(path, mode="w+b")
+ tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
+
+ @pytest.mark.parametrize("encoding", ["utf-16", "utf-32"])
+ @pytest.mark.parametrize("compression_", ["bz2", "xz"])
+ def test_warning_missing_utf_bom(self, encoding, compression_):
+ """
+ bz2 and xz do not write the byte order mark (BOM) for utf-16/32.
+
+ https://stackoverflow.com/questions/55171439
+
+ GH 35681
+ """
+ df = tm.makeDataFrame()
+ with tm.ensure_clean() as path:
+ with tm.assert_produces_warning(UnicodeWarning):
+ df.to_csv(path, compression=compression_, encoding=encoding)
+
+ # reading should fail (otherwise we wouldn't need the warning)
+ with pytest.raises(Exception):
+ pd.read_csv(path, compression=compression_, encoding=encoding)
+
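
test_binary_mode covers the case where to_csv receives a binary target and must not forward 'encoding' to open(); pandas encodes the text itself. The same round trip, sketched in memory:

    import io

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    buf = io.BytesIO()
    df.to_csv(buf, mode="wb")  # binary handle: pandas handles the encoding
    buf.seek(0)
    assert pd.read_csv(buf, index_col=0).equals(df)
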
def test_is_fsspec_url():
assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py
index 59c9bd0a36d3d..31e9ad4cf4416 100644
--- a/pandas/tests/io/test_compression.py
+++ b/pandas/tests/io/test_compression.py
@@ -1,7 +1,10 @@
+import io
import os
+from pathlib import Path
import subprocess
import sys
import textwrap
+import time
import pytest
@@ -114,6 +117,72 @@ def test_compression_warning(compression_only):
df.to_csv(f, compression=compression_only)
+def test_compression_binary(compression_only):
+ """
+ Binary file handles support compression.
+
+ GH22555
+ """
+ df = tm.makeDataFrame()
+
+ # with a file
+ with tm.ensure_clean() as path:
+ with open(path, mode="wb") as file:
+ df.to_csv(file, mode="wb", compression=compression_only)
+ file.seek(0) # file shouldn't be closed
+ tm.assert_frame_equal(
+ df, pd.read_csv(path, index_col=0, compression=compression_only)
+ )
+
+ # with BytesIO
+ file = io.BytesIO()
+ df.to_csv(file, mode="wb", compression=compression_only)
+ file.seek(0) # file shouldn't be closed
+ tm.assert_frame_equal(
+ df, pd.read_csv(file, index_col=0, compression=compression_only)
+ )
+
+
+def test_gzip_reproducibility_file_name():
+ """
+ Gzip should create reproducible archives when mtime is pinned.
+
+ Note: Archives created with different filenames will still be different!
+
+ GH 28103
+ """
+ df = tm.makeDataFrame()
+ compression_options = {"method": "gzip", "mtime": 1}
+
+ # test for filename
+ with tm.ensure_clean() as path:
+ path = Path(path)
+ df.to_csv(path, compression=compression_options)
+ time.sleep(2)
+ output = path.read_bytes()
+ df.to_csv(path, compression=compression_options)
+ assert output == path.read_bytes()
+
+
+def test_gzip_reproducibility_file_object():
+ """
+ Gzip should create reproducible archives when mtime is pinned.
+
+ GH 28103
+ """
+ df = tm.makeDataFrame()
+ compression_options = {"method": "gzip", "mtime": 1}
+
+ # test for file object
+ buffer = io.BytesIO()
+ df.to_csv(buffer, compression=compression_options, mode="wb")
+ output = buffer.getvalue()
+ time.sleep(2)
+ buffer = io.BytesIO()
+ df.to_csv(buffer, compression=compression_options, mode="wb")
+ assert output == buffer.getvalue()
+
+
def test_with_missing_lzma():
"""Tests if import pandas works when lzma is not present."""
# https://github.com/pandas-dev/pandas/issues/27575
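
The two reproducibility tests work because gzip embeds a modification time in its header; pinning mtime makes the output deterministic. The mechanism, sketched with the stdlib:

    import gzip
    import io

    def gzip_bytes(payload: bytes, mtime: int) -> bytes:
        buf = io.BytesIO()
        # a pinned mtime makes the archive byte-for-byte reproducible
        with gzip.GzipFile(fileobj=buf, mode="wb", mtime=mtime) as fh:
            fh.write(payload)
        return buf.getvalue()

    assert gzip_bytes(b"payload", mtime=1) == gzip_bytes(b"payload", mtime=1)
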
diff --git a/pandas/tests/io/test_date_converters.py b/pandas/tests/io/test_date_converters.py
index cdb8eca02a3e5..a9fa27e091714 100644
--- a/pandas/tests/io/test_date_converters.py
+++ b/pandas/tests/io/test_date_converters.py
@@ -8,11 +8,12 @@
def test_parse_date_time():
+
dates = np.array(["2007/1/3", "2008/2/4"], dtype=object)
times = np.array(["05:07:09", "06:08:00"], dtype=object)
expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)])
-
- result = conv.parse_date_time(dates, times)
+ with tm.assert_produces_warning(FutureWarning):
+ result = conv.parse_date_time(dates, times)
tm.assert_numpy_array_equal(result, expected)
@@ -20,9 +21,10 @@ def test_parse_date_fields():
days = np.array([3, 4])
months = np.array([1, 2])
years = np.array([2007, 2008])
- result = conv.parse_date_fields(years, months, days)
-
expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)])
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = conv.parse_date_fields(years, months, days)
tm.assert_numpy_array_equal(result, expected)
@@ -34,7 +36,8 @@ def test_parse_all_fields():
days = np.array([3, 4])
years = np.array([2007, 2008])
months = np.array([1, 2])
-
- result = conv.parse_all_fields(years, months, days, hours, minutes, seconds)
expected = np.array([datetime(2007, 1, 3, 5, 7, 9), datetime(2008, 2, 4, 6, 8, 0)])
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = conv.parse_all_fields(years, months, days, hours, minutes, seconds)
tm.assert_numpy_array_equal(result, expected)
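
Each converter call is now wrapped in tm.assert_produces_warning(FutureWarning) because the conv.* helpers are deprecated. The same contract, sketched with a hypothetical stub and plain pytest:

    import warnings

    import pytest

    def parse_date_fields_stub(years, months, days):
        # hypothetical stand-in for the deprecated converter
        warnings.warn("use pd.to_datetime instead", FutureWarning)
        return [f"{y}-{m:02d}-{d:02d}" for y, m, d in zip(years, months, days)]

    def test_stub_warns():
        with pytest.warns(FutureWarning):
            result = parse_date_fields_stub([2007], [1], [3])
        assert result == ["2007-01-03"]
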
diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py
index a8a5c8f00e6bf..c1e63f512b53e 100644
--- a/pandas/tests/io/test_feather.py
+++ b/pandas/tests/io/test_feather.py
@@ -76,7 +76,7 @@ def test_basic(self):
pd.Timestamp("20130103"),
],
"dtns": pd.DatetimeIndex(
- list(pd.date_range("20130101", periods=3, freq="ns")), freq=None,
+ list(pd.date_range("20130101", periods=3, freq="ns")), freq=None
),
}
)
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index c397a61616c1c..666da677d702e 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -1,7 +1,18 @@
+import io
+
import numpy as np
import pytest
-from pandas import DataFrame, date_range, read_csv, read_parquet
+from pandas import (
+ DataFrame,
+ date_range,
+ read_csv,
+ read_feather,
+ read_json,
+ read_parquet,
+ read_pickle,
+ read_stata,
+)
import pandas._testing as tm
from pandas.util import _test_decorators as td
@@ -15,7 +26,8 @@
)
# the ignore on the following line accounts for to_csv returning Optional(str)
# in general, but always str in the case we give no filename
-text = df1.to_csv(index=False).encode() # type: ignore
+# error: Item "None" of "Optional[str]" has no attribute "encode"
+text = df1.to_csv(index=False).encode() # type: ignore[union-attr]
@pytest.fixture
@@ -37,8 +49,8 @@ def test_read_csv(cleared_fs):
def test_reasonable_error(monkeypatch, cleared_fs):
- from fsspec.registry import known_implementations
from fsspec import registry
+ from fsspec.registry import known_implementations
registry.target.clear()
with pytest.raises(ValueError) as e:
@@ -62,6 +74,16 @@ def test_to_csv(cleared_fs):
tm.assert_frame_equal(df1, df2)
+def test_csv_options(fsspectest):
+ df = DataFrame({"a": [0]})
+ df.to_csv(
+ "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False
+ )
+ assert fsspectest.test[0] == "csv_write"
+ read_csv("testmem://test/test.csv", storage_options={"test": "csv_read"})
+ assert fsspectest.test[0] == "csv_read"
+
+
@td.skip_if_no("fastparquet")
def test_to_parquet_new_file(monkeypatch, cleared_fs):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
@@ -70,28 +92,77 @@ def test_to_parquet_new_file(monkeypatch, cleared_fs):
)
+@td.skip_if_no("pyarrow")
+def test_arrowparquet_options(fsspectest):
+ """Regression test for writing to a not-yet-existent GCS Parquet file."""
+ df = DataFrame({"a": [0]})
+ df.to_parquet(
+ "testmem://test/test.csv",
+ engine="pyarrow",
+ compression=None,
+ storage_options={"test": "parquet_write"},
+ )
+ assert fsspectest.test[0] == "parquet_write"
+ read_parquet(
+ "testmem://test/test.csv",
+ engine="pyarrow",
+ storage_options={"test": "parquet_read"},
+ )
+ assert fsspectest.test[0] == "parquet_read"
+
+
+@td.skip_if_no("fastparquet")
+def test_fastparquet_options(fsspectest):
+ """Regression test for writing to a not-yet-existent GCS Parquet file."""
+ df = DataFrame({"a": [0]})
+ df.to_parquet(
+ "testmem://test/test.csv",
+ engine="fastparquet",
+ compression=None,
+ storage_options={"test": "parquet_write"},
+ )
+ assert fsspectest.test[0] == "parquet_write"
+ read_parquet(
+ "testmem://test/test.csv",
+ engine="fastparquet",
+ storage_options={"test": "parquet_read"},
+ )
+ assert fsspectest.test[0] == "parquet_read"
+
+
@td.skip_if_no("s3fs")
-def test_from_s3_csv(s3_resource, tips_file):
- tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file))
+def test_from_s3_csv(s3_resource, tips_file, s3so):
+ tm.assert_equal(
+ read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file)
+ )
# the following are decompressed by pandas, not fsspec
- tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file))
- tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file))
+ tm.assert_equal(
+ read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so),
+ read_csv(tips_file),
+ )
+ tm.assert_equal(
+ read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so),
+ read_csv(tips_file),
+ )
@pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"])
@td.skip_if_no("s3fs")
-def test_s3_protocols(s3_resource, tips_file, protocol):
+def test_s3_protocols(s3_resource, tips_file, protocol, s3so):
tm.assert_equal(
- read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file)
+ read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so),
+ read_csv(tips_file),
)
@td.skip_if_no("s3fs")
@td.skip_if_no("fastparquet")
-def test_s3_parquet(s3_resource):
+def test_s3_parquet(s3_resource, s3so):
fn = "s3://pandas-test/test.parquet"
- df1.to_parquet(fn, index=False, engine="fastparquet", compression=None)
- df2 = read_parquet(fn, engine="fastparquet")
+ df1.to_parquet(
+ fn, index=False, engine="fastparquet", compression=None, storage_options=s3so
+ )
+ df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so)
tm.assert_equal(df1, df2)
@@ -100,3 +171,67 @@ def test_not_present_exception():
with pytest.raises(ImportError) as e:
read_csv("memory://test/test.csv")
assert "fsspec library is required" in str(e.value)
+
+
+@td.skip_if_no("pyarrow")
+def test_feather_options(fsspectest):
+ df = DataFrame({"a": [0]})
+ df.to_feather("testmem://afile", storage_options={"test": "feather_write"})
+ assert fsspectest.test[0] == "feather_write"
+ out = read_feather("testmem://afile", storage_options={"test": "feather_read"})
+ assert fsspectest.test[0] == "feather_read"
+ tm.assert_frame_equal(df, out)
+
+
+def test_pickle_options(fsspectest):
+ df = DataFrame({"a": [0]})
+ df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"})
+ assert fsspectest.test[0] == "pickle_write"
+ out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"})
+ assert fsspectest.test[0] == "pickle_read"
+ tm.assert_frame_equal(df, out)
+
+
+def test_json_options(fsspectest):
+ df = DataFrame({"a": [0]})
+ df.to_json("testmem://afile", storage_options={"test": "json_write"})
+ assert fsspectest.test[0] == "json_write"
+ out = read_json("testmem://afile", storage_options={"test": "json_read"})
+ assert fsspectest.test[0] == "json_read"
+ tm.assert_frame_equal(df, out)
+
+
+def test_stata_options(fsspectest):
+ df = DataFrame({"a": [0]})
+ df.to_stata(
+ "testmem://afile", storage_options={"test": "stata_write"}, write_index=False
+ )
+ assert fsspectest.test[0] == "stata_write"
+ out = read_stata("testmem://afile", storage_options={"test": "stata_read"})
+ assert fsspectest.test[0] == "stata_read"
+ tm.assert_frame_equal(df, out.astype("int64"))
+
+
+@td.skip_if_no("tabulate")
+def test_markdown_options(fsspectest):
+ df = DataFrame({"a": [0]})
+ df.to_markdown("testmem://afile", storage_options={"test": "md_write"})
+ assert fsspectest.test[0] == "md_write"
+ assert fsspectest.cat("afile")
+
+
+@td.skip_if_no("pyarrow")
+def test_non_fsspec_options():
+ with pytest.raises(ValueError, match="storage_options"):
+ read_csv("localfile", storage_options={"a": True})
+ with pytest.raises(ValueError, match="storage_options"):
+ # separate test for parquet, which has a different code path
+ read_parquet("localfile", storage_options={"a": True})
+ by = io.BytesIO()
+
+ with pytest.raises(ValueError, match="storage_options"):
+ read_csv(by, storage_options={"a": True})
+
+ df = DataFrame({"a": [0]})
+ with pytest.raises(ValueError, match="storage_options"):
+ df.to_parquet("nonfsspecpath", storage_options={"a": True})
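
All of these *_options tests check a single mechanism: a storage_options dict given to a pandas reader or writer is forwarded verbatim to the fsspec filesystem behind the URL. The user-facing shape, with placeholder bucket and credentials:

    import pandas as pd

    opts = {"key": "<access-key>", "secret": "<secret-key>"}  # placeholders
    df = pd.read_csv("s3://some-bucket/data.csv", storage_options=opts)
    df.to_parquet("s3://some-bucket/data.parquet", storage_options=opts)
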
diff --git a/pandas/tests/io/test_gbq.py b/pandas/tests/io/test_gbq.py
index 870d78ef1c533..df107259d38cd 100644
--- a/pandas/tests/io/test_gbq.py
+++ b/pandas/tests/io/test_gbq.py
@@ -148,7 +148,6 @@ def mock_read_gbq(sql, **kwargs):
@pytest.mark.single
-@pytest.mark.xfail(reason="skipping gbq integration for now, xref #34779")
class TestToGBQIntegrationWithServiceAccountKeyPath:
@pytest.fixture()
def gbq_dataset(self):
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 4d93119ffa3f5..18b5743a3375a 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -9,9 +9,28 @@
from pandas.util import _test_decorators as td
+@pytest.fixture
+def gcs_buffer(monkeypatch):
+ """Emulate GCS using a binary buffer."""
+ from fsspec import AbstractFileSystem, registry
+
+ registry.target.clear() # noqa # remove state
+
+ gcs_buffer = BytesIO()
+ gcs_buffer.close = lambda: True
+
+ class MockGCSFileSystem(AbstractFileSystem):
+ def open(*args, **kwargs):
+ gcs_buffer.seek(0)
+ return gcs_buffer
+
+ monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
+
+ return gcs_buffer
+
+
@td.skip_if_no("gcsfs")
-def test_read_csv_gcs(monkeypatch):
- from fsspec import AbstractFileSystem
+def test_read_csv_gcs(gcs_buffer):
from fsspec import registry
registry.target.clear() # noqa # remove state
@@ -25,22 +44,19 @@ def test_read_csv_gcs(monkeypatch):
}
)
- class MockGCSFileSystem(AbstractFileSystem):
- def open(*args, **kwargs):
- return BytesIO(df1.to_csv(index=False).encode())
+ gcs_buffer.write(df1.to_csv(index=False).encode())
- monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
df2 = read_csv("gs://test/test.csv", parse_dates=["dt"])
tm.assert_frame_equal(df1, df2)
@td.skip_if_no("gcsfs")
-def test_to_csv_gcs(monkeypatch):
- from fsspec import AbstractFileSystem
+def test_to_csv_gcs(gcs_buffer):
from fsspec import registry
registry.target.clear() # noqa # remove state
+
df1 = DataFrame(
{
"int": [1, 3],
@@ -49,35 +65,62 @@ def test_to_csv_gcs(monkeypatch):
"dt": date_range("2018-06-18", periods=2),
}
)
- s = BytesIO()
- s.close = lambda: True
- class MockGCSFileSystem(AbstractFileSystem):
- def open(*args, **kwargs):
- s.seek(0)
- return s
-
- monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
df1.to_csv("gs://test/test.csv", index=True)
- def mock_get_filepath_or_buffer(*args, **kwargs):
- return BytesIO(df1.to_csv(index=True).encode()), None, None, False
-
- monkeypatch.setattr(
- "pandas.io.common.get_filepath_or_buffer", mock_get_filepath_or_buffer
- )
-
df2 = read_csv("gs://test/test.csv", parse_dates=["dt"], index_col=0)
tm.assert_frame_equal(df1, df2)
+@td.skip_if_no("gcsfs")
+@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"])
+def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding):
+ """
+ Compression and encoding should work with GCS.
+
+ GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and
+ GH 32392 (read_csv, encoding)
+ """
+ from fsspec import registry
+
+ registry.target.clear() # noqa # remove state
+ df = tm.makeDataFrame()
+
+ # reference of compressed and encoded file
+ compression = {"method": compression_only}
+ if compression_only == "gzip":
+ compression["mtime"] = 1 # be reproducible
+ buffer = BytesIO()
+ df.to_csv(buffer, compression=compression, encoding=encoding, mode="wb")
+
+ # write compressed file with explicit compression
+ path_gcs = "gs://test/test.csv"
+ df.to_csv(path_gcs, compression=compression, encoding=encoding)
+ assert gcs_buffer.getvalue() == buffer.getvalue()
+ read_df = read_csv(
+ path_gcs, index_col=0, compression=compression_only, encoding=encoding
+ )
+ tm.assert_frame_equal(df, read_df)
+
+ # write compressed file with implicit compression
+ if compression_only == "gzip":
+ compression_only = "gz"
+ compression["method"] = "infer"
+ path_gcs += f".{compression_only}"
+ df.to_csv(
+ path_gcs, compression=compression, encoding=encoding,
+ )
+ assert gcs_buffer.getvalue() == buffer.getvalue()
+ read_df = read_csv(path_gcs, index_col=0, compression="infer", encoding=encoding)
+ tm.assert_frame_equal(df, read_df)
+
+
@td.skip_if_no("fastparquet")
@td.skip_if_no("gcsfs")
def test_to_parquet_gcs_new_file(monkeypatch, tmpdir):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
- from fsspec import AbstractFileSystem
- from fsspec import registry
+ from fsspec import AbstractFileSystem, registry
registry.target.clear() # noqa # remove state
df1 = DataFrame(
diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py
index 2c93dbb5b6b83..59034e9f3d807 100644
--- a/pandas/tests/io/test_html.py
+++ b/pandas/tests/io/test_html.py
@@ -114,7 +114,7 @@ def test_to_html_compat(self):
c_idx_names=False,
r_idx_names=False,
)
- .applymap("{0:.3f}".format)
+ .applymap("{:.3f}".format)
.astype(float)
)
out = df.to_html()
@@ -616,7 +616,7 @@ def try_remove_ws(x):
@pytest.mark.slow
def test_gold_canyon(self):
gc = "Gold Canyon"
- with open(self.banklist_data, "r") as f:
+ with open(self.banklist_data) as f:
raw_text = f.read()
assert gc in raw_text
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 82157f3d722a9..35a400cba8671 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -8,6 +8,7 @@
import numpy as np
import pytest
+from pandas.compat import PY38
import pandas.util._test_decorators as td
import pandas as pd
@@ -537,9 +538,11 @@ def test_categorical(self, pa):
expected = df.astype(object)
check_round_trip(df, pa, expected=expected)
- def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
+ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so):
s3fs = pytest.importorskip("s3fs")
- s3 = s3fs.S3FileSystem()
+ if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
+ pytest.skip()
+ s3 = s3fs.S3FileSystem(**s3so)
kw = dict(filesystem=s3)
check_round_trip(
df_compat,
@@ -549,27 +552,62 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
write_kwargs=kw,
)
- def test_s3_roundtrip(self, df_compat, s3_resource, pa):
+ def test_s3_roundtrip(self, df_compat, s3_resource, pa, s3so):
+ if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
+ pytest.skip()
# GH #19134
- check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
+ s3so = dict(storage_options=s3so)
+ check_round_trip(
+ df_compat,
+ pa,
+ path="s3://pandas-test/pyarrow.parquet",
+ read_kwargs=s3so,
+ write_kwargs=s3so,
+ )
- @td.skip_if_no("s3fs")
- @pytest.mark.parametrize("partition_col", [["A"], []])
- def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
+ @td.skip_if_no("s3fs") # also requires flask
+ @pytest.mark.parametrize(
+ "partition_col",
+ [
+ pytest.param(
+ ["A"],
+ marks=pytest.mark.xfail(
+ PY38, reason="Getting back empty DataFrame", raises=AssertionError,
+ ),
+ ),
+ [],
+ ],
+ )
+ def test_s3_roundtrip_for_dir(
+ self, df_compat, s3_resource, pa, partition_col, s3so
+ ):
# GH #26388
- # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
- # As per pyarrow partitioned columns become 'categorical' dtypes
- # and are added to back of dataframe on read
-
expected_df = df_compat.copy()
- if partition_col:
- expected_df[partition_col] = expected_df[partition_col].astype("category")
+
+ # GH #35791
+ # read_table uses the new Arrow Datasets API since pyarrow 1.0.0
+ # Previous behaviour was pyarrow partitioned columns become 'category' dtypes
+ # These are added to back of dataframe on read. In new API category dtype is
+ # only used if partition field is string.
+ legacy_read_table = LooseVersion(pyarrow.__version__) < LooseVersion("1.0.0")
+ if partition_col and legacy_read_table:
+ partition_col_type = "category"
+ else:
+ partition_col_type = "int32"
+
+ expected_df[partition_col] = expected_df[partition_col].astype(
+ partition_col_type
+ )
+
check_round_trip(
df_compat,
pa,
expected=expected_df,
path="s3://pandas-test/parquet_dir",
- write_kwargs={"partition_cols": partition_col, "compression": None},
+ read_kwargs=dict(storage_options=s3so),
+ write_kwargs=dict(
+ partition_cols=partition_col, compression=None, storage_options=s3so
+ ),
check_like=True,
repeat=1,
)
@@ -743,9 +781,15 @@ def test_filter_row_groups(self, fp):
result = read_parquet(path, fp, filters=[("a", "==", 0)])
assert len(result) == 1
- def test_s3_roundtrip(self, df_compat, s3_resource, fp):
+ def test_s3_roundtrip(self, df_compat, s3_resource, fp, s3so):
# GH #19134
- check_round_trip(df_compat, fp, path="s3://pandas-test/fastparquet.parquet")
+ check_round_trip(
+ df_compat,
+ fp,
+ path="s3://pandas-test/fastparquet.parquet",
+ read_kwargs=dict(storage_options=s3so),
+ write_kwargs=dict(compression=None, storage_options=s3so),
+ )
def test_partition_cols_supported(self, fp, df_full):
# GH #23283
diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py
index e4d43db7834e3..2241fe7013568 100644
--- a/pandas/tests/io/test_pickle.py
+++ b/pandas/tests/io/test_pickle.py
@@ -14,7 +14,9 @@
import datetime
import glob
import gzip
+import io
import os
+from pathlib import Path
import pickle
import shutil
from warnings import catch_warnings, simplefilter
@@ -22,7 +24,7 @@
import pytest
-from pandas.compat import _get_lzma_file, _import_lzma, is_platform_little_endian
+from pandas.compat import get_lzma_file, import_lzma, is_platform_little_endian
import pandas.util._test_decorators as td
import pandas as pd
@@ -31,7 +33,7 @@
from pandas.tseries.offsets import Day, MonthEnd
-lzma = _import_lzma()
+lzma = import_lzma()
@pytest.fixture(scope="module")
@@ -183,6 +185,15 @@ def python_unpickler(path):
result = python_unpickler(path)
compare_element(result, expected, typ)
+ # and the same for file objects (GH 35679)
+ with open(path, mode="wb") as handle:
+ writer(expected, path)
+ handle.seek(0) # shouldn't close file handle
+ with open(path, mode="rb") as handle:
+ result = pd.read_pickle(handle)
+ handle.seek(0) # shouldn't close file handle
+ compare_element(result, expected, typ)
+
def test_pickle_path_pathlib():
df = tm.makeDataFrame()
@@ -257,7 +268,7 @@ def compress_file(self, src_path, dest_path, compression):
with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
f.write(src_path, os.path.basename(src_path))
elif compression == "xz":
- f = _get_lzma_file(lzma)(dest_path, "w")
+ f = get_lzma_file(lzma)(dest_path, "w")
else:
msg = f"Unrecognized compression type: {compression}"
raise ValueError(msg)
@@ -477,3 +488,30 @@ def test_read_pickle_with_subclass():
tm.assert_series_equal(result[0], expected[0])
assert isinstance(result[1], MyTz)
+
+
+def test_pickle_binary_object_compression(compression):
+ """
+ Read/write from binary file-objects w/wo compression.
+
+ GH 26237, GH 29054, and GH 29570
+ """
+ df = tm.makeDataFrame()
+
+ # reference for compression
+ with tm.ensure_clean() as path:
+ df.to_pickle(path, compression=compression)
+ reference = Path(path).read_bytes()
+
+ # write
+ buffer = io.BytesIO()
+ df.to_pickle(buffer, compression=compression)
+ buffer.seek(0)
+
+ # gzip and zip store the filename: cannot compare the compressed content
+ assert buffer.getvalue() == reference or compression in ("gzip", "zip")
+
+ # read
+ read_df = pd.read_pickle(buffer, compression=compression)
+ buffer.seek(0)
+ tm.assert_frame_equal(df, read_df)
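
Mirroring test_pickle_binary_object_compression, to_pickle/read_pickle accept open binary handles and apply compression on the fly; compression cannot be inferred from a handle, so it is passed explicitly both ways:

    import io

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    buf = io.BytesIO()
    df.to_pickle(buf, compression="bz2")  # compresses into the open handle
    buf.seek(0)
    assert pd.read_pickle(buf, compression="bz2").equals(df)
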
diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py
index a76be9465f62a..0ee6cb0796644 100644
--- a/pandas/tests/io/test_s3.py
+++ b/pandas/tests/io/test_s3.py
@@ -1,8 +1,12 @@
from io import BytesIO
+import os
import pytest
+import pandas.util._test_decorators as td
+
from pandas import read_csv
+import pandas._testing as tm
def test_streaming_s3_objects():
@@ -15,3 +19,30 @@ def test_streaming_s3_objects():
for el in data:
body = StreamingBody(BytesIO(el), content_length=len(el))
read_csv(body)
+
+
+@tm.network
+@td.skip_if_no("s3fs")
+def test_read_without_creds_from_pub_bucket():
+ # GH 34626
+ # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
+ result = read_csv("s3://gdelt-open-data/events/1981.csv", nrows=3)
+ assert len(result) == 3
+
+
+@tm.network
+@td.skip_if_no("s3fs")
+def test_read_with_creds_from_pub_bucket():
+ # Ensure we can read from a public bucket with credentials
+ # GH 34626
+ # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt
+
+ with tm.ensure_safe_environment_variables():
+ # temporary workaround as moto fails for botocore >= 1.11 otherwise,
+ # see https://github.com/spulec/moto/issues/1924 & 1952
+ os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
+ os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
+ df = read_csv(
+ "s3://gdelt-open-data/events/1981.csv", nrows=5, sep="\t", header=None
+ )
+ assert len(df) == 5
diff --git a/pandas/tests/io/test_spss.py b/pandas/tests/io/test_spss.py
index 013f56f83c5ec..a4894ff66ab9f 100644
--- a/pandas/tests/io/test_spss.py
+++ b/pandas/tests/io/test_spss.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
import numpy as np
import pytest
@@ -7,9 +9,10 @@
pyreadstat = pytest.importorskip("pyreadstat")
-def test_spss_labelled_num(datapath):
+@pytest.mark.parametrize("path_klass", [lambda p: p, Path])
+def test_spss_labelled_num(path_klass, datapath):
# test file from the Haven project (https://haven.tidyverse.org/)
- fname = datapath("io", "data", "spss", "labelled-num.sav")
+ fname = path_klass(datapath("io", "data", "spss", "labelled-num.sav"))
df = pd.read_spss(fname, convert_categoricals=True)
expected = pd.DataFrame({"VAR00002": "This is one"}, index=[0])
diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py
index a07e7a74b7573..32a15e6201037 100644
--- a/pandas/tests/io/test_sql.py
+++ b/pandas/tests/io/test_sql.py
@@ -48,10 +48,10 @@
try:
import sqlalchemy
- import sqlalchemy.schema
- import sqlalchemy.sql.sqltypes as sqltypes
from sqlalchemy.ext import declarative
from sqlalchemy.orm import session as sa_session
+ import sqlalchemy.schema
+ import sqlalchemy.sql.sqltypes as sqltypes
SQLALCHEMY_INSTALLED = True
except ImportError:
@@ -263,7 +263,8 @@ def _get_all_tables(self):
return table_list
def _close_conn(self):
- pass
+ # https://docs.sqlalchemy.org/en/13/core/connections.html#engine-disposal
+ self.conn.dispose()
class PandasSQLTest:
@@ -280,7 +281,6 @@ def _get_exec(self):
@pytest.fixture(params=[("io", "data", "csv", "iris.csv")])
def load_iris_data(self, datapath, request):
- import io
iris_csv_file = datapath(*request.param)
@@ -290,7 +290,7 @@ def load_iris_data(self, datapath, request):
self.drop_table("iris")
self._get_exec().execute(SQL_STRINGS["create_iris"][self.flavor])
- with io.open(iris_csv_file, mode="r", newline=None) as iris_csv:
+ with open(iris_csv_file, mode="r", newline=None) as iris_csv:
r = csv.reader(iris_csv)
next(r) # skip header row
ins = SQL_STRINGS["insert_iris"][self.flavor]
@@ -1242,7 +1242,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest):
def setup_class(cls):
cls.setup_import()
cls.setup_driver()
- conn = cls.connect()
+ conn = cls.conn = cls.connect()
conn.connect()
def load_test_data_and_sql(self):
@@ -1813,6 +1813,24 @@ def main(connectable):
DataFrame({"test_foo_data": [0, 1, 2]}).to_sql("test_foo_data", self.conn)
main(self.conn)
+ @pytest.mark.parametrize(
+ "input",
+ [{"foo": [np.inf]}, {"foo": [-np.inf]}, {"foo": [-np.inf], "infe0": ["bar"]}],
+ )
+ def test_to_sql_with_negative_npinf(self, input):
+ # GH 34431
+
+ df = pd.DataFrame(input)
+
+ if self.flavor == "mysql":
+ msg = "inf cannot be used with MySQL"
+ with pytest.raises(ValueError, match=msg):
+ df.to_sql("foobar", self.conn, index=False)
+ else:
+ df.to_sql("foobar", self.conn, index=False)
+ res = sql.read_sql_table("foobar", self.conn)
+ tm.assert_equal(df, res)
+
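
The new test encodes a backend split: MySQL has no representation for IEEE infinities, so pandas raises ValueError, while other backends store them as-is. A sketch of the supported path against in-memory SQLite:

    import sqlite3

    import numpy as np
    import pandas as pd

    conn = sqlite3.connect(":memory:")
    pd.DataFrame({"foo": [np.inf, -np.inf]}).to_sql("foobar", conn, index=False)
    out = pd.read_sql_query("SELECT * FROM foobar", conn)
    assert np.isinf(out["foo"]).all()  # SQLite REALs round-trip +/-inf
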
def test_temporary_table(self):
test_data = "Hello, World!"
expected = DataFrame({"spam": [test_data]})
@@ -2330,9 +2348,6 @@ def date_format(dt):
def format_query(sql, *args):
- """
-
- """
processed_args = []
for arg in args:
if isinstance(arg, float) and isna(arg):
diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
index 6d7fec803a8e0..88f61390957a6 100644
--- a/pandas/tests/io/test_stata.py
+++ b/pandas/tests/io/test_stata.py
@@ -1153,7 +1153,7 @@ def test_read_chunks_117(
from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
from_frame = self._convert_categorical(from_frame)
tm.assert_frame_equal(
- from_frame, chunk, check_dtype=False, check_datetimelike_compat=True,
+ from_frame, chunk, check_dtype=False, check_datetimelike_compat=True
)
pos += chunksize
@@ -1251,7 +1251,7 @@ def test_read_chunks_115(
from_frame = parsed.iloc[pos : pos + chunksize, :].copy()
from_frame = self._convert_categorical(from_frame)
tm.assert_frame_equal(
- from_frame, chunk, check_dtype=False, check_datetimelike_compat=True,
+ from_frame, chunk, check_dtype=False, check_datetimelike_compat=True
)
pos += chunksize
diff --git a/pandas/tests/plotting/common.py b/pandas/tests/plotting/common.py
index 896d3278cdde1..9301a29933d45 100644
--- a/pandas/tests/plotting/common.py
+++ b/pandas/tests/plotting/common.py
@@ -14,24 +14,24 @@
import pandas._testing as tm
-"""
-This is a common base class used for various plotting tests
-"""
-
-
@td.skip_if_no_mpl
class TestPlotBase:
+ """
+ This is a common base class used for various plotting tests
+ """
+
def setup_method(self, method):
import matplotlib as mpl
+
from pandas.plotting._matplotlib import compat
mpl.rcdefaults()
- self.mpl_ge_2_2_3 = compat._mpl_ge_2_2_3()
- self.mpl_ge_3_0_0 = compat._mpl_ge_3_0_0()
- self.mpl_ge_3_1_0 = compat._mpl_ge_3_1_0()
- self.mpl_ge_3_2_0 = compat._mpl_ge_3_2_0()
+ self.mpl_ge_2_2_3 = compat.mpl_ge_2_2_3()
+ self.mpl_ge_3_0_0 = compat.mpl_ge_3_0_0()
+ self.mpl_ge_3_1_0 = compat.mpl_ge_3_1_0()
+ self.mpl_ge_3_2_0 = compat.mpl_ge_3_2_0()
self.bp_n_objects = 7
self.polycollection_factor = 2
@@ -187,8 +187,8 @@ def _check_colors(
Series used for color grouping key
used for andrew_curves, parallel_coordinates, radviz test
"""
+ from matplotlib.collections import Collection, LineCollection, PolyCollection
from matplotlib.lines import Line2D
- from matplotlib.collections import Collection, PolyCollection, LineCollection
conv = self.colorconverter
if linecolors is not None:
@@ -330,7 +330,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None):
figsize : tuple
expected figsize. default is matplotlib default
"""
- from pandas.plotting._matplotlib.tools import _flatten
+ from pandas.plotting._matplotlib.tools import flatten_axes
if figsize is None:
figsize = self.default_figsize
@@ -343,7 +343,7 @@ def _check_axes_shape(self, axes, axes_num=None, layout=None, figsize=None):
assert len(ax.get_children()) > 0
if layout is not None:
- result = self._get_axes_layout(_flatten(axes))
+ result = self._get_axes_layout(flatten_axes(axes))
assert result == layout
tm.assert_numpy_array_equal(
@@ -370,9 +370,9 @@ def _flatten_visible(self, axes):
axes : matplotlib Axes object, or its list-like
"""
- from pandas.plotting._matplotlib.tools import _flatten
+ from pandas.plotting._matplotlib.tools import flatten_axes
- axes = _flatten(axes)
+ axes = flatten_axes(axes)
axes = [ax for ax in axes if ax.get_visible()]
return axes
diff --git a/pandas/tests/plotting/test_converter.py b/pandas/tests/plotting/test_converter.py
index df2c9ecbd7a0a..b2eeb649276d5 100644
--- a/pandas/tests/plotting/test_converter.py
+++ b/pandas/tests/plotting/test_converter.py
@@ -27,6 +27,7 @@
pass
pytest.importorskip("matplotlib.pyplot")
+dates = pytest.importorskip("matplotlib.dates")
def test_registry_mpl_resets():
@@ -146,7 +147,7 @@ def test_convert_accepts_unicode(self):
def test_conversion(self):
rs = self.dtc.convert(["2012-1-1"], None, None)[0]
- xp = datetime(2012, 1, 1).toordinal()
+ xp = dates.date2num(datetime(2012, 1, 1))
assert rs == xp
rs = self.dtc.convert("2012-1-1", None, None)
@@ -155,9 +156,6 @@ def test_conversion(self):
rs = self.dtc.convert(date(2012, 1, 1), None, None)
assert rs == xp
- rs = self.dtc.convert(datetime(2012, 1, 1).toordinal(), None, None)
- assert rs == xp
-
rs = self.dtc.convert("2012-1-1", None, None)
assert rs == xp
diff --git a/pandas/tests/plotting/test_datetimelike.py b/pandas/tests/plotting/test_datetimelike.py
index 201856669103a..78aa1887f5611 100644
--- a/pandas/tests/plotting/test_datetimelike.py
+++ b/pandas/tests/plotting/test_datetimelike.py
@@ -9,7 +9,7 @@
from pandas._libs.tslibs import BaseOffset, to_offset
import pandas.util._test_decorators as td
-from pandas import DataFrame, Index, NaT, Series, isna
+from pandas import DataFrame, Index, NaT, Series, isna, to_datetime
import pandas._testing as tm
from pandas.core.indexes.datetimes import DatetimeIndex, bdate_range, date_range
from pandas.core.indexes.period import Period, PeriodIndex, period_range
@@ -331,7 +331,7 @@ def test_freq_with_no_period_alias(self):
bts = tm.makeTimeSeries(5).asfreq(freq)
_, ax = self.plt.subplots()
bts.plot(ax=ax)
- assert ax.get_lines()[0].get_xydata()[0, 0] == bts.index[0].toordinal()
+
idx = ax.get_lines()[0].get_xdata()
msg = "freq not specified and cannot be inferred"
with pytest.raises(ValueError, match=msg):
@@ -1279,6 +1279,8 @@ def test_mpl_nopandas(self):
@pytest.mark.slow
def test_irregular_ts_shared_ax_xlim(self):
# GH 2960
+ from pandas.plotting._matplotlib.converter import DatetimeConverter
+
ts = tm.makeTimeSeries()[:20]
ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]]
@@ -1289,8 +1291,8 @@ def test_irregular_ts_shared_ax_xlim(self):
# check that axis limits are correct
left, right = ax.get_xlim()
- assert left <= ts_irregular.index.min().toordinal()
- assert right >= ts_irregular.index.max().toordinal()
+ assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax)
+ assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax)
@pytest.mark.slow
def test_secondary_y_non_ts_xlim(self):
@@ -1345,6 +1347,8 @@ def test_secondary_y_mixed_freq_ts_xlim(self):
@pytest.mark.slow
def test_secondary_y_irregular_ts_xlim(self):
# GH 3490 - irregular-timeseries with secondary y
+ from pandas.plotting._matplotlib.converter import DatetimeConverter
+
ts = tm.makeTimeSeries()[:20]
ts_irregular = ts[[1, 4, 5, 6, 8, 9, 10, 12, 13, 14, 15, 17, 18]]
@@ -1356,8 +1360,8 @@ def test_secondary_y_irregular_ts_xlim(self):
ts_irregular[:5].plot(ax=ax)
left, right = ax.get_xlim()
- assert left <= ts_irregular.index.min().toordinal()
- assert right >= ts_irregular.index.max().toordinal()
+ assert left <= DatetimeConverter.convert(ts_irregular.index.min(), "", ax)
+ assert right >= DatetimeConverter.convert(ts_irregular.index.max(), "", ax)
def test_plot_outofbounds_datetime(self):
# 2579 - checking this does not raise
@@ -1490,6 +1494,32 @@ def test_matplotlib_scatter_datetime64(self):
expected = "2017-12-12"
assert label.get_text() == expected
+ def test_check_xticks_rot(self):
+ # https://github.com/pandas-dev/pandas/issues/29460
+ # regular time series
+ x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-03"])
+ df = DataFrame({"x": x, "y": [1, 2, 3]})
+ axes = df.plot(x="x", y="y")
+ self._check_ticks_props(axes, xrot=0)
+
+ # irregular time series
+ x = to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"])
+ df = DataFrame({"x": x, "y": [1, 2, 3]})
+ axes = df.plot(x="x", y="y")
+ self._check_ticks_props(axes, xrot=30)
+
+ # use timeseries index or not
+ axes = df.set_index("x").plot(y="y", use_index=True)
+ self._check_ticks_props(axes, xrot=30)
+ axes = df.set_index("x").plot(y="y", use_index=False)
+ self._check_ticks_props(axes, xrot=0)
+
+ # separate subplots
+ axes = df.plot(x="x", y="y", subplots=True, sharex=True)
+ self._check_ticks_props(axes, xrot=30)
+ axes = df.plot(x="x", y="y", subplots=True, sharex=False)
+ self._check_ticks_props(axes, xrot=0)
+
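
test_check_xticks_rot pins the GH 29460 behavior: evenly spaced datetimes keep horizontal tick labels (xrot=0), while irregular spacing falls back to the 30-degree auto-rotation. Inspecting the rotation directly, assuming matplotlib is available:

    import pandas as pd

    x = pd.to_datetime(["2020-05-01", "2020-05-02", "2020-05-04"])  # irregular
    ax = pd.DataFrame({"x": x, "y": [1, 2, 3]}).plot(x="x", y="y")
    rotations = {label.get_rotation() for label in ax.get_xticklabels()}
    # expected {30.0} here; a gap-free daily series would give {0.0}
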
def _check_plot_works(f, freq=None, series=None, *args, **kwargs):
import matplotlib.pyplot as plt
diff --git a/pandas/tests/plotting/test_frame.py b/pandas/tests/plotting/test_frame.py
index 3d85e79b15c4c..ca4c2bdcc2fe1 100644
--- a/pandas/tests/plotting/test_frame.py
+++ b/pandas/tests/plotting/test_frame.py
@@ -48,10 +48,9 @@ def _assert_xtickslabels_visibility(self, axes, expected):
for ax, exp in zip(axes, expected):
self._check_visible(ax.get_xticklabels(), visible=exp)
- @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True)
@pytest.mark.slow
def test_plot(self):
- from pandas.plotting._matplotlib.compat import _mpl_ge_3_1_0
+ from pandas.plotting._matplotlib.compat import mpl_ge_3_1_0
df = self.tdf
_check_plot_works(df.plot, grid=False)
@@ -66,10 +65,11 @@ def test_plot(self):
with tm.assert_produces_warning(UserWarning):
axes = _check_plot_works(df.plot, subplots=True, use_index=False)
+ self._check_ticks_props(axes, xrot=0)
self._check_axes_shape(axes, axes_num=4, layout=(4, 1))
df = DataFrame({"x": [1, 2], "y": [3, 4]})
- if _mpl_ge_3_1_0():
+ if mpl_ge_3_1_0():
msg = "'Line2D' object has no property 'blarg'"
else:
msg = "Unknown property blarg"
@@ -78,7 +78,8 @@ def test_plot(self):
df = DataFrame(np.random.rand(10, 3), index=list(string.ascii_letters[:10]))
- _check_plot_works(df.plot, use_index=True)
+ ax = _check_plot_works(df.plot, use_index=True)
+ self._check_ticks_props(ax, xrot=0)
_check_plot_works(df.plot, sort_columns=False)
_check_plot_works(df.plot, yticks=[1, 5, 10])
_check_plot_works(df.plot, xticks=[1, 5, 10])
@@ -110,7 +111,8 @@ def test_plot(self):
tuples = zip(string.ascii_letters[:10], range(10))
df = DataFrame(np.random.rand(10, 3), index=MultiIndex.from_tuples(tuples))
- _check_plot_works(df.plot, use_index=True)
+ ax = _check_plot_works(df.plot, use_index=True)
+ self._check_ticks_props(ax, xrot=0)
# unicode
index = MultiIndex.from_tuples(
@@ -205,6 +207,24 @@ def test_color_and_style_arguments(self):
with pytest.raises(ValueError):
df.plot(color=["red", "black"], style=["k-", "r--"])
+ @pytest.mark.parametrize(
+ "color, expected",
+ [
+ ("green", ["green"] * 4),
+ (["yellow", "red", "green", "blue"], ["yellow", "red", "green", "blue"]),
+ ],
+ )
+ def test_color_and_marker(self, color, expected):
+ # GH 21003
+ df = DataFrame(np.random.random((7, 4)))
+ ax = df.plot(color=color, style="d--")
+ # check colors
+ result = [i.get_color() for i in ax.lines]
+ assert result == expected
+ # check markers and linestyles
+ assert all(i.get_linestyle() == "--" for i in ax.lines)
+ assert all(i.get_marker() == "d" for i in ax.lines)
+
def test_nonnumeric_exclude(self):
df = DataFrame({"A": ["x", "y", "z"], "B": [1, 2, 3]})
ax = df.plot()
@@ -286,12 +306,14 @@ def test_xcompat(self):
ax = df.plot(x_compat=True)
lines = ax.get_lines()
assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+ self._check_ticks_props(ax, xrot=30)
tm.close()
pd.plotting.plot_params["xaxis.compat"] = True
ax = df.plot()
lines = ax.get_lines()
assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+ self._check_ticks_props(ax, xrot=30)
tm.close()
pd.plotting.plot_params["x_compat"] = False
@@ -307,12 +329,14 @@ def test_xcompat(self):
ax = df.plot()
lines = ax.get_lines()
assert not isinstance(lines[0].get_xdata(), PeriodIndex)
+ self._check_ticks_props(ax, xrot=30)
tm.close()
ax = df.plot()
lines = ax.get_lines()
assert not isinstance(lines[0].get_xdata(), PeriodIndex)
assert isinstance(PeriodIndex(lines[0].get_xdata()), PeriodIndex)
+ self._check_ticks_props(ax, xrot=0)
def test_period_compat(self):
# GH 9012
@@ -468,7 +492,6 @@ def test_groupby_boxplot_sharex(self):
expected = [False, False, True, True]
self._assert_xtickslabels_visibility(axes, expected)
- @pytest.mark.xfail(reason="Waiting for PR 34334", strict=True)
@pytest.mark.slow
def test_subplots_timeseries(self):
idx = date_range(start="2014-07-01", freq="M", periods=10)
@@ -1321,7 +1344,7 @@ def test_scatter_with_c_column_name_with_colors(self, cmap):
def test_plot_scatter_with_s(self):
# this refers to GH 32904
- df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"],)
+ df = DataFrame(np.random.random((10, 3)) * 100, columns=["a", "b", "c"])
ax = df.plot.scatter(x="a", y="b", s="c")
tm.assert_numpy_array_equal(df["c"].values, right=ax.collections[0].get_sizes())
@@ -1563,6 +1586,7 @@ def test_boxplot(self):
ax.xaxis.get_ticklocs(), np.arange(1, len(numeric_cols) + 1)
)
assert len(ax.lines) == self.bp_n_objects * len(numeric_cols)
+ tm.close()
axes = series.plot.box(rot=40)
self._check_ticks_props(axes, xrot=40, yrot=0)
@@ -1715,7 +1739,7 @@ def test_hist_df(self):
def test_hist_weights(self, weights):
# GH 33173
np.random.seed(0)
- df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100,))))
+ df = pd.DataFrame(dict(zip(["A", "B"], np.random.randn(2, 100))))
ax1 = _check_plot_works(df.plot, kind="hist", weights=weights)
ax2 = _check_plot_works(df.plot, kind="hist")
@@ -2407,8 +2431,8 @@ def test_specified_props_kwd_plot_box(self, props, expected):
assert result[expected][0].get_color() == "C1"
def test_default_color_cycle(self):
- import matplotlib.pyplot as plt
import cycler
+ import matplotlib.pyplot as plt
colors = list("rgbk")
plt.rcParams["axes.prop_cycle"] = cycler.cycler("color", colors)
@@ -2795,10 +2819,12 @@ def test_table(self):
_check_plot_works(df.plot, table=True)
_check_plot_works(df.plot, table=df)
- ax = df.plot()
- assert len(ax.tables) == 0
- plotting.table(ax, df.T)
- assert len(ax.tables) == 1
+ # GH 35945 UserWarning
+ with tm.assert_produces_warning(None):
+ ax = df.plot()
+ assert len(ax.tables) == 0
+ plotting.table(ax, df.T)
+ assert len(ax.tables) == 1
def test_errorbar_scatter(self):
df = DataFrame(np.random.randn(5, 2), index=range(5), columns=["x", "y"])
@@ -2952,8 +2978,8 @@ def _check(axes):
@td.skip_if_no_scipy
def test_memory_leak(self):
""" Check that every plot type gets properly collected. """
- import weakref
import gc
+ import weakref
results = {}
for kind in plotting.PlotAccessor._all_kinds:
@@ -3031,8 +3057,8 @@ def test_df_subplots_patterns_minorticks(self):
@pytest.mark.slow
def test_df_gridspec_patterns(self):
# GH 10819
- import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
+ import matplotlib.pyplot as plt
ts = Series(np.random.randn(10), index=date_range("1/1/2000", periods=10))
@@ -3421,9 +3447,9 @@ def test_xlabel_ylabel_dataframe_subplots(
def _generate_4_axes_via_gridspec():
- import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.gridspec # noqa
+ import matplotlib.pyplot as plt
gs = mpl.gridspec.GridSpec(2, 2)
ax_tl = plt.subplot(gs[0, 0])
diff --git a/pandas/tests/plotting/test_hist_method.py b/pandas/tests/plotting/test_hist_method.py
index b6a6c326c3df3..34c881855d16a 100644
--- a/pandas/tests/plotting/test_hist_method.py
+++ b/pandas/tests/plotting/test_hist_method.py
@@ -101,7 +101,7 @@ def test_hist_layout_with_by(self):
@pytest.mark.slow
def test_hist_no_overlap(self):
- from matplotlib.pyplot import subplot, gcf
+ from matplotlib.pyplot import gcf, subplot
x = Series(randn(2))
y = Series(randn(2))
@@ -352,6 +352,7 @@ class TestDataFrameGroupByPlots(TestPlotBase):
@pytest.mark.slow
def test_grouped_hist_legacy(self):
from matplotlib.patches import Rectangle
+
from pandas.plotting._matplotlib.hist import _grouped_hist
df = DataFrame(randn(500, 2), columns=["A", "B"])
diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py
index 75eeede472fe9..0208ab3e0225b 100644
--- a/pandas/tests/plotting/test_misc.py
+++ b/pandas/tests/plotting/test_misc.py
@@ -96,7 +96,7 @@ def test_bootstrap_plot(self):
class TestDataFramePlots(TestPlotBase):
@td.skip_if_no_scipy
def test_scatter_matrix_axis(self):
- from pandas.plotting._matplotlib.compat import _mpl_ge_3_0_0
+ from pandas.plotting._matplotlib.compat import mpl_ge_3_0_0
scatter_matrix = plotting.scatter_matrix
@@ -105,7 +105,7 @@ def test_scatter_matrix_axis(self):
# we are plotting multiples on a sub-plot
with tm.assert_produces_warning(
- UserWarning, raise_on_extra_warnings=_mpl_ge_3_0_0()
+ UserWarning, raise_on_extra_warnings=mpl_ge_3_0_0()
):
axes = _check_plot_works(
scatter_matrix, filterwarnings="always", frame=df, range_padding=0.1
@@ -131,9 +131,10 @@ def test_scatter_matrix_axis(self):
@pytest.mark.slow
def test_andrews_curves(self, iris):
- from pandas.plotting import andrews_curves
from matplotlib import cm
+ from pandas.plotting import andrews_curves
+
df = iris
_check_plot_works(andrews_curves, frame=df, class_column="Name")
@@ -206,9 +207,10 @@ def test_andrews_curves(self, iris):
@pytest.mark.slow
def test_parallel_coordinates(self, iris):
- from pandas.plotting import parallel_coordinates
from matplotlib import cm
+ from pandas.plotting import parallel_coordinates
+
df = iris
ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name")
@@ -279,9 +281,10 @@ def test_parallel_coordinates_with_sorted_labels(self):
@pytest.mark.slow
def test_radviz(self, iris):
- from pandas.plotting import radviz
from matplotlib import cm
+ from pandas.plotting import radviz
+
df = iris
_check_plot_works(radviz, frame=df, class_column="Name")
@@ -350,7 +353,7 @@ def test_get_standard_colors_random_seed(self):
# GH17525
df = DataFrame(np.zeros((10, 10)))
- # Make sure that the random seed isn't reset by _get_standard_colors
+ # Make sure that the random seed isn't reset by get_standard_colors
plotting.parallel_coordinates(df, 0)
rand1 = random.random()
plotting.parallel_coordinates(df, 0)
@@ -358,19 +361,19 @@ def test_get_standard_colors_random_seed(self):
assert rand1 != rand2
# Make sure it produces the same colors every time it's called
- from pandas.plotting._matplotlib.style import _get_standard_colors
+ from pandas.plotting._matplotlib.style import get_standard_colors
- color1 = _get_standard_colors(1, color_type="random")
- color2 = _get_standard_colors(1, color_type="random")
+ color1 = get_standard_colors(1, color_type="random")
+ color2 = get_standard_colors(1, color_type="random")
assert color1 == color2
def test_get_standard_colors_default_num_colors(self):
- from pandas.plotting._matplotlib.style import _get_standard_colors
+ from pandas.plotting._matplotlib.style import get_standard_colors
# Make sure the default color_types returns the specified amount
- color1 = _get_standard_colors(1, color_type="default")
- color2 = _get_standard_colors(9, color_type="default")
- color3 = _get_standard_colors(20, color_type="default")
+ color1 = get_standard_colors(1, color_type="default")
+ color2 = get_standard_colors(9, color_type="default")
+ color3 = get_standard_colors(20, color_type="default")
assert len(color1) == 1
assert len(color2) == 9
assert len(color3) == 20
@@ -397,10 +400,11 @@ def test_get_standard_colors_no_appending(self):
# Make sure not to add more colors so that matplotlib can cycle
# correctly.
from matplotlib import cm
- from pandas.plotting._matplotlib.style import _get_standard_colors
+
+ from pandas.plotting._matplotlib.style import get_standard_colors
color_before = cm.gnuplot(range(5))
- color_after = _get_standard_colors(1, color=color_before)
+ color_after = get_standard_colors(1, color=color_before)
assert len(color_after) == len(color_before)
df = DataFrame(np.random.randn(48, 4), columns=list("ABCD"))
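
The edits above track the rename of the private `_get_standard_colors` helper to `get_standard_colors`. A rough usage sketch; this is an internal helper, so the keyword set shown here is an assumption inferred from the calls in these tests:

```python
from pandas.plotting._matplotlib.style import get_standard_colors

# Ask for n colors from the default property cycle.
colors = get_standard_colors(3, color_type="default")
assert len(colors) == 3

# A single color is broadcast to the requested count.
assert get_standard_colors(3, color="red") == ["red"] * 3
```
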
diff --git a/pandas/tests/plotting/test_series.py b/pandas/tests/plotting/test_series.py
index 64da98f57676f..d56c882471a9a 100644
--- a/pandas/tests/plotting/test_series.py
+++ b/pandas/tests/plotting/test_series.py
@@ -109,6 +109,7 @@ def test_ts_area_lim(self):
line = ax.get_lines()[0].get_data(orig=False)[0]
assert xmin <= line[0]
assert xmax >= line[-1]
+ self._check_ticks_props(ax, xrot=0)
tm.close()
# GH 7471
@@ -118,6 +119,7 @@ def test_ts_area_lim(self):
line = ax.get_lines()[0].get_data(orig=False)[0]
assert xmin <= line[0]
assert xmax >= line[-1]
+ self._check_ticks_props(ax, xrot=30)
tm.close()
tz_ts = self.ts.copy()
@@ -128,6 +130,7 @@ def test_ts_area_lim(self):
line = ax.get_lines()[0].get_data(orig=False)[0]
assert xmin <= line[0]
assert xmax >= line[-1]
+ self._check_ticks_props(ax, xrot=0)
tm.close()
_, ax = self.plt.subplots()
@@ -136,6 +139,7 @@ def test_ts_area_lim(self):
line = ax.get_lines()[0].get_data(orig=False)[0]
assert xmin <= line[0]
assert xmax >= line[-1]
+ self._check_ticks_props(ax, xrot=0)
def test_label(self):
s = Series([1, 2])
@@ -274,14 +278,17 @@ def test_rotation(self):
self._check_ticks_props(axes, xrot=30)
def test_irregular_datetime(self):
+ from pandas.plotting._matplotlib.converter import DatetimeConverter
+
rng = date_range("1/1/2000", "3/1/2000")
rng = rng[[0, 1, 2, 3, 5, 9, 10, 11, 12]]
ser = Series(randn(len(rng)), rng)
_, ax = self.plt.subplots()
ax = ser.plot(ax=ax)
- xp = datetime(1999, 1, 1).toordinal()
+ xp = DatetimeConverter.convert(datetime(1999, 1, 1), "", ax)
ax.set_xlim("1/1/1999", "1/1/2001")
assert xp == ax.get_xlim()[0]
+ self._check_ticks_props(ax, xrot=30)
def test_unsorted_index_xlim(self):
ser = Series(
@@ -450,7 +457,7 @@ def test_hist_layout_with_by(self):
@pytest.mark.slow
def test_hist_no_overlap(self):
- from matplotlib.pyplot import subplot, gcf
+ from matplotlib.pyplot import gcf, subplot
x = Series(randn(2))
y = Series(randn(2))
@@ -684,11 +691,13 @@ def test_kind_both_ways(self):
kinds = (
plotting.PlotAccessor._common_kinds + plotting.PlotAccessor._series_kinds
)
- _, ax = self.plt.subplots()
for kind in kinds:
-
+ _, ax = self.plt.subplots()
s.plot(kind=kind, ax=ax)
+ self.plt.close()
+ _, ax = self.plt.subplots()
getattr(s.plot, kind)()
+ self.plt.close()
@pytest.mark.slow
def test_invalid_plot_data(self):
@@ -729,6 +738,26 @@ def test_dup_datetime_index_plot(self):
s = Series(values, index=index)
_check_plot_works(s.plot)
+ def test_errorbar_asymmetrical(self):
+ # GH9536
+ s = Series(np.arange(10), name="x")
+ err = np.random.rand(2, 10)
+
+ ax = s.plot(yerr=err, xerr=err)
+
+ result = np.vstack([i.vertices[:, 1] for i in ax.collections[1].get_paths()])
+ expected = (err.T * np.array([-1, 1])) + s.to_numpy().reshape(-1, 1)
+ tm.assert_numpy_array_equal(result, expected)
+
+ msg = (
+ "Asymmetrical error bars should be provided "
+ f"with the shape \\(2, {len(s)}\\)"
+ )
+ with pytest.raises(ValueError, match=msg):
+ s.plot(yerr=np.random.rand(2, 11))
+
+ tm.close()
+
@pytest.mark.slow
def test_errorbar_plot(self):
@@ -785,52 +814,53 @@ def test_series_grid_settings(self):
@pytest.mark.slow
def test_standard_colors(self):
- from pandas.plotting._matplotlib.style import _get_standard_colors
+ from pandas.plotting._matplotlib.style import get_standard_colors
for c in ["r", "red", "green", "#FF0000"]:
- result = _get_standard_colors(1, color=c)
+ result = get_standard_colors(1, color=c)
assert result == [c]
- result = _get_standard_colors(1, color=[c])
+ result = get_standard_colors(1, color=[c])
assert result == [c]
- result = _get_standard_colors(3, color=c)
+ result = get_standard_colors(3, color=c)
assert result == [c] * 3
- result = _get_standard_colors(3, color=[c])
+ result = get_standard_colors(3, color=[c])
assert result == [c] * 3
@pytest.mark.slow
def test_standard_colors_all(self):
import matplotlib.colors as colors
- from pandas.plotting._matplotlib.style import _get_standard_colors
+
+ from pandas.plotting._matplotlib.style import get_standard_colors
# multiple colors like mediumaquamarine
for c in colors.cnames:
- result = _get_standard_colors(num_colors=1, color=c)
+ result = get_standard_colors(num_colors=1, color=c)
assert result == [c]
- result = _get_standard_colors(num_colors=1, color=[c])
+ result = get_standard_colors(num_colors=1, color=[c])
assert result == [c]
- result = _get_standard_colors(num_colors=3, color=c)
+ result = get_standard_colors(num_colors=3, color=c)
assert result == [c] * 3
- result = _get_standard_colors(num_colors=3, color=[c])
+ result = get_standard_colors(num_colors=3, color=[c])
assert result == [c] * 3
# single letter colors like k
for c in colors.ColorConverter.colors:
- result = _get_standard_colors(num_colors=1, color=c)
+ result = get_standard_colors(num_colors=1, color=c)
assert result == [c]
- result = _get_standard_colors(num_colors=1, color=[c])
+ result = get_standard_colors(num_colors=1, color=[c])
assert result == [c]
- result = _get_standard_colors(num_colors=3, color=c)
+ result = get_standard_colors(num_colors=3, color=c)
assert result == [c] * 3
- result = _get_standard_colors(num_colors=3, color=[c])
+ result = get_standard_colors(num_colors=3, color=[c])
assert result == [c] * 3
def test_series_plot_color_kwargs(self):
@@ -933,7 +963,7 @@ def test_plot_no_numeric_data(self):
def test_style_single_ok(self):
s = pd.Series([1, 2])
ax = s.plot(style="s", color="C3")
- assert ax.lines[0].get_color() == ["C3"]
+ assert ax.lines[0].get_color() == "C3"
@pytest.mark.parametrize(
"index_name, old_label, new_label",
diff --git a/pandas/tests/reductions/test_reductions.py b/pandas/tests/reductions/test_reductions.py
index a112bc80b60b0..bbf2d9f1f0784 100644
--- a/pandas/tests/reductions/test_reductions.py
+++ b/pandas/tests/reductions/test_reductions.py
@@ -914,6 +914,13 @@ def test_all_any_boolean(self):
tm.assert_series_equal(s.all(level=0), Series([False, True, False]))
tm.assert_series_equal(s.any(level=0), Series([False, True, True]))
+ def test_any_axis1_bool_only(self):
+ # GH#32432
+ df = pd.DataFrame({"A": [True, False], "B": [1, 2]})
+ result = df.any(axis=1, bool_only=True)
+ expected = pd.Series([True, False])
+ tm.assert_series_equal(result, expected)
+
def test_timedelta64_analytics(self):
# index min/max
diff --git a/pandas/tests/resample/test_datetime_index.py b/pandas/tests/resample/test_datetime_index.py
index e7637a598403f..9475dcc6981ff 100644
--- a/pandas/tests/resample/test_datetime_index.py
+++ b/pandas/tests/resample/test_datetime_index.py
@@ -124,7 +124,7 @@ def test_resample_integerarray():
result = ts.resample("3T").mean()
expected = Series(
- [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64",
+ [1, 4, 7], index=pd.date_range("1/1/2000", periods=3, freq="3T"), dtype="Int64"
)
tm.assert_series_equal(result, expected)
@@ -764,7 +764,7 @@ def test_resample_origin():
@pytest.mark.parametrize(
- "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()],
+ "origin", ["invalid_value", "epch", "startday", "startt", "2000-30-30", object()]
)
def test_resample_bad_origin(origin):
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s")
@@ -777,9 +777,7 @@ def test_resample_bad_origin(origin):
ts.resample("5min", origin=origin)
-@pytest.mark.parametrize(
- "offset", ["invalid_value", "12dayys", "2000-30-30", object()],
-)
+@pytest.mark.parametrize("offset", ["invalid_value", "12dayys", "2000-30-30", object()])
def test_resample_bad_offset(offset):
rng = date_range("2000-01-01 00:00:00", "2000-01-01 02:00", freq="s")
ts = Series(np.random.randn(len(rng)), index=rng)
@@ -1595,7 +1593,7 @@ def test_downsample_dst_at_midnight():
"America/Havana", ambiguous=True
)
dti = pd.DatetimeIndex(dti, freq="D")
- expected = DataFrame([7.5, 28.0, 44.5], index=dti,)
+ expected = DataFrame([7.5, 28.0, 44.5], index=dti)
tm.assert_frame_equal(result, expected)
@@ -1742,3 +1740,50 @@ def test_resample_apply_product():
columns=["A", "B"],
)
tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "first,last,freq_in,freq_out,exp_last",
+ [
+ (
+ "2020-03-28",
+ "2020-03-31",
+ "D",
+ "24H",
+ "2020-03-30 01:00",
+ ), # includes transition into DST
+ (
+ "2020-03-28",
+ "2020-10-27",
+ "D",
+ "24H",
+ "2020-10-27 00:00",
+ ), # includes transition into and out of DST
+ (
+ "2020-10-25",
+ "2020-10-27",
+ "D",
+ "24H",
+ "2020-10-26 23:00",
+ ), # includes transition out of DST
+ (
+ "2020-03-28",
+ "2020-03-31",
+ "24H",
+ "D",
+ "2020-03-30 00:00",
+ ), # same as above, but from 24H to D
+ ("2020-03-28", "2020-10-27", "24H", "D", "2020-10-27 00:00"),
+ ("2020-10-25", "2020-10-27", "24H", "D", "2020-10-26 00:00"),
+ ],
+)
+def test_resample_calendar_day_with_dst(
+ first: str, last: str, freq_in: str, freq_out: str, exp_last: str
+):
+ # GH 35219
+ ts = pd.Series(1.0, pd.date_range(first, last, freq=freq_in, tz="Europe/Amsterdam"))
+ result = ts.resample(freq_out).pad()
+ expected = pd.Series(
+ 1.0, pd.date_range(first, exp_last, freq=freq_out, tz="Europe/Amsterdam")
+ )
+ tm.assert_series_equal(result, expected)
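
The parametrized cases above (GH 35219) pin down how "D" and "24H" bins diverge around DST transitions. A sketch of the first case, assuming the Europe/Amsterdam spring transition where one calendar day is only 23 hours long:

```python
import pandas as pd

ts = pd.Series(
    1.0,
    index=pd.date_range(
        "2020-03-28", "2020-03-31", freq="D", tz="Europe/Amsterdam"
    ),
)
# 24H bins are fixed-length, so after the hour lost on 2020-03-29 the last
# bin edge lands at 01:00 local time rather than midnight.
resampled = ts.resample("24H").pad()
print(resampled.index[-1])  # 2020-03-30 01:00:00+02:00
```
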
diff --git a/pandas/tests/resample/test_resampler_grouper.py b/pandas/tests/resample/test_resampler_grouper.py
index cbf3a778f9ae0..73bf7dafac254 100644
--- a/pandas/tests/resample/test_resampler_grouper.py
+++ b/pandas/tests/resample/test_resampler_grouper.py
@@ -3,6 +3,7 @@
import numpy as np
import pytest
+import pandas.util._test_decorators as td
from pandas.util._test_decorators import async_mark
import pandas as pd
@@ -17,6 +18,7 @@
@async_mark()
+@td.check_file_leaks
async def test_tab_complete_ipython6_warning(ip):
from IPython.core.completer import provisionalcompleter
diff --git a/pandas/tests/resample/test_time_grouper.py b/pandas/tests/resample/test_time_grouper.py
index 26e429c47b494..f638706207679 100644
--- a/pandas/tests/resample/test_time_grouper.py
+++ b/pandas/tests/resample/test_time_grouper.py
@@ -287,3 +287,65 @@ def test_upsample_sum(method, method_args, expected_values):
result = methodcaller(method, **method_args)(resampled)
expected = pd.Series(expected_values, index=index)
tm.assert_series_equal(result, expected)
+
+
+def test_groupby_resample_interpolate():
+ # GH 35325
+ d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
+
+ df = pd.DataFrame(d)
+
+ df["week_starting"] = pd.date_range("01/01/2018", periods=3, freq="W")
+
+ result = (
+ df.set_index("week_starting")
+ .groupby("volume")
+ .resample("1D")
+ .interpolate(method="linear")
+ )
+ expected_ind = pd.MultiIndex.from_tuples(
+ [
+ (50, "2018-01-07"),
+ (50, pd.Timestamp("2018-01-08")),
+ (50, pd.Timestamp("2018-01-09")),
+ (50, pd.Timestamp("2018-01-10")),
+ (50, pd.Timestamp("2018-01-11")),
+ (50, pd.Timestamp("2018-01-12")),
+ (50, pd.Timestamp("2018-01-13")),
+ (50, pd.Timestamp("2018-01-14")),
+ (50, pd.Timestamp("2018-01-15")),
+ (50, pd.Timestamp("2018-01-16")),
+ (50, pd.Timestamp("2018-01-17")),
+ (50, pd.Timestamp("2018-01-18")),
+ (50, pd.Timestamp("2018-01-19")),
+ (50, pd.Timestamp("2018-01-20")),
+ (50, pd.Timestamp("2018-01-21")),
+ (60, pd.Timestamp("2018-01-14")),
+ ],
+ names=["volume", "week_starting"],
+ )
+ expected = pd.DataFrame(
+ data={
+ "price": [
+ 10.0,
+ 9.928571428571429,
+ 9.857142857142858,
+ 9.785714285714286,
+ 9.714285714285714,
+ 9.642857142857142,
+ 9.571428571428571,
+ 9.5,
+ 9.428571428571429,
+ 9.357142857142858,
+ 9.285714285714286,
+ 9.214285714285714,
+ 9.142857142857142,
+ 9.071428571428571,
+ 9.0,
+ 11.0,
+ ],
+ "volume": [50.0] * 15 + [60],
+ },
+ index=expected_ind,
+ )
+ tm.assert_frame_equal(result, expected)
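
The long expected frame above exercises GH 35325: interpolation now works through a groupby-resample chain. The essence, condensed:

```python
import pandas as pd

df = pd.DataFrame(
    {"price": [10, 11, 9], "volume": [50, 60, 50]},
    index=pd.date_range("2018-01-07", periods=3, freq="W", name="week_starting"),
)
# Upsample each volume group to daily frequency and fill the gaps linearly;
# the result carries a (volume, week_starting) MultiIndex.
result = df.groupby("volume").resample("1D").interpolate(method="linear")
```
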
diff --git a/pandas/tests/resample/test_timedelta.py b/pandas/tests/resample/test_timedelta.py
index 0fbb60c176b30..3fa85e62d028c 100644
--- a/pandas/tests/resample/test_timedelta.py
+++ b/pandas/tests/resample/test_timedelta.py
@@ -150,3 +150,18 @@ def test_resample_timedelta_edge_case(start, end, freq, resample_freq):
tm.assert_index_equal(result.index, expected_index)
assert result.index.freq == expected_index.freq
assert not np.isnan(result[-1])
+
+
+def test_resample_with_timedelta_yields_no_empty_groups():
+ # GH 10603
+ df = pd.DataFrame(
+ np.random.normal(size=(10000, 4)),
+ index=pd.timedelta_range(start="0s", periods=10000, freq="3906250n"),
+ )
+ result = df.loc["1s":, :].resample("3s").apply(lambda x: len(x))
+
+ expected = pd.DataFrame(
+ [[768.0] * 4] * 12 + [[528.0] * 4],
+ index=pd.timedelta_range(start="1s", periods=13, freq="3s"),
+ )
+ tm.assert_frame_equal(result, expected)
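
The new TimedeltaIndex test (GH 10603) checks that resampling a sliced frame starts its bins at the slice boundary instead of emitting empty leading groups. A smaller sketch of the same idea:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame(
    np.ones((8, 1)),
    index=pd.timedelta_range(start="0s", periods=8, freq="1s"),
)
# Slice off the first second, then count rows per 3-second bin; the first
# bin begins at "1s" and no empty group is produced before it.
counts = df.loc["1s":, :].resample("3s").apply(lambda x: len(x))
```
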
diff --git a/pandas/tests/reshape/merge/test_join.py b/pandas/tests/reshape/merge/test_join.py
index c33443e24b268..d4d4c4190417e 100644
--- a/pandas/tests/reshape/merge/test_join.py
+++ b/pandas/tests/reshape/merge/test_join.py
@@ -2,7 +2,7 @@
from numpy.random import randn
import pytest
-from pandas._libs import join as libjoin
+from pandas._libs.join import inner_join, left_outer_join
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, concat, merge
@@ -48,7 +48,7 @@ def test_cython_left_outer_join(self):
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
max_group = 5
- ls, rs = libjoin.left_outer_join(left, right, max_group)
+ ls, rs = left_outer_join(left, right, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
@@ -70,7 +70,7 @@ def test_cython_right_outer_join(self):
right = a_([1, 1, 0, 4, 2, 2, 1], dtype=np.int64)
max_group = 5
- rs, ls = libjoin.left_outer_join(right, left, max_group)
+ rs, ls = left_outer_join(right, left, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
@@ -116,7 +116,7 @@ def test_cython_inner_join(self):
right = a_([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.int64)
max_group = 5
- ls, rs = libjoin.inner_join(left, right, max_group)
+ ls, rs = inner_join(left, right, max_group)
exp_ls = left.argsort(kind="mergesort")
exp_rs = right.argsort(kind="mergesort")
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 0a4d5f17a48cc..4fd3c688b8771 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -1999,6 +1999,7 @@ def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
(0, 0, dict(suffixes=("", "_dup")), ["0", "0_dup"]),
(0, 0, dict(suffixes=(None, "_dup")), [0, "0_dup"]),
(0, 0, dict(suffixes=("_x", "_y")), ["0_x", "0_y"]),
+ (0, 0, dict(suffixes=["_x", "_y"]), ["0_x", "0_y"]),
("a", 0, dict(suffixes=(None, "_y")), ["a", 0]),
(0.0, 0.0, dict(suffixes=("_x", None)), ["0.0_x", 0.0]),
("b", "b", dict(suffixes=(None, "_y")), ["b", "b_y"]),
@@ -2069,18 +2070,13 @@ def test_merge_suffix_error(col1, col2, suffixes):
pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
-@pytest.mark.parametrize(
- "col1, col2, suffixes", [("a", "a", {"a", "b"}), ("a", "a", None), (0, 0, None)],
-)
-def test_merge_suffix_type_error(col1, col2, suffixes):
- a = pd.DataFrame({col1: [1, 2, 3]})
- b = pd.DataFrame({col2: [3, 4, 5]})
+@pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
+def test_merge_suffix_warns(suffixes):
+ a = pd.DataFrame({"a": [1, 2, 3]})
+ b = pd.DataFrame({"b": [3, 4, 5]})
- msg = (
- f"suffixes should be tuple of \\(str, str\\). But got {type(suffixes).__name__}"
- )
- with pytest.raises(TypeError, match=msg):
- pd.merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
+ with tm.assert_produces_warning(FutureWarning):
+ pd.merge(a, b, left_index=True, right_index=True, suffixes={"left", "right"})
@pytest.mark.parametrize(
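
The replacement test documents a behavior change: set or dict `suffixes`, previously a TypeError, now only warn ahead of a deprecation. Roughly, under the pandas version this diff targets:

```python
import warnings

import pandas as pd

a = pd.DataFrame({"a": [1, 2, 3]})
b = pd.DataFrame({"a": [3, 4, 5]})

# Tuples (and now lists, per the added parametrized case) stay supported.
pd.merge(a, b, left_index=True, right_index=True, suffixes=("_x", "_y"))

# A set is unordered, so which frame gets which suffix is ambiguous; this
# now emits a FutureWarning instead of raising.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.merge(a, b, left_index=True, right_index=True, suffixes={"_x", "_y"})
assert any(issubclass(w.category, FutureWarning) for w in caught)
```
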
diff --git a/pandas/tests/reshape/merge/test_merge_asof.py b/pandas/tests/reshape/merge/test_merge_asof.py
index 9b09f0033715d..895de2b748c34 100644
--- a/pandas/tests/reshape/merge/test_merge_asof.py
+++ b/pandas/tests/reshape/merge/test_merge_asof.py
@@ -1339,3 +1339,25 @@ def test_merge_index_column_tz(self):
index=pd.Index([0, 1, 2, 3, 4]),
)
tm.assert_frame_equal(result, expected)
+
+ def test_left_index_right_index_tolerance(self):
+ # https://github.com/pandas-dev/pandas/issues/35558
+ dr1 = pd.date_range(
+ start="1/1/2020", end="1/20/2020", freq="2D"
+ ) + pd.Timedelta(seconds=0.4)
+ dr2 = pd.date_range(start="1/1/2020", end="2/1/2020")
+
+ df1 = pd.DataFrame({"val1": "foo"}, index=pd.DatetimeIndex(dr1))
+ df2 = pd.DataFrame({"val2": "bar"}, index=pd.DatetimeIndex(dr2))
+
+ expected = pd.DataFrame(
+ {"val1": "foo", "val2": "bar"}, index=pd.DatetimeIndex(dr1)
+ )
+ result = pd.merge_asof(
+ df1,
+ df2,
+ left_index=True,
+ right_index=True,
+ tolerance=pd.Timedelta(seconds=0.5),
+ )
+ tm.assert_frame_equal(result, expected)
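
The added test covers GH 35558: `merge_asof` with `left_index`/`right_index` now honors `tolerance`. Condensed:

```python
import pandas as pd

left = pd.DataFrame(
    {"val1": "foo"},
    index=pd.date_range("2020-01-01", periods=3, freq="2D")
    + pd.Timedelta(seconds=0.4),
)
right = pd.DataFrame({"val2": "bar"}, index=pd.date_range("2020-01-01", periods=6))

# Each left timestamp (offset by 0.4s) matches the most recent right
# timestamp within the 0.5s tolerance.
result = pd.merge_asof(
    left, right, left_index=True, right_index=True,
    tolerance=pd.Timedelta(seconds=0.5),
)
```
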
diff --git a/pandas/tests/reshape/merge/test_merge_index_as_string.py b/pandas/tests/reshape/merge/test_merge_index_as_string.py
index 08614d04caf4b..d20d93370ec7e 100644
--- a/pandas/tests/reshape/merge/test_merge_index_as_string.py
+++ b/pandas/tests/reshape/merge/test_merge_index_as_string.py
@@ -29,7 +29,7 @@ def df2():
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def left_df(request, df1):
- """ Construct left test DataFrame with specified levels
+ """Construct left test DataFrame with specified levels
(any of 'outer', 'inner', and 'v1')
"""
levels = request.param
@@ -41,7 +41,7 @@ def left_df(request, df1):
@pytest.fixture(params=[[], ["outer"], ["outer", "inner"]])
def right_df(request, df2):
- """ Construct right test DataFrame with specified levels
+ """Construct right test DataFrame with specified levels
(any of 'outer', 'inner', and 'v2')
"""
levels = request.param
diff --git a/pandas/tests/reshape/test_concat.py b/pandas/tests/reshape/test_concat.py
index ffeb5ff0f8aaa..7d6611722d8b5 100644
--- a/pandas/tests/reshape/test_concat.py
+++ b/pandas/tests/reshape/test_concat.py
@@ -1087,20 +1087,44 @@ def test_append_empty_frame_to_series_with_dateutil_tz(self):
date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
s = Series({"date": date, "a": 1.0, "b": 2.0})
df = DataFrame(columns=["c", "d"])
- result = df.append(s, ignore_index=True)
- # n.b. it's not clear to me that expected is correct here.
- # It's possible that the `date` column should have
- # datetime64[ns, tz] dtype for both result and expected.
- # that would be more consistent with new columns having
- # their own dtype (float for a and b, datetime64ns, tz for date).
+ result_a = df.append(s, ignore_index=True)
expected = DataFrame(
- [[np.nan, np.nan, 1.0, 2.0, date]],
- columns=["c", "d", "a", "b", "date"],
- dtype=object,
+ [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
)
# These columns get cast to object after append
- expected["a"] = expected["a"].astype(float)
- expected["b"] = expected["b"].astype(float)
+ expected["c"] = expected["c"].astype(object)
+ expected["d"] = expected["d"].astype(object)
+ tm.assert_frame_equal(result_a, expected)
+
+ expected = DataFrame(
+ [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
+ )
+ expected["c"] = expected["c"].astype(object)
+ expected["d"] = expected["d"].astype(object)
+
+ result_b = result_a.append(s, ignore_index=True)
+ tm.assert_frame_equal(result_b, expected)
+
+ # column order is different
+ expected = expected[["c", "d", "date", "a", "b"]]
+ result = df.append([s, s], ignore_index=True)
+ tm.assert_frame_equal(result, expected)
+
+ def test_append_empty_tz_frame_with_datetime64ns(self):
+ # https://github.com/pandas-dev/pandas/issues/35460
+ df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
+
+ # pd.NaT gets inferred as tz-naive, so append result is tz-naive
+ result = df.append({"a": pd.NaT}, ignore_index=True)
+ expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
+ tm.assert_frame_equal(result, expected)
+
+ # also test with typed value to append
+ df = pd.DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
+ result = df.append(
+ pd.Series({"a": pd.NaT}, dtype="datetime64[ns]"), ignore_index=True
+ )
+ expected = pd.DataFrame({"a": [pd.NaT]}).astype("datetime64[ns]")
tm.assert_frame_equal(result, expected)
@@ -1272,6 +1296,43 @@ def test_concat_ignore_index(self, sort):
tm.assert_frame_equal(v1, expected)
+ @pytest.mark.parametrize(
+ "name_in1,name_in2,name_in3,name_out",
+ [
+ ("idx", "idx", "idx", "idx"),
+ ("idx", "idx", None, "idx"),
+ ("idx", None, None, "idx"),
+ ("idx1", "idx2", None, None),
+ ("idx1", "idx1", "idx2", None),
+ ("idx1", "idx2", "idx3", None),
+ (None, None, None, None),
+ ],
+ )
+ def test_concat_same_index_names(self, name_in1, name_in2, name_in3, name_out):
+ # GH13475
+ indices = [
+ pd.Index(["a", "b", "c"], name=name_in1),
+ pd.Index(["b", "c", "d"], name=name_in2),
+ pd.Index(["c", "d", "e"], name=name_in3),
+ ]
+ frames = [
+ pd.DataFrame({c: [0, 1, 2]}, index=i)
+ for i, c in zip(indices, ["x", "y", "z"])
+ ]
+ result = pd.concat(frames, axis=1)
+
+ exp_ind = pd.Index(["a", "b", "c", "d", "e"], name=name_out)
+ expected = pd.DataFrame(
+ {
+ "x": [0, 1, 2, np.nan, np.nan],
+ "y": [np.nan, 0, 1, 2, np.nan],
+ "z": [np.nan, np.nan, 0, 1, 2],
+ },
+ index=exp_ind,
+ )
+
+ tm.assert_frame_equal(result, expected)
+
def test_concat_multiindex_with_keys(self):
index = MultiIndex(
levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
@@ -2857,3 +2918,12 @@ def test_concat_frame_axis0_extension_dtypes():
result = pd.concat([df2, df1], ignore_index=True)
expected = pd.DataFrame({"a": [4, 5, 6, 1, 2, 3]}, dtype="Int64")
tm.assert_frame_equal(result, expected)
+
+
+def test_concat_preserves_extension_int64_dtype():
+ # GH 24768
+ df_a = pd.DataFrame({"a": [-1]}, dtype="Int64")
+ df_b = pd.DataFrame({"b": [1]}, dtype="Int64")
+ result = pd.concat([df_a, df_b], ignore_index=True)
+ expected = pd.DataFrame({"a": [-1, None], "b": [None, 1]}, dtype="Int64")
+ tm.assert_frame_equal(result, expected)
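
The final test above (GH 24768) pins the nullable-integer path through `concat`. In short:

```python
import pandas as pd

df_a = pd.DataFrame({"a": [-1]}, dtype="Int64")
df_b = pd.DataFrame({"b": [1]}, dtype="Int64")

# Disjoint columns introduce missing values, but the extension Int64 dtype
# is kept (filled with pd.NA) rather than upcast to float64.
result = pd.concat([df_a, df_b], ignore_index=True)
print(result.dtypes)  # both columns stay Int64
```
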
diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py
index 8795af2e11122..1aadcfdc30f1b 100644
--- a/pandas/tests/reshape/test_crosstab.py
+++ b/pandas/tests/reshape/test_crosstab.py
@@ -354,7 +354,7 @@ def test_crosstab_normalize(self):
crosstab(df.a, df.b, normalize="columns"),
)
tm.assert_frame_equal(
- crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index"),
+ crosstab(df.a, df.b, normalize=0), crosstab(df.a, df.b, normalize="index")
)
row_normal_margins = DataFrame(
@@ -377,7 +377,7 @@ def test_crosstab_normalize(self):
crosstab(df.a, df.b, normalize="index", margins=True), row_normal_margins
)
tm.assert_frame_equal(
- crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins,
+ crosstab(df.a, df.b, normalize="columns", margins=True), col_normal_margins
)
tm.assert_frame_equal(
crosstab(df.a, df.b, normalize=True, margins=True), all_normal_margins
@@ -698,3 +698,48 @@ def test_margin_normalize(self):
names=["A", "B"],
)
tm.assert_frame_equal(result, expected)
+
+ def test_margin_normalize_multiple_columns(self):
+ # GH 35144
+ # use multiple columns with margins and normalization
+ df = DataFrame(
+ {
+ "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
+ "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
+ "C": [
+ "small",
+ "large",
+ "large",
+ "small",
+ "small",
+ "large",
+ "small",
+ "small",
+ "large",
+ ],
+ "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
+ "E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
+ }
+ )
+ result = crosstab(
+ index=df.C,
+ columns=[df.A, df.B],
+ margins=True,
+ margins_name="margin",
+ normalize=True,
+ )
+ expected = DataFrame(
+ [
+ [0.111111, 0.111111, 0.222222, 0.000000, 0.444444],
+ [0.111111, 0.111111, 0.111111, 0.222222, 0.555556],
+ [0.222222, 0.222222, 0.333333, 0.222222, 1.0],
+ ],
+ index=["large", "small", "margin"],
+ )
+ expected.columns = MultiIndex(
+ levels=[["bar", "foo", "margin"], ["", "one", "two"]],
+ codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]],
+ names=["A", "B"],
+ )
+ expected.index.name = "C"
+ tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index c003bfa6a239a..82e0e52c089a2 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -161,7 +161,7 @@ def test_get_dummies_unicode(self, sparse):
s = [e, eacute, eacute]
res = get_dummies(s, prefix="letter", sparse=sparse)
exp = DataFrame(
- {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8,
+ {"letter_e": [1, 0, 0], f"letter_{eacute}": [0, 1, 1]}, dtype=np.uint8
)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)
@@ -386,7 +386,7 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
"get_dummies_kwargs,expected",
[
(
- {"data": DataFrame(({"ä": ["a"]}))},
+ {"data": DataFrame({"ä": ["a"]})},
DataFrame({"ä_a": [1]}, dtype=np.uint8),
),
(
diff --git a/pandas/tests/reshape/test_melt.py b/pandas/tests/reshape/test_melt.py
index 000a6354277ab..79879ef346f53 100644
--- a/pandas/tests/reshape/test_melt.py
+++ b/pandas/tests/reshape/test_melt.py
@@ -357,6 +357,47 @@ def test_melt_mixed_int_str_value_vars(self):
expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]})
tm.assert_frame_equal(result, expected)
+ def test_ignore_index(self):
+ # GH 17440
+ df = DataFrame({"foo": [0], "bar": [1]}, index=["first"])
+ result = melt(df, ignore_index=False)
+ expected = DataFrame(
+ {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"]
+ )
+ tm.assert_frame_equal(result, expected)
+
+ def test_ignore_multiindex(self):
+ # GH 17440
+ index = pd.MultiIndex.from_tuples(
+ [("first", "second"), ("first", "third")], names=["baz", "foobar"]
+ )
+ df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index)
+ result = melt(df, ignore_index=False)
+
+ expected_index = pd.MultiIndex.from_tuples(
+ [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"]
+ )
+ expected = DataFrame(
+ {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]},
+ index=expected_index,
+ )
+
+ tm.assert_frame_equal(result, expected)
+
+ def test_ignore_index_name_and_type(self):
+ # GH 17440
+ index = pd.Index(["foo", "bar"], dtype="category", name="baz")
+ df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index)
+ result = melt(df, ignore_index=False)
+
+ expected_index = pd.Index(["foo", "bar"] * 2, dtype="category", name="baz")
+ expected = DataFrame(
+ {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]},
+ index=expected_index,
+ )
+
+ tm.assert_frame_equal(result, expected)
+
class TestLreshape:
def test_pairs(self):
@@ -758,7 +799,7 @@ def test_invalid_separator(self):
expected = expected.set_index(["id", "year"])[
["X", "A2010", "A2011", "B2010", "A", "B"]
]
- expected.index.set_levels([0, 1], level=0, inplace=True)
+ expected.index = expected.index.set_levels([0, 1], level=0)
result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
@@ -820,7 +861,7 @@ def test_invalid_suffixtype(self):
expected = pd.DataFrame(exp_data).astype({"year": "int"})
expected = expected.set_index(["id", "year"])
- expected.index.set_levels([0, 1], level=0, inplace=True)
+ expected.index = expected.index.set_levels([0, 1], level=0)
result = wide_to_long(df, ["A", "B"], i="id", j="year")
tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
@@ -1014,3 +1055,17 @@ def test_col_substring_of_stubname(self):
)
result = pd.wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
tm.assert_frame_equal(result, expected)
+
+ def test_warn_of_column_name_value(self):
+ # GH34731
+ # raise a warning if the resultant value column name matches
+ # a name in the dataframe already (default name is "value")
+ df = pd.DataFrame({"col": list("ABC"), "value": range(10, 16, 2)})
+ expected = pd.DataFrame(
+ [["A", "col", "A"], ["B", "col", "B"], ["C", "col", "C"]],
+ columns=["value", "variable", "value"],
+ )
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = df.melt(id_vars="value")
+ tm.assert_frame_equal(result, expected)
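
The `ignore_index` tests added above correspond to the new `melt` keyword from GH 17440. Its effect in isolation:

```python
import pandas as pd

df = pd.DataFrame({"foo": [0], "bar": [1]}, index=["first"])

# The default resets the index; ignore_index=False instead repeats the
# original labels once per melted column.
result = pd.melt(df, ignore_index=False)
print(result.index)  # Index(['first', 'first'], dtype='object')
```
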
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index c07a5673fe503..67b3151b0ff9c 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1817,7 +1817,7 @@ def test_categorical_aggfunc(self, observed):
["A", "B", "C"], categories=["A", "B", "C"], ordered=False, name="C1"
)
expected_columns = pd.Index(["a", "b"], name="C2")
- expected_data = np.array([[1.0, np.nan], [1.0, np.nan], [np.nan, 2.0]])
+ expected_data = np.array([[1, 0], [1, 0], [0, 2]], dtype=np.int64)
expected = pd.DataFrame(
expected_data, index=expected_index, columns=expected_columns
)
@@ -1851,18 +1851,19 @@ def test_categorical_pivot_index_ordering(self, observed):
values="Sales",
index="Month",
columns="Year",
- dropna=observed,
+ observed=observed,
aggfunc="sum",
)
expected_columns = pd.Int64Index([2013, 2014], name="Year")
expected_index = pd.CategoricalIndex(
- ["January"], categories=months, ordered=False, name="Month"
+ months, categories=months, ordered=False, name="Month"
)
+ expected_data = [[320, 120]] + [[0, 0]] * 11
expected = pd.DataFrame(
- [[320, 120]], index=expected_index, columns=expected_columns
+ expected_data, index=expected_index, columns=expected_columns
)
- if not observed:
- result = result.dropna().astype(np.int64)
+ if observed:
+ expected = expected.loc[["January"]]
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/reshape/test_util.py b/pandas/tests/reshape/test_util.py
index 9d074b5ade425..0acadc54cec0c 100644
--- a/pandas/tests/reshape/test_util.py
+++ b/pandas/tests/reshape/test_util.py
@@ -65,3 +65,13 @@ def test_invalid_input(self, X):
with pytest.raises(TypeError, match=msg):
cartesian_product(X=X)
+
+ def test_exceed_product_space(self):
+ # GH31355: raise useful error when product space is too large
+ msg = "Product space too large to allocate arrays!"
+
+ with pytest.raises(ValueError, match=msg):
+ dims = [np.arange(0, 22, dtype=np.int16) for i in range(12)] + [
+ (np.arange(15128, dtype=np.int16)),
+ ]
+ cartesian_product(X=dims)
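
`cartesian_product` (GH 31355) now guards against index overflow before allocating. A sketch, assuming the internal location `pandas.core.reshape.util` imported by the test module:

```python
import numpy as np
from pandas.core.reshape.util import cartesian_product

# Normal use: expand two 1-D arrays into their full cross product.
x, y = cartesian_product([np.array(["a", "b"]), np.array([1, 2])])
# x -> ['a' 'a' 'b' 'b'], y -> [1 2 1 2]

# If the product of the input lengths cannot be represented, the function
# raises ValueError("Product space too large to allocate arrays!") up front
# instead of failing with an opaque allocation error.
```
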
diff --git a/pandas/tests/scalar/interval/test_arithmetic.py b/pandas/tests/scalar/interval/test_arithmetic.py
index 5252f1a4d5a24..b4c2b448e252a 100644
--- a/pandas/tests/scalar/interval/test_arithmetic.py
+++ b/pandas/tests/scalar/interval/test_arithmetic.py
@@ -45,3 +45,15 @@ def test_numeric_interval_add_timedelta_raises(interval, delta):
with pytest.raises((TypeError, ValueError), match=msg):
delta + interval
+
+
+@pytest.mark.parametrize("klass", [timedelta, np.timedelta64, Timedelta])
+def test_timdelta_add_timestamp_interval(klass):
+ delta = klass(0)
+ expected = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01"))
+
+ result = delta + expected
+ assert result == expected
+
+ result = expected + delta
+ assert result == expected
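
The new parametrized test asserts that `timedelta`-like objects can be added to an `Interval` of `Timestamp`s from either side. A brief sketch; the nonzero shift is an extrapolation from the zero-delta case actually tested:

```python
from datetime import timedelta

from pandas import Interval, Timestamp

iv = Interval(Timestamp("2020-01-01"), Timestamp("2020-02-01"))

# A zero shift from either side leaves the interval unchanged.
assert timedelta(0) + iv == iv
assert iv + timedelta(0) == iv

# A nonzero timedelta moves both endpoints together.
assert iv + timedelta(days=1) == Interval(
    Timestamp("2020-01-02"), Timestamp("2020-02-02")
)
```
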
diff --git a/pandas/tests/scalar/interval/test_interval.py b/pandas/tests/scalar/interval/test_interval.py
index a0151bb9ac7bf..8ad9a2c7a9c70 100644
--- a/pandas/tests/scalar/interval/test_interval.py
+++ b/pandas/tests/scalar/interval/test_interval.py
@@ -2,6 +2,7 @@
import pytest
from pandas import Interval, Period, Timedelta, Timestamp
+import pandas._testing as tm
import pandas.core.common as com
@@ -267,3 +268,11 @@ def test_constructor_errors_tz(self, tz_left, tz_right):
msg = "left and right must have the same time zone"
with pytest.raises(error, match=msg):
Interval(left, right)
+
+ def test_equality_comparison_broadcasts_over_array(self):
+ # https://github.com/pandas-dev/pandas/issues/35931
+ interval = Interval(0, 1)
+ arr = np.array([interval, interval])
+ result = interval == arr
+ expected = np.array([True, True])
+ tm.assert_numpy_array_equal(result, expected)
diff --git a/pandas/tests/scalar/test_na_scalar.py b/pandas/tests/scalar/test_na_scalar.py
index dc5eb15348c1b..5c4d7e191d1bb 100644
--- a/pandas/tests/scalar/test_na_scalar.py
+++ b/pandas/tests/scalar/test_na_scalar.py
@@ -28,9 +28,9 @@ def test_format():
assert format(NA, ">10") == " "
assert format(NA, "xxx") == "" # NA is flexible, accept any format spec
- assert "{}".format(NA) == ""
- assert "{:>10}".format(NA) == " "
- assert "{:xxx}".format(NA) == ""
+ assert f"{NA}" == ""
+ assert f"{NA:>10}" == " "
+ assert f"{NA:xxx}" == ""
def test_truthiness():
@@ -111,7 +111,7 @@ def test_pow_special(value, asarray):
@pytest.mark.parametrize(
- "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)],
+ "value", [1, 1.0, True, np.bool_(True), np.int_(1), np.float_(1)]
)
@pytest.mark.parametrize("asarray", [True, False])
def test_rpow_special(value, asarray):
@@ -128,9 +128,7 @@ def test_rpow_special(value, asarray):
assert result == value
-@pytest.mark.parametrize(
- "value", [-1, -1.0, np.int_(-1), np.float_(-1)],
-)
+@pytest.mark.parametrize("value", [-1, -1.0, np.int_(-1), np.float_(-1)])
@pytest.mark.parametrize("asarray", [True, False])
def test_rpow_minus_one(value, asarray):
if asarray:
@@ -193,9 +191,7 @@ def test_logical_not():
assert ~NA is NA
-@pytest.mark.parametrize(
- "shape", [(3,), (3, 3), (1, 2, 3)],
-)
+@pytest.mark.parametrize("shape", [(3,), (3, 3), (1, 2, 3)])
def test_arithmetic_ndarray(shape, all_arithmetic_functions):
op = all_arithmetic_functions
a = np.zeros(shape)
diff --git a/pandas/tests/scalar/test_nat.py b/pandas/tests/scalar/test_nat.py
index e1e2ea1a5cec8..09d5d9c1677d0 100644
--- a/pandas/tests/scalar/test_nat.py
+++ b/pandas/tests/scalar/test_nat.py
@@ -308,10 +308,6 @@ def test_overlap_public_nat_methods(klass, expected):
# In cases where Timestamp, Timedelta, and NaT overlap, the overlap
# is considered to be with Timestamp and NaT, not Timedelta.
- # "fromisoformat" was introduced in 3.7
- if klass is Timestamp and not compat.PY37:
- expected.remove("fromisoformat")
-
# "fromisocalendar" was introduced in 3.8
if klass is Timestamp and not compat.PY38:
expected.remove("fromisocalendar")
@@ -513,11 +509,67 @@ def test_to_numpy_alias():
assert isna(expected) and isna(result)
-@pytest.mark.parametrize("other", [Timedelta(0), Timestamp(0)])
+@pytest.mark.parametrize(
+ "other",
+ [
+ Timedelta(0),
+ Timedelta(0).to_pytimedelta(),
+ pytest.param(
+ Timedelta(0).to_timedelta64(),
+ marks=pytest.mark.xfail(
+ reason="td64 doesnt return NotImplemented, see numpy#17017"
+ ),
+ ),
+ Timestamp(0),
+ Timestamp(0).to_pydatetime(),
+ pytest.param(
+ Timestamp(0).to_datetime64(),
+ marks=pytest.mark.xfail(
+ reason="dt64 doesnt return NotImplemented, see numpy#17017"
+ ),
+ ),
+ Timestamp(0).tz_localize("UTC"),
+ NaT,
+ ],
+)
def test_nat_comparisons(compare_operators_no_eq_ne, other):
# GH 26039
- assert getattr(NaT, compare_operators_no_eq_ne)(other) is False
- assert getattr(other, compare_operators_no_eq_ne)(NaT) is False
+ opname = compare_operators_no_eq_ne
+
+ assert getattr(NaT, opname)(other) is False
+
+ op = getattr(operator, opname.strip("_"))
+ assert op(NaT, other) is False
+ assert op(other, NaT) is False
+
+
+@pytest.mark.parametrize("other", [np.timedelta64(0, "ns"), np.datetime64("now", "ns")])
+def test_nat_comparisons_numpy(other):
+ # Once numpy#17017 is fixed and the xfailed cases in test_nat_comparisons
+ # pass, this test can be removed
+ assert not NaT == other
+ assert NaT != other
+ assert not NaT < other
+ assert not NaT > other
+ assert not NaT <= other
+ assert not NaT >= other
+
+
+@pytest.mark.parametrize("other", ["foo", 2, 2.0])
+@pytest.mark.parametrize("op", [operator.le, operator.lt, operator.ge, operator.gt])
+def test_nat_comparisons_invalid(other, op):
+ # GH#35585
+ assert not NaT == other
+ assert not other == NaT
+
+ assert NaT != other
+ assert other != NaT
+
+ with pytest.raises(TypeError):
+ op(NaT, other)
+
+ with pytest.raises(TypeError):
+ op(other, NaT)
@pytest.mark.parametrize(
diff --git a/pandas/tests/scalar/timedelta/test_arithmetic.py b/pandas/tests/scalar/timedelta/test_arithmetic.py
index cb33f99d9bd91..d4d7e4b85268f 100644
--- a/pandas/tests/scalar/timedelta/test_arithmetic.py
+++ b/pandas/tests/scalar/timedelta/test_arithmetic.py
@@ -7,8 +7,10 @@
import numpy as np
import pytest
+from pandas.compat.numpy import is_numpy_dev
+
import pandas as pd
-from pandas import NaT, Timedelta, Timestamp, _is_numpy_dev, compat, offsets
+from pandas import NaT, Timedelta, Timestamp, compat, offsets
import pandas._testing as tm
from pandas.core import ops
@@ -426,7 +428,7 @@ def test_td_div_numeric_scalar(self):
np.float64("NaN"),
marks=pytest.mark.xfail(
# Works on numpy dev only in python 3.9
- _is_numpy_dev and not compat.PY39,
+ is_numpy_dev and not compat.PY39,
raises=RuntimeWarning,
reason="https://github.com/pandas-dev/pandas/issues/31992",
),
diff --git a/pandas/tests/scalar/timestamp/test_arithmetic.py b/pandas/tests/scalar/timestamp/test_arithmetic.py
index 954301b979074..1e980b6e4559c 100644
--- a/pandas/tests/scalar/timestamp/test_arithmetic.py
+++ b/pandas/tests/scalar/timestamp/test_arithmetic.py
@@ -213,7 +213,7 @@ def test_add_int_with_freq(self, ts, other):
with pytest.raises(TypeError, match=msg):
other - ts
- @pytest.mark.parametrize("shape", [(6,), (2, 3,)])
+ @pytest.mark.parametrize("shape", [(6,), (2, 3)])
def test_addsub_m8ndarray(self, shape):
# GH#33296
ts = Timestamp("2020-04-04 15:45")
@@ -237,7 +237,7 @@ def test_addsub_m8ndarray(self, shape):
with pytest.raises(TypeError, match=msg):
other - ts
- @pytest.mark.parametrize("shape", [(6,), (2, 3,)])
+ @pytest.mark.parametrize("shape", [(6,), (2, 3)])
def test_addsub_m8ndarray_tzaware(self, shape):
# GH#33296
ts = Timestamp("2020-04-04 15:45", tz="US/Pacific")
diff --git a/pandas/tests/scalar/timestamp/test_constructors.py b/pandas/tests/scalar/timestamp/test_constructors.py
index 770753f42a4c8..316a299ba1cbb 100644
--- a/pandas/tests/scalar/timestamp/test_constructors.py
+++ b/pandas/tests/scalar/timestamp/test_constructors.py
@@ -174,7 +174,10 @@ def test_constructor_invalid(self):
def test_constructor_invalid_tz(self):
# GH#17690
- msg = "must be a datetime.tzinfo"
+ msg = (
+ "Argument 'tzinfo' has incorrect type "
+ r"\(expected datetime.tzinfo, got str\)"
+ )
with pytest.raises(TypeError, match=msg):
Timestamp("2017-10-22", tzinfo="US/Eastern")
diff --git a/pandas/tests/scalar/timestamp/test_timezones.py b/pandas/tests/scalar/timestamp/test_timezones.py
index 9611c827be6fe..f05f2054b2483 100644
--- a/pandas/tests/scalar/timestamp/test_timezones.py
+++ b/pandas/tests/scalar/timestamp/test_timezones.py
@@ -21,9 +21,12 @@ class TestTimestampTZOperations:
# Timestamp.tz_localize
def test_tz_localize_pushes_out_of_bounds(self):
- msg = "^$"
# GH#12677
# tz_localize that pushes away from the boundary is OK
+ msg = (
+ f"Converting {Timestamp.min.strftime('%Y-%m-%d %H:%M:%S')} "
+ f"underflows past {Timestamp.min}"
+ )
pac = Timestamp.min.tz_localize("US/Pacific")
assert pac.value > Timestamp.min.value
pac.tz_convert("Asia/Tokyo") # tz_convert doesn't change value
@@ -31,6 +34,10 @@ def test_tz_localize_pushes_out_of_bounds(self):
Timestamp.min.tz_localize("Asia/Tokyo")
# tz_localize that pushes away from the boundary is OK
+ msg = (
+ f"Converting {Timestamp.max.strftime('%Y-%m-%d %H:%M:%S')} "
+ f"overflows past {Timestamp.max}"
+ )
tokyo = Timestamp.max.tz_localize("Asia/Tokyo")
assert tokyo.value < Timestamp.max.value
tokyo.tz_convert("US/Pacific") # tz_convert doesn't change value
@@ -334,7 +341,7 @@ def test_timestamp_to_datetime_tzoffset(self):
def test_timestamp_constructor_near_dst_boundary(self):
# GH#11481 & GH#15777
# Naive string timestamps were being localized incorrectly
- # with tz_convert_single instead of tz_localize_to_utc
+ # with tz_convert_from_utc_single instead of tz_localize_to_utc
for tz in ["Europe/Brussels", "Europe/Prague"]:
result = Timestamp("2015-10-25 01:00", tz=tz)
diff --git a/pandas/tests/series/apply/__init__.py b/pandas/tests/series/apply/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/pandas/tests/series/apply/test_apply_relabeling.py b/pandas/tests/series/apply/test_apply_relabeling.py
new file mode 100644
index 0000000000000..0b8d2c4e1f26d
--- /dev/null
+++ b/pandas/tests/series/apply/test_apply_relabeling.py
@@ -0,0 +1,33 @@
+import pandas as pd
+import pandas._testing as tm
+
+
+class TestNamedAggregation:
+ def test_relabel_no_duplicated_method(self):
+ # this is to test that no duplicated method is used in agg
+ df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
+
+ result = df["A"].agg(foo="sum")
+ expected = df["A"].agg({"foo": "sum"})
+ tm.assert_series_equal(result, expected)
+
+ result = df["B"].agg(foo="min", bar="max")
+ expected = df["B"].agg({"foo": "min", "bar": "max"})
+ tm.assert_series_equal(result, expected)
+
+ result = df["B"].agg(foo=sum, bar=min, cat="max")
+ expected = df["B"].agg({"foo": sum, "bar": min, "cat": "max"})
+ tm.assert_series_equal(result, expected)
+
+ def test_relabel_duplicated_method(self):
+ # this is to test that, with nested renaming, duplicated methods can be
+ # used if they are assigned different new names
+ df = pd.DataFrame({"A": [1, 2, 1, 2], "B": [1, 2, 3, 4]})
+
+ result = df["A"].agg(foo="sum", bar="sum")
+ expected = pd.Series([6, 6], index=["foo", "bar"], name="A")
+ tm.assert_series_equal(result, expected)
+
+ result = df["B"].agg(foo=min, bar="min")
+ expected = pd.Series([1, 1], index=["foo", "bar"], name="B")
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/test_apply.py b/pandas/tests/series/apply/test_series_apply.py
similarity index 96%
rename from pandas/tests/series/test_apply.py
rename to pandas/tests/series/apply/test_series_apply.py
index 308398642895c..ce8759c4ba76d 100644
--- a/pandas/tests/series/test_apply.py
+++ b/pandas/tests/series/apply/test_series_apply.py
@@ -209,23 +209,17 @@ def test_transform(self, string_series):
f_abs = np.abs(string_series)
# ufunc
- result = string_series.transform(np.sqrt)
- expected = f_sqrt.copy()
- tm.assert_series_equal(result, expected)
-
result = string_series.apply(np.sqrt)
+ expected = f_sqrt.copy()
tm.assert_series_equal(result, expected)
# list-like
- result = string_series.transform([np.sqrt])
+ result = string_series.apply([np.sqrt])
expected = f_sqrt.to_frame().copy()
expected.columns = ["sqrt"]
tm.assert_frame_equal(result, expected)
- result = string_series.transform([np.sqrt])
- tm.assert_frame_equal(result, expected)
-
- result = string_series.transform(["sqrt"])
+ result = string_series.apply(["sqrt"])
tm.assert_frame_equal(result, expected)
# multiple items in list
@@ -236,10 +230,6 @@ def test_transform(self, string_series):
result = string_series.apply([np.sqrt, np.abs])
tm.assert_frame_equal(result, expected)
- result = string_series.transform(["sqrt", "abs"])
- expected.columns = ["sqrt", "abs"]
- tm.assert_frame_equal(result, expected)
-
# dict, provide renaming
expected = pd.concat([f_sqrt, f_abs], axis=1)
expected.columns = ["foo", "bar"]
@@ -250,19 +240,11 @@ def test_transform(self, string_series):
def test_transform_and_agg_error(self, string_series):
# we are trying to transform with an aggregator
- msg = "transforms cannot produce aggregated results"
- with pytest.raises(ValueError, match=msg):
- string_series.transform(["min", "max"])
-
msg = "cannot combine transform and aggregation"
with pytest.raises(ValueError, match=msg):
with np.errstate(all="ignore"):
string_series.agg(["sqrt", "max"])
- with pytest.raises(ValueError, match=msg):
- with np.errstate(all="ignore"):
- string_series.transform(["sqrt", "max"])
-
msg = "cannot perform both aggregation and transformation"
with pytest.raises(ValueError, match=msg):
with np.errstate(all="ignore"):
@@ -463,13 +445,13 @@ def test_agg_cython_table_raises(self, series, func, expected):
# e.g. Series('a b'.split()).cumprod() will raise
series.agg(func)
- def test_transform_none_to_type(self):
- # GH34377
- df = pd.DataFrame({"a": [None]})
+ def test_series_apply_no_suffix_index(self):
+ # GH36189
+ s = pd.Series([4] * 3)
+ result = s.apply(["sum", lambda x: x.sum(), lambda x: x.sum()])
+ expected = pd.Series([12, 12, 12], index=["sum", "", ""])
- msg = "DataFrame constructor called with incompatible data and dtype"
- with pytest.raises(TypeError, match=msg):
- df.transform({"a": int})
+ tm.assert_series_equal(result, expected)
class TestSeriesMap:
diff --git a/pandas/tests/series/apply/test_series_transform.py b/pandas/tests/series/apply/test_series_transform.py
new file mode 100644
index 0000000000000..0842674da2a7d
--- /dev/null
+++ b/pandas/tests/series/apply/test_series_transform.py
@@ -0,0 +1,165 @@
+import numpy as np
+import pytest
+
+from pandas import DataFrame, Series, concat
+import pandas._testing as tm
+from pandas.core.base import SpecificationError
+from pandas.core.groupby.base import transformation_kernels
+
+
+def test_transform_ufunc(string_series):
+ # GH 35964
+ with np.errstate(all="ignore"):
+ f_sqrt = np.sqrt(string_series)
+
+ # ufunc
+ result = string_series.transform(np.sqrt)
+ expected = f_sqrt.copy()
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("op", transformation_kernels)
+def test_transform_groupby_kernel(string_series, op):
+ # GH 35964
+ if op == "cumcount":
+ pytest.xfail("Series.cumcount does not exist")
+ if op == "tshift":
+ pytest.xfail("Only works on time index and is deprecated")
+
+ args = [0.0] if op == "fillna" else []
+ ones = np.ones(string_series.shape[0])
+ expected = string_series.groupby(ones).transform(op, *args)
+ result = string_series.transform(op, 0, *args)
+ tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+ "ops, names", [([np.sqrt], ["sqrt"]), ([np.abs, np.sqrt], ["absolute", "sqrt"])]
+)
+def test_transform_list(string_series, ops, names):
+ # GH 35964
+ with np.errstate(all="ignore"):
+ expected = concat([op(string_series) for op in ops], axis=1)
+ expected.columns = names
+ result = string_series.transform(ops)
+ tm.assert_frame_equal(result, expected)
+
+
+def test_transform_dict(string_series):
+ # GH 35964
+ with np.errstate(all="ignore"):
+ expected = concat([np.sqrt(string_series), np.abs(string_series)], axis=1)
+ expected.columns = ["foo", "bar"]
+ result = string_series.transform({"foo": np.sqrt, "bar": np.abs})
+ tm.assert_frame_equal(result, expected)
+
+
+def test_transform_udf(axis, string_series):
+ # GH 35964
+ # via apply
+ def func(x):
+ if isinstance(x, Series):
+ raise ValueError
+ return x + 1
+
+ result = string_series.transform(func)
+ expected = string_series + 1
+ tm.assert_series_equal(result, expected)
+
+ # via map Series -> Series
+ def func(x):
+ if not isinstance(x, Series):
+ raise ValueError
+ return x + 1
+
+ result = string_series.transform(func)
+ expected = string_series + 1
+ tm.assert_series_equal(result, expected)
+
+
+def test_transform_wont_agg(string_series):
+ # GH 35964
+ # we are trying to transform with an aggregator
+ msg = "Function did not transform"
+ with pytest.raises(ValueError, match=msg):
+ string_series.transform(["min", "max"])
+
+ msg = "Function did not transform"
+ with pytest.raises(ValueError, match=msg):
+ with np.errstate(all="ignore"):
+ string_series.transform(["sqrt", "max"])
+
+
+def test_transform_none_to_type():
+ # GH34377
+ df = DataFrame({"a": [None]})
+ msg = "Transform function failed"
+ with pytest.raises(ValueError, match=msg):
+ df.transform({"a": int})
+
+
+def test_transform_reducer_raises(all_reductions):
+ # GH 35964
+ op = all_reductions
+ s = Series([1, 2, 3])
+ msg = "Function did not transform"
+ with pytest.raises(ValueError, match=msg):
+ s.transform(op)
+ with pytest.raises(ValueError, match=msg):
+ s.transform([op])
+ with pytest.raises(ValueError, match=msg):
+ s.transform({"A": op})
+ with pytest.raises(ValueError, match=msg):
+ s.transform({"A": [op]})
+
+
+# mypy doesn't allow adding lists of different types
+# https://github.com/python/mypy/issues/5492
+@pytest.mark.parametrize("op", [*transformation_kernels, lambda x: x + 1])
+def test_transform_bad_dtype(op):
+ # GH 35964
+ s = Series(3 * [object]) # Series that will fail on most transforms
+ if op in ("backfill", "shift", "pad", "bfill", "ffill"):
+ pytest.xfail("Transform function works on any datatype")
+ msg = "Transform function failed"
+ with pytest.raises(ValueError, match=msg):
+ s.transform(op)
+ with pytest.raises(ValueError, match=msg):
+ s.transform([op])
+ with pytest.raises(ValueError, match=msg):
+ s.transform({"A": op})
+ with pytest.raises(ValueError, match=msg):
+ s.transform({"A": [op]})
+
+
+@pytest.mark.parametrize("use_apply", [True, False])
+def test_transform_passes_args(use_apply):
+ # GH 35964
+ # transform uses UDF either via apply or passing the entire Series
+ expected_args = [1, 2]
+ expected_kwargs = {"c": 3}
+
+ def f(x, a, b, c):
+ # transform is using apply iff x is not a Series
+ if use_apply == isinstance(x, Series):
+ # Force transform to fallback
+ raise ValueError
+ assert [a, b] == expected_args
+ assert c == expected_kwargs["c"]
+ return x
+
+ Series([1]).transform(f, 0, *expected_args, **expected_kwargs)
+
+
+def test_transform_axis_1_raises():
+ # GH 35964
+ msg = "No axis named 1 for object type Series"
+ with pytest.raises(ValueError, match=msg):
+ Series([1]).transform("sum", axis=1)
+
+
+def test_transform_nested_renamer():
+ # GH 35964
+ match = "nested renamer is not supported"
+ with pytest.raises(SpecificationError, match=match):
+ Series([1]).transform({"A": {"B": ["sum"]}})
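
The new `test_series_transform.py` module encodes the GH 35964 contract: `Series.transform` must return an object aligned to the input and now rejects reducers with "Function did not transform". The behavior in brief:

```python
import pandas as pd

s = pd.Series([1, 2, 3])

# Elementwise callables and transform kernels keep the index and succeed.
s.transform(lambda x: x + 1)

# Aggregators no longer silently reduce; they raise ValueError.
try:
    s.transform("sum")
except ValueError as err:
    print(err)  # Function did not transform
```
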
diff --git a/pandas/tests/series/indexing/test_datetime.py b/pandas/tests/series/indexing/test_datetime.py
index 0b34fab7b80b1..088f8681feb99 100644
--- a/pandas/tests/series/indexing/test_datetime.py
+++ b/pandas/tests/series/indexing/test_datetime.py
@@ -11,7 +11,6 @@
from pandas import DataFrame, DatetimeIndex, NaT, Series, Timestamp, date_range
import pandas._testing as tm
-
"""
Also test support for datetime64[ns] in Series / DataFrame
"""
@@ -166,6 +165,7 @@ def test_getitem_setitem_datetime_tz_pytz():
def test_getitem_setitem_datetime_tz_dateutil():
from dateutil.tz import tzutc
+
from pandas._libs.tslibs.timezones import dateutil_gettz as gettz
tz = (
diff --git a/pandas/tests/series/indexing/test_getitem.py b/pandas/tests/series/indexing/test_getitem.py
index 164c63483f71f..6b7cda89a4714 100644
--- a/pandas/tests/series/indexing/test_getitem.py
+++ b/pandas/tests/series/indexing/test_getitem.py
@@ -51,11 +51,7 @@ class TestSeriesGetitemSlices:
def test_getitem_slice_2d(self, datetime_series):
# GH#30588 multi-dimensional indexing deprecated
- # This is currently failing because the test was relying on
- # the DeprecationWarning coming through Index.__getitem__.
- # We want to implement a warning specifically for Series.__getitem__
- # at which point this will become a Deprecation/FutureWarning
- with tm.assert_produces_warning(None):
+ with tm.assert_produces_warning(FutureWarning):
# GH#30867 Don't want to support this long-term, but
# for now ensure that the warning from Index
# doesn't come through via Series.__getitem__.
@@ -135,3 +131,9 @@ def test_getitem_generator(string_series):
expected = string_series[string_series > 0]
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result2, expected)
+
+
+def test_getitem_ndim_deprecated():
+ s = pd.Series([0, 1])
+ with tm.assert_produces_warning(FutureWarning):
+ s[:, None]
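
`test_getitem_ndim_deprecated` above locks in the deprecation path for multi-dimensional Series indexing. Under the targeted version, roughly:

```python
import warnings

import pandas as pd

s = pd.Series([0, 1])

# s[:, None] still returns the underlying ndarray with an extra axis, but
# now emits a FutureWarning first.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    arr = s[:, None]
assert any(issubclass(w.category, FutureWarning) for w in caught)
```
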
diff --git a/pandas/tests/series/indexing/test_indexing.py b/pandas/tests/series/indexing/test_indexing.py
index 737e21af9242f..1fafdf00393e1 100644
--- a/pandas/tests/series/indexing/test_indexing.py
+++ b/pandas/tests/series/indexing/test_indexing.py
@@ -383,7 +383,7 @@ def test_2d_to_1d_assignment_raises():
@pytest.mark.filterwarnings("ignore:Using a non-tuple:FutureWarning")
def test_basic_getitem_setitem_corner(datetime_series):
# invalid tuples, e.g. td.ts[:, None] vs. td.ts[:, 2]
- msg = "Can only tuple-index with a MultiIndex"
+ msg = "key of type tuple not found and not a MultiIndex"
with pytest.raises(ValueError, match=msg):
datetime_series[:, 2]
with pytest.raises(ValueError, match=msg):
@@ -736,14 +736,16 @@ def test_append_timedelta_does_not_cast(td):
def test_underlying_data_conversion():
# GH 4080
df = DataFrame({c: [1, 2, 3] for c in ["a", "b", "c"]})
- df.set_index(["a", "b", "c"], inplace=True)
+ return_value = df.set_index(["a", "b", "c"], inplace=True)
+ assert return_value is None
s = Series([1], index=[(2, 2, 2)])
df["val"] = 0
df
df["val"].update(s)
expected = DataFrame(dict(a=[1, 2, 3], b=[1, 2, 3], c=[1, 2, 3], val=[0, 1, 0]))
- expected.set_index(["a", "b", "c"], inplace=True)
+ return_value = expected.set_index(["a", "b", "c"], inplace=True)
+ assert return_value is None
tm.assert_frame_equal(df, expected)
# GH 3970
@@ -940,3 +942,22 @@ def assert_slices_equivalent(l_slc, i_slc):
for key2 in [keystr2, box(keystr2)]:
assert_slices_equivalent(SLC[key2:key:-1], SLC[13:8:-1])
assert_slices_equivalent(SLC[key:key2:-1], SLC[0:0:-1])
+
+
+def test_tuple_index():
+ # GH 35534 - Selecting values when a Series has an Index of tuples
+ s = pd.Series([1, 2], index=[("a",), ("b",)])
+ assert s[("a",)] == 1
+ assert s[("b",)] == 2
+ s[("b",)] = 3
+ assert s[("b",)] == 3
+
+
+def test_frozenset_index():
+ # GH35747 - Selecting values when a Series has an Index of frozenset
+ idx0, idx1 = frozenset("a"), frozenset("b")
+ s = pd.Series([1, 2], index=[idx0, idx1])
+ assert s[idx0] == 1
+ assert s[idx1] == 2
+ s[idx1] = 3
+ assert s[idx1] == 3
diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py
index 3463de25ad91b..593d1c78a19e2 100644
--- a/pandas/tests/series/indexing/test_setitem.py
+++ b/pandas/tests/series/indexing/test_setitem.py
@@ -1,6 +1,7 @@
import numpy as np
-from pandas import NaT, Series, date_range
+from pandas import MultiIndex, NaT, Series, date_range
+import pandas.testing as tm
class TestSetitemDT64Values:
@@ -17,3 +18,11 @@ def test_setitem_none_nan(self):
series[5:7] = np.nan
assert series[6] is NaT
+
+ def test_setitem_multiindex_empty_slice(self):
+ # https://github.com/pandas-dev/pandas/issues/35878
+ idx = MultiIndex.from_tuples([("a", 1), ("b", 2)])
+ result = Series([1, 2], index=idx)
+ expected = result.copy()
+ result.loc[[]] = 0
+ tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_argsort.py b/pandas/tests/series/methods/test_argsort.py
index 4353eb4c8cd64..ec9ba468c996c 100644
--- a/pandas/tests/series/methods/test_argsort.py
+++ b/pandas/tests/series/methods/test_argsort.py
@@ -9,7 +9,7 @@ class TestSeriesArgsort:
def _check_accum_op(self, name, ser, check_dtype=True):
func = getattr(np, name)
tm.assert_numpy_array_equal(
- func(ser).values, func(np.array(ser)), check_dtype=check_dtype,
+ func(ser).values, func(np.array(ser)), check_dtype=check_dtype
)
# with missing values
diff --git a/pandas/tests/series/methods/test_asof.py b/pandas/tests/series/methods/test_asof.py
index 19caf4eccf748..4b4ef5ea046be 100644
--- a/pandas/tests/series/methods/test_asof.py
+++ b/pandas/tests/series/methods/test_asof.py
@@ -90,7 +90,7 @@ def test_with_nan(self):
tm.assert_series_equal(result, expected)
def test_periodindex(self):
- from pandas import period_range, PeriodIndex
+ from pandas import PeriodIndex, period_range
# array or list or dates
N = 50
diff --git a/pandas/tests/series/methods/test_astype.py b/pandas/tests/series/methods/test_astype.py
index 9fdc4179de2e1..7449d8d65ef96 100644
--- a/pandas/tests/series/methods/test_astype.py
+++ b/pandas/tests/series/methods/test_astype.py
@@ -1,4 +1,7 @@
-from pandas import Series, date_range
+import numpy as np
+import pytest
+
+from pandas import Interval, Series, Timestamp, date_range
import pandas._testing as tm
@@ -23,3 +26,32 @@ def test_astype_dt64tz_to_str(self):
dtype=object,
)
tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "values",
+ [
+ Series(["x", "y", "z"], dtype="string"),
+ Series(["x", "y", "z"], dtype="category"),
+ Series(3 * [Timestamp("2020-01-01", tz="UTC")]),
+ Series(3 * [Interval(0, 1)]),
+ ],
+ )
+ @pytest.mark.parametrize("errors", ["raise", "ignore"])
+ def test_astype_ignores_errors_for_extension_dtypes(self, values, errors):
+ # https://github.com/pandas-dev/pandas/issues/35471
+ if errors == "ignore":
+ expected = values
+ result = values.astype(float, errors="ignore")
+ tm.assert_series_equal(result, expected)
+ else:
+ msg = "(Cannot cast)|(could not convert)"
+ with pytest.raises((ValueError, TypeError), match=msg):
+ values.astype(float, errors=errors)
+
+ @pytest.mark.parametrize("dtype", [np.float16, np.float32, np.float64])
+ def test_astype_from_float_to_str(self, dtype):
+ # https://github.com/pandas-dev/pandas/issues/36451
+ s = Series([0.1], dtype=dtype)
+ result = s.astype(str)
+ expected = Series(["0.1"])
+ tm.assert_series_equal(result, expected)
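
Illustration (not part of the patch): the two behaviors exercised above are errors="ignore" returning the input unchanged for extension dtypes, and float-to-str casting going through the short repr users see. Sketch, assuming a build with these fixes:

    import pandas as pd

    cat = pd.Series(["x", "y"], dtype="category")
    cat.astype(float, errors="ignore")             # original values back, no exception
    pd.Series([0.1], dtype="float32").astype(str)  # ["0.1"], not an excess-precision repr
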
diff --git a/pandas/tests/series/methods/test_convert_dtypes.py b/pandas/tests/series/methods/test_convert_dtypes.py
index dd4bf642e68e8..8a915324a72c1 100644
--- a/pandas/tests/series/methods/test_convert_dtypes.py
+++ b/pandas/tests/series/methods/test_convert_dtypes.py
@@ -219,10 +219,10 @@ class TestSeriesConvertDtypes:
pd.to_datetime(["2020-01-14 10:00", "2020-01-15 11:11"]),
object,
{
- ((True,), (True, False), (True, False), (True, False),): np.dtype(
+ ((True,), (True, False), (True, False), (True, False)): np.dtype(
"datetime64[ns]"
),
- ((False,), (True, False), (True, False), (True, False),): np.dtype(
+ ((False,), (True, False), (True, False), (True, False)): np.dtype(
"O"
),
},
diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py
index 4e59c6995f4f2..a15dc0751aa7d 100644
--- a/pandas/tests/series/methods/test_describe.py
+++ b/pandas/tests/series/methods/test_describe.py
@@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
start = Timestamp(2018, 1, 1)
end = Timestamp(2018, 1, 5)
s = Series(date_range(start, end, tz=tz), name=name)
- result = s.describe()
+ result = s.describe(datetime_is_numeric=True)
expected = Series(
[
5,
@@ -98,3 +98,43 @@ def test_describe_with_tz(self, tz_naive_fixture):
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
)
tm.assert_series_equal(result, expected)
+
+ def test_describe_with_tz_warns(self):
+ name = tz = "CET"
+ start = Timestamp(2018, 1, 1)
+ end = Timestamp(2018, 1, 5)
+ s = Series(date_range(start, end, tz=tz), name=name)
+
+ with tm.assert_produces_warning(FutureWarning):
+ result = s.describe()
+
+ expected = Series(
+ [
+ 5,
+ 5,
+ s.value_counts().index[0],
+ 1,
+ start.tz_localize(tz),
+ end.tz_localize(tz),
+ ],
+ name=name,
+ index=["count", "unique", "top", "freq", "first", "last"],
+ )
+ tm.assert_series_equal(result, expected)
+
+ def test_datetime_is_numeric_includes_datetime(self):
+ s = Series(date_range("2012", periods=3))
+ result = s.describe(datetime_is_numeric=True)
+ expected = Series(
+ [
+ 3,
+ Timestamp("2012-01-02"),
+ Timestamp("2012-01-01"),
+ Timestamp("2012-01-01T12:00:00"),
+ Timestamp("2012-01-02"),
+ Timestamp("2012-01-02T12:00:00"),
+ Timestamp("2012-01-03"),
+ ],
+ index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+ )
+ tm.assert_series_equal(result, expected)
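
Illustration (not part of the patch): datetime_is_numeric=True opts into numeric-style summary statistics for datetime data, and calling describe() on datetimes without it now warns. Sketch:

    import pandas as pd

    s = pd.Series(pd.date_range("2012", periods=3))
    s.describe(datetime_is_numeric=True)
    # index: count/mean/min/25%/50%/75%/max rather than count/unique/top/freq/first/last
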
diff --git a/pandas/tests/series/methods/test_drop_duplicates.py b/pandas/tests/series/methods/test_drop_duplicates.py
index a4532ebb3d8c5..6eb0e09f12658 100644
--- a/pandas/tests/series/methods/test_drop_duplicates.py
+++ b/pandas/tests/series/methods/test_drop_duplicates.py
@@ -22,7 +22,8 @@ def test_drop_duplicates(any_numpy_dtype, keep, expected):
tm.assert_series_equal(tc.duplicated(keep=keep), expected)
tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
sc = tc.copy()
- sc.drop_duplicates(keep=keep, inplace=True)
+ return_value = sc.drop_duplicates(keep=keep, inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
@@ -40,8 +41,9 @@ def test_drop_duplicates_bool(keep, expected):
tm.assert_series_equal(tc.duplicated(keep=keep), expected)
tm.assert_series_equal(tc.drop_duplicates(keep=keep), tc[~expected])
sc = tc.copy()
- sc.drop_duplicates(keep=keep, inplace=True)
+ return_value = sc.drop_duplicates(keep=keep, inplace=True)
tm.assert_series_equal(sc, tc[~expected])
+ assert return_value is None
@pytest.mark.parametrize("values", [[], list(range(5))])
@@ -84,21 +86,24 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered):
tm.assert_series_equal(tc1.duplicated(), expected)
tm.assert_series_equal(tc1.drop_duplicates(), tc1[~expected])
sc = tc1.copy()
- sc.drop_duplicates(inplace=True)
+ return_value = sc.drop_duplicates(inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])
expected = Series([False, False, True, False])
tm.assert_series_equal(tc1.duplicated(keep="last"), expected)
tm.assert_series_equal(tc1.drop_duplicates(keep="last"), tc1[~expected])
sc = tc1.copy()
- sc.drop_duplicates(keep="last", inplace=True)
+ return_value = sc.drop_duplicates(keep="last", inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])
expected = Series([False, False, True, True])
tm.assert_series_equal(tc1.duplicated(keep=False), expected)
tm.assert_series_equal(tc1.drop_duplicates(keep=False), tc1[~expected])
sc = tc1.copy()
- sc.drop_duplicates(keep=False, inplace=True)
+ return_value = sc.drop_duplicates(keep=False, inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc1[~expected])
# Test case 2
@@ -113,27 +118,30 @@ def test_drop_duplicates_categorical_non_bool(self, dtype, ordered):
tm.assert_series_equal(tc2.duplicated(), expected)
tm.assert_series_equal(tc2.drop_duplicates(), tc2[~expected])
sc = tc2.copy()
- sc.drop_duplicates(inplace=True)
+ return_value = sc.drop_duplicates(inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])
expected = Series([False, True, True, False, False, False, False])
tm.assert_series_equal(tc2.duplicated(keep="last"), expected)
tm.assert_series_equal(tc2.drop_duplicates(keep="last"), tc2[~expected])
sc = tc2.copy()
- sc.drop_duplicates(keep="last", inplace=True)
+ return_value = sc.drop_duplicates(keep="last", inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])
expected = Series([False, True, True, False, True, True, False])
tm.assert_series_equal(tc2.duplicated(keep=False), expected)
tm.assert_series_equal(tc2.drop_duplicates(keep=False), tc2[~expected])
sc = tc2.copy()
- sc.drop_duplicates(keep=False, inplace=True)
+ return_value = sc.drop_duplicates(keep=False, inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc2[~expected])
def test_drop_duplicates_categorical_bool(self, ordered):
tc = Series(
Categorical(
- [True, False, True, False], categories=[True, False], ordered=ordered,
+ [True, False, True, False], categories=[True, False], ordered=ordered
)
)
@@ -141,19 +149,22 @@ def test_drop_duplicates_categorical_bool(self, ordered):
tm.assert_series_equal(tc.duplicated(), expected)
tm.assert_series_equal(tc.drop_duplicates(), tc[~expected])
sc = tc.copy()
- sc.drop_duplicates(inplace=True)
+ return_value = sc.drop_duplicates(inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
expected = Series([True, True, False, False])
tm.assert_series_equal(tc.duplicated(keep="last"), expected)
tm.assert_series_equal(tc.drop_duplicates(keep="last"), tc[~expected])
sc = tc.copy()
- sc.drop_duplicates(keep="last", inplace=True)
+ return_value = sc.drop_duplicates(keep="last", inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
expected = Series([True, True, True, True])
tm.assert_series_equal(tc.duplicated(keep=False), expected)
tm.assert_series_equal(tc.drop_duplicates(keep=False), tc[~expected])
sc = tc.copy()
- sc.drop_duplicates(keep=False, inplace=True)
+ return_value = sc.drop_duplicates(keep=False, inplace=True)
+ assert return_value is None
tm.assert_series_equal(sc, tc[~expected])
diff --git a/pandas/tests/series/methods/test_equals.py b/pandas/tests/series/methods/test_equals.py
new file mode 100644
index 0000000000000..cf55482fefe22
--- /dev/null
+++ b/pandas/tests/series/methods/test_equals.py
@@ -0,0 +1,67 @@
+from contextlib import nullcontext
+
+import numpy as np
+import pytest
+
+from pandas import MultiIndex, Series
+import pandas._testing as tm
+
+
+@pytest.mark.parametrize(
+ "arr, idx",
+ [
+ ([1, 2, 3, 4], [0, 2, 1, 3]),
+ ([1, np.nan, 3, np.nan], [0, 2, 1, 3]),
+ (
+ [1, np.nan, 3, np.nan],
+ MultiIndex.from_tuples([(0, "a"), (1, "b"), (2, "c"), (3, "c")]),
+ ),
+ ],
+)
+def test_equals(arr, idx):
+ s1 = Series(arr, index=idx)
+ s2 = s1.copy()
+ assert s1.equals(s2)
+
+ s1[1] = 9
+ assert not s1.equals(s2)
+
+
+@pytest.mark.parametrize(
+ "val", [1, 1.1, 1 + 1j, True, "abc", [1, 2], (1, 2), {1, 2}, {"a": 1}, None]
+)
+def test_equals_list_array(val):
+ # GH20676 Verify equals operator for list of Numpy arrays
+ arr = np.array([1, 2])
+ s1 = Series([arr, arr])
+ s2 = s1.copy()
+ assert s1.equals(s2)
+
+ s1[1] = val
+
+ cm = (
+ tm.assert_produces_warning(FutureWarning, check_stacklevel=False)
+ if isinstance(val, str)
+ else nullcontext()
+ )
+ with cm:
+ assert not s1.equals(s2)
+
+
+def test_equals_false_negative():
+ # GH8437 Verify false negative behavior of equals function for dtype object
+ arr = [False, np.nan]
+ s1 = Series(arr)
+ s2 = s1.copy()
+ s3 = Series(index=range(2), dtype=object)
+ s4 = s3.copy()
+ s5 = s3.copy()
+ s6 = s3.copy()
+
+ s3[:-1] = s4[:-1] = s5[0] = s6[0] = False
+ assert s1.equals(s1)
+ assert s1.equals(s2)
+ assert s1.equals(s3)
+ assert s1.equals(s4)
+ assert s1.equals(s5)
+ assert s5.equals(s6)
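
Illustration (not part of the patch): Series.equals treats NaNs in matching positions as equal, which is the behavior the new module exercises. Sketch:

    import numpy as np
    import pandas as pd

    left = pd.Series([1.0, np.nan])
    assert left.equals(left.copy())  # True: NaN == NaN for equals(), unlike ==
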
diff --git a/pandas/tests/series/methods/test_explode.py b/pandas/tests/series/methods/test_explode.py
index 4b65e042f7b02..1f0fbd1cc5ecb 100644
--- a/pandas/tests/series/methods/test_explode.py
+++ b/pandas/tests/series/methods/test_explode.py
@@ -126,3 +126,11 @@ def test_ignore_index():
result = s.explode(ignore_index=True)
expected = pd.Series([1, 2, 3, 4], index=[0, 1, 2, 3], dtype=object)
tm.assert_series_equal(result, expected)
+
+
+def test_explode_sets():
+ # https://github.com/pandas-dev/pandas/issues/35614
+ s = pd.Series([{"a", "b", "c"}], index=[1])
+ result = s.explode().sort_values()
+ expected = pd.Series(["a", "b", "c"], index=[1, 1, 1])
+ tm.assert_series_equal(result, expected)
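
Illustration (not part of the patch): explode now unpacks sets as well as lists; element order within a set is arbitrary, hence the sort_values() in the test. Sketch:

    import pandas as pd

    s = pd.Series([{"a", "b", "c"}], index=[1])
    s.explode().sort_values()  # three rows, all sharing index label 1
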
diff --git a/pandas/tests/series/methods/test_fillna.py b/pandas/tests/series/methods/test_fillna.py
index c34838be24fc1..80b8271e16e7a 100644
--- a/pandas/tests/series/methods/test_fillna.py
+++ b/pandas/tests/series/methods/test_fillna.py
@@ -67,7 +67,8 @@ def test_fillna_numeric_inplace(self):
x = Series([np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"])
y = x.copy()
- y.fillna(value=0, inplace=True)
+ return_value = y.fillna(value=0, inplace=True)
+ assert return_value is None
expected = x.fillna(value=0)
tm.assert_series_equal(y, expected)
diff --git a/pandas/tests/series/methods/test_interpolate.py b/pandas/tests/series/methods/test_interpolate.py
index c4b10e0ccdc3e..cba9443005f2f 100644
--- a/pandas/tests/series/methods/test_interpolate.py
+++ b/pandas/tests/series/methods/test_interpolate.py
@@ -30,7 +30,7 @@
]
)
def nontemporal_method(request):
- """ Fixture that returns an (method name, required kwargs) pair.
+ """Fixture that returns an (method name, required kwargs) pair.
This fixture does not include method 'time' as a parameterization; that
method requires a Series with a DatetimeIndex, and is generally tested
@@ -60,7 +60,7 @@ def nontemporal_method(request):
]
)
def interp_methods_ind(request):
- """ Fixture that returns a (method name, required kwargs) pair to
+ """Fixture that returns a (method name, required kwargs) pair to
be tested for various Index types.
This fixture does not include methods - 'time', 'index', 'nearest',
diff --git a/pandas/tests/series/methods/test_replace.py b/pandas/tests/series/methods/test_replace.py
index 8f57cf3191d5d..e255d46e81851 100644
--- a/pandas/tests/series/methods/test_replace.py
+++ b/pandas/tests/series/methods/test_replace.py
@@ -1,3 +1,5 @@
+import re
+
import numpy as np
import pytest
@@ -13,7 +15,8 @@ def test_replace(self, datetime_series):
ser[6:10] = 0
# replace list with a single value
- ser.replace([np.nan], -1, inplace=True)
+ return_value = ser.replace([np.nan], -1, inplace=True)
+ assert return_value is None
exp = ser.fillna(-1)
tm.assert_series_equal(ser, exp)
@@ -48,7 +51,8 @@ def test_replace(self, datetime_series):
tm.assert_series_equal(rs, rs2)
# replace inplace
- ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
+ return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
+ assert return_value is None
assert (ser[:5] == -1).all()
assert (ser[6:10] == -1).all()
@@ -124,7 +128,8 @@ def test_replace_with_single_list(self):
tm.assert_series_equal(result, pd.Series([0, 0, 0, 0, 4]))
s = ser.copy()
- s.replace([1, 2, 3], inplace=True)
+ return_value = s.replace([1, 2, 3], inplace=True)
+ assert return_value is None
tm.assert_series_equal(s, pd.Series([0, 0, 0, 0, 4]))
# make sure things don't get corrupted when fillna call fails
@@ -134,7 +139,8 @@ def test_replace_with_single_list(self):
r"\(bfill\)\. Got crash_cymbal"
)
with pytest.raises(ValueError, match=msg):
- s.replace([1, 2, 3], inplace=True, method="crash_cymbal")
+ return_value = s.replace([1, 2, 3], inplace=True, method="crash_cymbal")
+ assert return_value is None
tm.assert_series_equal(s, ser)
def test_replace_with_empty_list(self):
@@ -156,7 +162,8 @@ def test_replace_mixed_types(self):
def check_replace(to_rep, val, expected):
sc = s.copy()
r = s.replace(to_rep, val)
- sc.replace(to_rep, val, inplace=True)
+ return_value = sc.replace(to_rep, val, inplace=True)
+ assert return_value is None
tm.assert_series_equal(expected, r)
tm.assert_series_equal(expected, sc)
@@ -211,8 +218,9 @@ def test_replace_bool_with_bool(self):
def test_replace_with_dict_with_bool_keys(self):
s = pd.Series([True, False, True])
- with pytest.raises(TypeError, match="Cannot compare types .+"):
- s.replace({"asdf": "asdb", True: "yes"})
+ result = s.replace({"asdf": "asdb", True: "yes"})
+ expected = pd.Series(["yes", False, "yes"])
+ tm.assert_series_equal(result, expected)
def test_replace2(self):
N = 100
@@ -242,7 +250,8 @@ def test_replace2(self):
tm.assert_series_equal(rs, rs2)
# replace inplace
- ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
+ return_value = ser.replace([np.nan, "foo", "bar"], -1, inplace=True)
+ assert return_value is None
assert (ser[:5] == -1).all()
assert (ser[6:10] == -1).all()
assert (ser[20:30] == -1).all()
@@ -325,11 +334,13 @@ def test_replace_categorical_single(self):
tm.assert_series_equal(expected, result)
assert c[2] != "foo" # ensure non-inplace call does not alter original
- c.replace(c[2], "foo", inplace=True)
+ return_value = c.replace(c[2], "foo", inplace=True)
+ assert return_value is None
tm.assert_series_equal(expected, c)
first_value = c[0]
- c.replace(c[1], c[0], inplace=True)
+ return_value = c.replace(c[1], c[0], inplace=True)
+ assert return_value is None
assert c[0] == c[1] == first_value # test replacing with existing value
def test_replace_with_no_overflowerror(self):
@@ -387,6 +398,29 @@ def test_replace_invalid_to_replace(self):
with pytest.raises(TypeError, match=msg):
series.replace(lambda x: x.strip())
+ @pytest.mark.parametrize("frame", [False, True])
+ def test_replace_nonbool_regex(self, frame):
+ obj = pd.Series(["a", "b", "c "])
+ if frame:
+ obj = obj.to_frame()
+
+ msg = "'to_replace' must be 'None' if 'regex' is not a bool"
+ with pytest.raises(ValueError, match=msg):
+ obj.replace(to_replace=["a"], regex="foo")
+
+ @pytest.mark.parametrize("frame", [False, True])
+ def test_replace_empty_copy(self, frame):
+ obj = pd.Series([], dtype=np.float64)
+ if frame:
+ obj = obj.to_frame()
+
+ res = obj.replace(4, 5, inplace=True)
+ assert res is None
+
+ res = obj.replace(4, 5, inplace=False)
+ tm.assert_equal(res, obj)
+ assert res is not obj
+
def test_replace_only_one_dictlike_arg(self):
# GH#33340
@@ -407,3 +441,11 @@ def test_replace_extension_other(self):
# https://github.com/pandas-dev/pandas/issues/34530
ser = pd.Series(pd.array([1, 2, 3], dtype="Int64"))
ser.replace("", "") # no exception
+
+ def test_replace_with_compiled_regex(self):
+ # https://github.com/pandas-dev/pandas/issues/35680
+ s = pd.Series(["a", "b", "c"])
+ regex = re.compile("^a$")
+ result = s.replace({regex: "z"}, regex=True)
+ expected = pd.Series(["z", "b", "c"])
+ tm.assert_series_equal(result, expected)
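
Illustration (not part of the patch): pre-compiled regular expressions are now accepted as replace dictionary keys when regex=True. Sketch:

    import re
    import pandas as pd

    s = pd.Series(["a", "b", "c"])
    s.replace({re.compile("^a$"): "z"}, regex=True)  # ["z", "b", "c"]
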
diff --git a/pandas/tests/series/methods/test_reset_index.py b/pandas/tests/series/methods/test_reset_index.py
index a11590d42552d..1474bb95f4af2 100644
--- a/pandas/tests/series/methods/test_reset_index.py
+++ b/pandas/tests/series/methods/test_reset_index.py
@@ -1,6 +1,7 @@
import numpy as np
import pytest
+import pandas as pd
from pandas import DataFrame, Index, MultiIndex, RangeIndex, Series
import pandas._testing as tm
@@ -21,7 +22,8 @@ def test_reset_index(self):
# check inplace
s = ser.reset_index(drop=True)
s2 = ser
- s2.reset_index(drop=True, inplace=True)
+ return_value = s2.reset_index(drop=True, inplace=True)
+ assert return_value is None
tm.assert_series_equal(s, s2)
# level
@@ -110,11 +112,21 @@ def test_reset_index_drop_errors(self):
s.reset_index("wrong", drop=True)
-def test_reset_index_dtypes_on_empty_series_with_multiindex():
+@pytest.mark.parametrize(
+ "array, dtype",
+ [
+ (["a", "b"], object),
+ (
+ pd.period_range("12-1-2000", periods=2, freq="Q-DEC"),
+ pd.PeriodDtype(freq="Q-DEC"),
+ ),
+ ],
+)
+def test_reset_index_dtypes_on_empty_series_with_multiindex(array, dtype):
# GH 19602 - Preserve dtype on empty Series with MultiIndex
- idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], ["a", "b"]])
+ idx = MultiIndex.from_product([[0, 1], [0.5, 1.0], array])
result = Series(dtype=object, index=idx)[:0].reset_index().dtypes
expected = Series(
- {"level_0": np.int64, "level_1": np.float64, "level_2": object, 0: object}
+ {"level_0": np.int64, "level_1": np.float64, "level_2": dtype, 0: object}
)
tm.assert_series_equal(result, expected)
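
Illustration (not part of the patch): resetting an empty MultiIndexed Series now preserves extension dtypes such as PeriodDtype on the index levels. Sketch:

    import pandas as pd

    idx = pd.MultiIndex.from_product(
        [[0, 1], pd.period_range("12-1-2000", periods=2, freq="Q-DEC")]
    )
    pd.Series(dtype=object, index=idx)[:0].reset_index().dtypes
    # the period level comes back as period[Q-DEC], not object
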
diff --git a/pandas/tests/series/methods/test_sort_values.py b/pandas/tests/series/methods/test_sort_values.py
index b32c59b4daa0d..b49e39d4592ea 100644
--- a/pandas/tests/series/methods/test_sort_values.py
+++ b/pandas/tests/series/methods/test_sort_values.py
@@ -65,7 +65,8 @@ def test_sort_values(self, datetime_series):
# inplace=True
ts = datetime_series.copy()
- ts.sort_values(ascending=False, inplace=True)
+ return_value = ts.sort_values(ascending=False, inplace=True)
+ assert return_value is None
tm.assert_series_equal(ts, datetime_series.sort_values(ascending=False))
tm.assert_index_equal(
ts.index, datetime_series.sort_values(ascending=False).index
diff --git a/pandas/tests/series/methods/test_truncate.py b/pandas/tests/series/methods/test_truncate.py
index 8a2c62cee7e24..45592f8d99b93 100644
--- a/pandas/tests/series/methods/test_truncate.py
+++ b/pandas/tests/series/methods/test_truncate.py
@@ -136,7 +136,19 @@ def test_truncate_multiindex(self):
df = pd.DataFrame.from_dict(
{"L1": [2, 2, 3, 3], "L2": ["A", "B", "A", "B"], "col": [2, 3, 4, 5]}
)
- df.set_index(["L1", "L2"], inplace=True)
+ return_value = df.set_index(["L1", "L2"], inplace=True)
+ assert return_value is None
expected = df.col
tm.assert_series_equal(result, expected)
+
+ def test_truncate_one_element_series(self):
+ # GH 35544
+ series = pd.Series([0.1], index=pd.DatetimeIndex(["2020-08-04"]))
+ before = pd.Timestamp("2020-08-02")
+ after = pd.Timestamp("2020-08-04")
+
+ result = series.truncate(before=before, after=after)
+
+ # the input Series and the expected Series are the same
+ tm.assert_series_equal(result, series)
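
Illustration (not part of the patch): truncating a one-element Series no longer drops the single row when it falls inside the bounds. Sketch:

    import pandas as pd

    ser = pd.Series([0.1], index=pd.DatetimeIndex(["2020-08-04"]))
    ser.truncate(before="2020-08-02", after="2020-08-04")  # the row survives
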
diff --git a/pandas/tests/series/methods/test_unstack.py b/pandas/tests/series/methods/test_unstack.py
index cdf6a16e88ad0..d651315d64561 100644
--- a/pandas/tests/series/methods/test_unstack.py
+++ b/pandas/tests/series/methods/test_unstack.py
@@ -75,9 +75,7 @@ def test_unstack_tuplename_in_multiindex():
expected = pd.DataFrame(
[[1, 1, 1], [1, 1, 1], [1, 1, 1]],
- columns=pd.MultiIndex.from_tuples(
- [("a",), ("b",), ("c",)], names=[("A", "a")],
- ),
+ columns=pd.MultiIndex.from_tuples([("a",), ("b",), ("c",)], names=[("A", "a")]),
index=pd.Index([1, 2, 3], name=("B", "b")),
)
tm.assert_frame_equal(result, expected)
@@ -115,7 +113,7 @@ def test_unstack_mixed_type_name_in_multiindex(
result = ser.unstack(unstack_idx)
expected = pd.DataFrame(
- expected_values, columns=expected_columns, index=expected_index,
+ expected_values, columns=expected_columns, index=expected_index
)
tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index ab8618eb0a7d4..6ba55ce3c74b9 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -3,8 +3,6 @@
import numpy as np
import pytest
-import pandas.util._test_decorators as td
-
import pandas as pd
from pandas import DataFrame, Series
import pandas._testing as tm
@@ -130,7 +128,6 @@ def test_is_monotonic(self):
@pytest.mark.parametrize("func", [np.any, np.all])
@pytest.mark.parametrize("kwargs", [dict(keepdims=True), dict(out=object())])
- @td.skip_if_np_lt("1.15")
def test_validate_any_all_out_keepdims_raises(self, kwargs, func):
s = pd.Series([1, 2])
param = list(kwargs)[0]
@@ -144,7 +141,6 @@ def test_validate_any_all_out_keepdims_raises(self, kwargs, func):
with pytest.raises(ValueError, match=msg):
func(s, **kwargs)
- @td.skip_if_np_lt("1.15")
def test_validate_sum_initial(self):
s = pd.Series([1, 2])
msg = (
@@ -167,7 +163,6 @@ def test_validate_median_initial(self):
# method instead of the ufunc.
s.median(overwrite_input=True)
- @td.skip_if_np_lt("1.15")
def test_validate_stat_keepdims(self):
s = pd.Series([1, 2])
msg = (
@@ -185,7 +180,7 @@ def test_td64_summation_overflow(self):
# mean
result = (s - s.min()).mean()
- expected = pd.Timedelta((pd.TimedeltaIndex((s - s.min())).asi8 / len(s)).sum())
+ expected = pd.Timedelta((pd.TimedeltaIndex(s - s.min()).asi8 / len(s)).sum())
# the computation is converted to float so
# might be some loss of precision
diff --git a/pandas/tests/series/test_api.py b/pandas/tests/series/test_api.py
index 042841bb4e019..a69c0ee75eaba 100644
--- a/pandas/tests/series/test_api.py
+++ b/pandas/tests/series/test_api.py
@@ -5,6 +5,7 @@
import numpy as np
import pytest
+import pandas.util._test_decorators as td
from pandas.util._test_decorators import async_mark
import pandas as pd
@@ -179,7 +180,8 @@ def test_constructor_dict_timedelta_index(self):
def test_sparse_accessor_updates_on_inplace(self):
s = pd.Series([1, 1, 2, 3], dtype="Sparse[int]")
- s.drop([0, 1], inplace=True)
+ return_value = s.drop([0, 1], inplace=True)
+ assert return_value is None
assert s.sparse.density == 1.0
def test_tab_completion(self):
@@ -459,7 +461,8 @@ def f(x):
def test_str_accessor_updates_on_inplace(self):
s = pd.Series(list("abc"))
- s.drop([0], inplace=True)
+ return_value = s.drop([0], inplace=True)
+ assert return_value is None
assert len(s.str.lower()) == 2
def test_str_attribute(self):
@@ -484,6 +487,7 @@ def test_empty_method(self):
assert not full_series.empty
@async_mark()
+ @td.check_file_leaks
async def test_tab_complete_warning(self, ip):
# https://github.com/pandas-dev/pandas/issues/16409
pytest.importorskip("IPython", minversion="6.0.0")
@@ -520,6 +524,32 @@ def test_attrs(self):
result = s + 1
assert result.attrs == {"version": 1}
+ @pytest.mark.parametrize("allows_duplicate_labels", [True, False, None])
+ def test_set_flags(self, allows_duplicate_labels):
+ df = pd.Series([1, 2])
+ result = df.set_flags(allows_duplicate_labels=allows_duplicate_labels)
+ if allows_duplicate_labels is None:
+ # We don't update when it's not provided
+ assert result.flags.allows_duplicate_labels is True
+ else:
+ assert result.flags.allows_duplicate_labels is allows_duplicate_labels
+
+ # We made a copy
+ assert df is not result
+ # We didn't mutate df
+ assert df.flags.allows_duplicate_labels is True
+
+ # But we didn't copy data
+ result.iloc[0] = 0
+ assert df.iloc[0] == 0
+
+ # Now we do copy.
+ result = df.set_flags(
+ copy=True, allows_duplicate_labels=allows_duplicate_labels
+ )
+ result.iloc[0] = 10
+ assert df.iloc[0] == 0
+
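
Illustration (not part of the patch): set_flags returns a new object (a shallow copy by default, so data is shared) without mutating the original. Sketch:

    import pandas as pd

    s = pd.Series([1, 2])
    flagged = s.set_flags(allows_duplicate_labels=False)
    assert flagged is not s
    assert flagged.flags.allows_duplicate_labels is False
    assert s.flags.allows_duplicate_labels is True  # original untouched
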
class TestCategoricalSeries:
@pytest.mark.parametrize(
@@ -548,7 +578,8 @@ def test_cat_accessor(self):
assert not s.cat.ordered
exp = Categorical(["a", "b", np.nan, "a"], categories=["b", "a"])
- s.cat.set_categories(["b", "a"], inplace=True)
+ return_value = s.cat.set_categories(["b", "a"], inplace=True)
+ assert return_value is None
tm.assert_categorical_equal(s.values, exp)
res = s.cat.set_categories(["b", "a"])
@@ -579,8 +610,10 @@ def test_cat_accessor_no_new_attributes(self):
def test_cat_accessor_updates_on_inplace(self):
s = Series(list("abc")).astype("category")
- s.drop(0, inplace=True)
- s.cat.remove_unused_categories(inplace=True)
+ return_value = s.drop(0, inplace=True)
+ assert return_value is None
+ return_value = s.cat.remove_unused_categories(inplace=True)
+ assert return_value is None
assert len(s.cat.categories) == 2
def test_categorical_delegations(self):
@@ -614,7 +647,8 @@ def test_categorical_delegations(self):
assert s.cat.ordered
s = s.cat.as_unordered()
assert not s.cat.ordered
- s.cat.as_ordered(inplace=True)
+ return_value = s.cat.as_ordered(inplace=True)
+ assert return_value is None
assert s.cat.ordered
# reorder
diff --git a/pandas/tests/series/test_arithmetic.py b/pandas/tests/series/test_arithmetic.py
index 5c8a0d224c4f9..8fad6ee1cca8b 100644
--- a/pandas/tests/series/test_arithmetic.py
+++ b/pandas/tests/series/test_arithmetic.py
@@ -195,8 +195,8 @@ def test_add_with_duplicate_index(self):
tm.assert_series_equal(result, expected)
def test_add_na_handling(self):
- from decimal import Decimal
from datetime import date
+ from decimal import Decimal
s = Series(
[Decimal("1.3"), Decimal("2.3")], index=[date(2012, 1, 1), date(2012, 1, 2)]
@@ -260,75 +260,59 @@ def test_sub_datetimelike_align(self):
class TestSeriesFlexComparison:
- def test_comparison_flex_basic(self):
+ @pytest.mark.parametrize("axis", [0, None, "index"])
+ def test_comparison_flex_basic(self, axis, all_compare_operators):
+ op = all_compare_operators.strip("__")
+ left = pd.Series(np.random.randn(10))
+ right = pd.Series(np.random.randn(10))
+ result = getattr(left, op)(right, axis=axis)
+ expected = getattr(operator, op)(left, right)
+ tm.assert_series_equal(result, expected)
+
+ def test_comparison_bad_axis(self, all_compare_operators):
+ op = all_compare_operators.strip("__")
left = pd.Series(np.random.randn(10))
right = pd.Series(np.random.randn(10))
- tm.assert_series_equal(left.eq(right), left == right)
- tm.assert_series_equal(left.ne(right), left != right)
- tm.assert_series_equal(left.le(right), left < right)
- tm.assert_series_equal(left.lt(right), left <= right)
- tm.assert_series_equal(left.gt(right), left > right)
- tm.assert_series_equal(left.ge(right), left >= right)
-
- # axis
- for axis in [0, None, "index"]:
- tm.assert_series_equal(left.eq(right, axis=axis), left == right)
- tm.assert_series_equal(left.ne(right, axis=axis), left != right)
- tm.assert_series_equal(left.le(right, axis=axis), left < right)
- tm.assert_series_equal(left.lt(right, axis=axis), left <= right)
- tm.assert_series_equal(left.gt(right, axis=axis), left > right)
- tm.assert_series_equal(left.ge(right, axis=axis), left >= right)
-
- #
msg = "No axis named 1 for object type"
- for op in ["eq", "ne", "le", "le", "gt", "ge"]:
- with pytest.raises(ValueError, match=msg):
- getattr(left, op)(right, axis=1)
+ with pytest.raises(ValueError, match=msg):
+ getattr(left, op)(right, axis=1)
- def test_comparison_flex_alignment(self):
+ @pytest.mark.parametrize(
+ "values, op",
+ [
+ ([False, False, True, False], "eq"),
+ ([True, True, False, True], "ne"),
+ ([False, False, True, False], "le"),
+ ([False, False, False, False], "lt"),
+ ([False, True, True, False], "ge"),
+ ([False, True, False, False], "gt"),
+ ],
+ )
+ def test_comparison_flex_alignment(self, values, op):
left = Series([1, 3, 2], index=list("abc"))
right = Series([2, 2, 2], index=list("bcd"))
+ result = getattr(left, op)(right)
+ expected = pd.Series(values, index=list("abcd"))
+ tm.assert_series_equal(result, expected)
- exp = pd.Series([False, False, True, False], index=list("abcd"))
- tm.assert_series_equal(left.eq(right), exp)
-
- exp = pd.Series([True, True, False, True], index=list("abcd"))
- tm.assert_series_equal(left.ne(right), exp)
-
- exp = pd.Series([False, False, True, False], index=list("abcd"))
- tm.assert_series_equal(left.le(right), exp)
-
- exp = pd.Series([False, False, False, False], index=list("abcd"))
- tm.assert_series_equal(left.lt(right), exp)
-
- exp = pd.Series([False, True, True, False], index=list("abcd"))
- tm.assert_series_equal(left.ge(right), exp)
-
- exp = pd.Series([False, True, False, False], index=list("abcd"))
- tm.assert_series_equal(left.gt(right), exp)
-
- def test_comparison_flex_alignment_fill(self):
+ @pytest.mark.parametrize(
+ "values, op, fill_value",
+ [
+ ([False, False, True, True], "eq", 2),
+ ([True, True, False, False], "ne", 2),
+ ([False, False, True, True], "le", 0),
+ ([False, False, False, True], "lt", 0),
+ ([True, True, True, False], "ge", 0),
+ ([True, True, False, False], "gt", 0),
+ ],
+ )
+ def test_comparison_flex_alignment_fill(self, values, op, fill_value):
left = Series([1, 3, 2], index=list("abc"))
right = Series([2, 2, 2], index=list("bcd"))
-
- exp = pd.Series([False, False, True, True], index=list("abcd"))
- tm.assert_series_equal(left.eq(right, fill_value=2), exp)
-
- exp = pd.Series([True, True, False, False], index=list("abcd"))
- tm.assert_series_equal(left.ne(right, fill_value=2), exp)
-
- exp = pd.Series([False, False, True, True], index=list("abcd"))
- tm.assert_series_equal(left.le(right, fill_value=0), exp)
-
- exp = pd.Series([False, False, False, True], index=list("abcd"))
- tm.assert_series_equal(left.lt(right, fill_value=0), exp)
-
- exp = pd.Series([True, True, True, False], index=list("abcd"))
- tm.assert_series_equal(left.ge(right, fill_value=0), exp)
-
- exp = pd.Series([True, True, False, False], index=list("abcd"))
- tm.assert_series_equal(left.gt(right, fill_value=0), exp)
+ result = getattr(left, op)(right, fill_value=fill_value)
+ expected = pd.Series(values, index=list("abcd"))
+ tm.assert_series_equal(result, expected)
class TestSeriesComparison:
@@ -501,7 +485,7 @@ def test_unequal_categorical_comparison_raises_type_error(self):
# for unequal comps, but not for equal/not equal
cat = Series(Categorical(list("abc"), ordered=True))
- msg = "Cannot compare a Categorical for op.+with a scalar"
+ msg = "Invalid comparison between dtype=category and str"
with pytest.raises(TypeError, match=msg):
cat < "d"
with pytest.raises(TypeError, match=msg):
@@ -553,32 +537,30 @@ def test_comparison_tuples(self):
expected = Series([True, False])
tm.assert_series_equal(result, expected)
- def test_comparison_operators_with_nas(self):
+ def test_comparison_operators_with_nas(self, all_compare_operators):
+ op = all_compare_operators
ser = Series(bdate_range("1/1/2000", periods=10), dtype=object)
ser[::2] = np.nan
- # test that comparisons work
- ops = ["lt", "le", "gt", "ge", "eq", "ne"]
- for op in ops:
- val = ser[5]
+ f = getattr(operator, op)
- f = getattr(operator, op)
- result = f(ser, val)
+ # test that comparisons work
+ val = ser[5]
- expected = f(ser.dropna(), val).reindex(ser.index)
+ result = f(ser, val)
+ expected = f(ser.dropna(), val).reindex(ser.index)
- if op == "ne":
- expected = expected.fillna(True).astype(bool)
- else:
- expected = expected.fillna(False).astype(bool)
+ if op == "__ne__":
+ expected = expected.fillna(True).astype(bool)
+ else:
+ expected = expected.fillna(False).astype(bool)
- tm.assert_series_equal(result, expected)
+ tm.assert_series_equal(result, expected)
- # FIXME: dont leave commented-out
- # fffffffuuuuuuuuuuuu
- # result = f(val, s)
- # expected = f(val, s.dropna()).reindex(s.index)
- # tm.assert_series_equal(result, expected)
+ # FIXME: dont leave commented-out
+ # result = f(val, ser)
+ # expected = f(val, ser.dropna()).reindex(ser.index)
+ # tm.assert_series_equal(result, expected)
def test_ne(self):
ts = Series([3, 4, 5, 6, 7], [3, 4, 5, 6, 7], dtype=float)
@@ -586,35 +568,52 @@ def test_ne(self):
assert tm.equalContents(ts.index != 5, expected)
assert tm.equalContents(~(ts.index == 5), expected)
- def test_comp_ops_df_compat(self):
+ @pytest.mark.parametrize(
+ "left, right",
+ [
+ (
+ pd.Series([1, 2, 3], index=list("ABC"), name="x"),
+ pd.Series([2, 2, 2], index=list("ABD"), name="x"),
+ ),
+ (
+ pd.Series([1, 2, 3], index=list("ABC"), name="x"),
+ pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x"),
+ ),
+ ],
+ )
+ def test_comp_ops_df_compat(self, left, right):
# GH 1134
- s1 = pd.Series([1, 2, 3], index=list("ABC"), name="x")
- s2 = pd.Series([2, 2, 2], index=list("ABD"), name="x")
-
- s3 = pd.Series([1, 2, 3], index=list("ABC"), name="x")
- s4 = pd.Series([2, 2, 2, 2], index=list("ABCD"), name="x")
-
- for left, right in [(s1, s2), (s2, s1), (s3, s4), (s4, s3)]:
-
- msg = "Can only compare identically-labeled Series objects"
- with pytest.raises(ValueError, match=msg):
- left == right
+ msg = "Can only compare identically-labeled Series objects"
+ with pytest.raises(ValueError, match=msg):
+ left == right
+ with pytest.raises(ValueError, match=msg):
+ right == left
- with pytest.raises(ValueError, match=msg):
- left != right
+ with pytest.raises(ValueError, match=msg):
+ left != right
+ with pytest.raises(ValueError, match=msg):
+ right != left
- with pytest.raises(ValueError, match=msg):
- left < right
+ with pytest.raises(ValueError, match=msg):
+ left < right
+ with pytest.raises(ValueError, match=msg):
+ right < left
- msg = "Can only compare identically-labeled DataFrame objects"
- with pytest.raises(ValueError, match=msg):
- left.to_frame() == right.to_frame()
+ msg = "Can only compare identically-labeled DataFrame objects"
+ with pytest.raises(ValueError, match=msg):
+ left.to_frame() == right.to_frame()
+ with pytest.raises(ValueError, match=msg):
+ right.to_frame() == left.to_frame()
- with pytest.raises(ValueError, match=msg):
- left.to_frame() != right.to_frame()
+ with pytest.raises(ValueError, match=msg):
+ left.to_frame() != right.to_frame()
+ with pytest.raises(ValueError, match=msg):
+ right.to_frame() != left.to_frame()
- with pytest.raises(ValueError, match=msg):
- left.to_frame() < right.to_frame()
+ with pytest.raises(ValueError, match=msg):
+ left.to_frame() < right.to_frame()
+ with pytest.raises(ValueError, match=msg):
+ right.to_frame() < left.to_frame()
def test_compare_series_interval_keyword(self):
# GH#25338
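
Illustration (not part of the patch): the parametrized fill-value cases above reduce to alignment followed by filling before the comparison. Sketch:

    import pandas as pd

    left = pd.Series([1, 3, 2], index=list("abc"))
    right = pd.Series([2, 2, 2], index=list("bcd"))
    left.eq(right, fill_value=2)
    # "a" and "d" are missing on one side and filled with 2 first,
    # giving [False, False, True, True]
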
diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py
index 1dd410ad02ee0..1b5fddaf14335 100644
--- a/pandas/tests/series/test_constructors.py
+++ b/pandas/tests/series/test_constructors.py
@@ -8,16 +8,23 @@
from pandas._libs import iNaT, lib
from pandas.core.dtypes.common import is_categorical_dtype, is_datetime64tz_dtype
-from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas.core.dtypes.dtypes import (
+ CategoricalDtype,
+ DatetimeTZDtype,
+ IntervalDtype,
+ PeriodDtype,
+)
import pandas as pd
from pandas import (
Categorical,
DataFrame,
Index,
+ Interval,
IntervalIndex,
MultiIndex,
NaT,
+ Period,
Series,
Timestamp,
date_range,
@@ -42,7 +49,7 @@ class TestSeriesConstructors:
(lambda: Series({}), True),
(lambda: Series(()), False), # creates a RangeIndex
(lambda: Series([]), False), # creates a RangeIndex
- (lambda: Series((_ for _ in [])), False), # creates a RangeIndex
+ (lambda: Series(_ for _ in []), False), # creates a RangeIndex
(lambda: Series(data=None), True),
(lambda: Series(data={}), True),
(lambda: Series(data=()), False), # creates a RangeIndex
@@ -215,8 +222,7 @@ def test_constructor_iterable(self):
# GH 21987
class Iter:
def __iter__(self):
- for i in range(10):
- yield i
+ yield from range(10)
expected = Series(list(range(10)), dtype="int64")
result = Series(Iter(), dtype="int64")
@@ -1075,6 +1081,26 @@ def test_constructor_dict_order(self):
expected = Series([1, 0, 2], index=list("bac"))
tm.assert_series_equal(result, expected)
+ @pytest.mark.parametrize(
+ "data,dtype",
+ [
+ (Period("2020-01"), PeriodDtype("M")),
+ (Interval(left=0, right=5), IntervalDtype("int64")),
+ (
+ Timestamp("2011-01-01", tz="US/Eastern"),
+ DatetimeTZDtype(tz="US/Eastern"),
+ ),
+ ],
+ )
+ def test_constructor_dict_extension(self, data, dtype):
+ d = {"a": data}
+ result = Series(d, index=["a"])
+ expected = Series(data, index=["a"], dtype=dtype)
+
+ assert result.dtype == dtype
+
+ tm.assert_series_equal(result, expected)
+
@pytest.mark.parametrize("value", [2, np.nan, None, float("nan")])
def test_constructor_dict_nan_key(self, value):
# GH 18480
@@ -1449,3 +1475,35 @@ def test_constructor_datetimelike_scalar_to_string_dtype(self):
result = Series("M", index=[1, 2, 3], dtype="string")
expected = pd.Series(["M", "M", "M"], index=[1, 2, 3], dtype="string")
tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "values",
+ [
+ [np.datetime64("2012-01-01"), np.datetime64("2013-01-01")],
+ ["2012-01-01", "2013-01-01"],
+ ],
+ )
+ def test_constructor_sparse_datetime64(self, values):
+ # https://github.com/pandas-dev/pandas/issues/35762
+ dtype = pd.SparseDtype("datetime64[ns]")
+ result = pd.Series(values, dtype=dtype)
+ arr = pd.arrays.SparseArray(values, dtype=dtype)
+ expected = pd.Series(arr)
+ tm.assert_series_equal(result, expected)
+
+ def test_construction_from_ordered_collection(self):
+ # https://github.com/pandas-dev/pandas/issues/36044
+ result = Series({"a": 1, "b": 2}.keys())
+ expected = Series(["a", "b"])
+ tm.assert_series_equal(result, expected)
+
+ result = Series({"a": 1, "b": 2}.values())
+ expected = Series([1, 2])
+ tm.assert_series_equal(result, expected)
+
+ def test_construction_from_large_int_scalar_no_overflow(self):
+ # https://github.com/pandas-dev/pandas/issues/36291
+ n = 1_000_000_000_000_000_000_000
+ result = Series(n, index=[0])
+ expected = Series(n)
+ tm.assert_series_equal(result, expected)
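
Illustration (not part of the patch): two of the constructor fixes above are that dict views are accepted as ordered collections, and that large Python ints no longer overflow when broadcast against an index. Sketch:

    import pandas as pd

    pd.Series({"a": 1, "b": 2}.keys())  # Series(["a", "b"])
    pd.Series(10**21, index=[0])        # object dtype, value intact
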
diff --git a/pandas/tests/series/test_cumulative.py b/pandas/tests/series/test_cumulative.py
index 0b4c5f091106a..e070b86717503 100644
--- a/pandas/tests/series/test_cumulative.py
+++ b/pandas/tests/series/test_cumulative.py
@@ -17,7 +17,7 @@
def _check_accum_op(name, series, check_dtype=True):
func = getattr(np, name)
tm.assert_numpy_array_equal(
- func(series).values, func(np.array(series)), check_dtype=check_dtype,
+ func(series).values, func(np.array(series)), check_dtype=check_dtype
)
# with missing values
diff --git a/pandas/tests/series/test_datetime_values.py b/pandas/tests/series/test_datetime_values.py
index 0fd51b8828bc5..723bd303b1974 100644
--- a/pandas/tests/series/test_datetime_values.py
+++ b/pandas/tests/series/test_datetime_values.py
@@ -625,7 +625,8 @@ def test_dt_accessor_invalid(self, ser):
def test_dt_accessor_updates_on_inplace(self):
s = Series(pd.date_range("2018-01-01", periods=10))
s[2] = None
- s.fillna(pd.Timestamp("2018-01-01"), inplace=True)
+ return_value = s.fillna(pd.Timestamp("2018-01-01"), inplace=True)
+ assert return_value is None
result = s.dt.date
assert result[0] == result[2]
@@ -681,6 +682,9 @@ def test_setitem_with_different_tz(self):
[[pd.NaT], [[np.NaN, np.NaN, np.NaN]]],
[["2019-12-31", "2019-12-29"], [[2020, 1, 2], [2019, 52, 7]]],
[["2010-01-01", pd.NaT], [[2009, 53, 5], [np.NaN, np.NaN, np.NaN]]],
+ # see GH#36032
+ [["2016-01-08", "2016-01-04"], [[2016, 1, 5], [2016, 1, 1]]],
+ [["2016-01-07", "2016-01-01"], [[2016, 1, 4], [2015, 53, 5]]],
],
)
def test_isocalendar(self, input_series, expected_output):
diff --git a/pandas/tests/series/test_dtypes.py b/pandas/tests/series/test_dtypes.py
index bcc0b18134dad..ae89e16ca7667 100644
--- a/pandas/tests/series/test_dtypes.py
+++ b/pandas/tests/series/test_dtypes.py
@@ -137,13 +137,13 @@ def test_astype_str_cast_dt64(self):
ts = Series([Timestamp("2010-01-04 00:00:00")])
s = ts.astype(str)
- expected = Series([str("2010-01-04")])
+ expected = Series(["2010-01-04"])
tm.assert_series_equal(s, expected)
ts = Series([Timestamp("2010-01-04 00:00:00", tz="US/Eastern")])
s = ts.astype(str)
- expected = Series([str("2010-01-04 00:00:00-05:00")])
+ expected = Series(["2010-01-04 00:00:00-05:00"])
tm.assert_series_equal(s, expected)
def test_astype_str_cast_td64(self):
@@ -152,7 +152,7 @@ def test_astype_str_cast_td64(self):
td = Series([Timedelta(1, unit="d")])
ser = td.astype(str)
- expected = Series([str("1 days")])
+ expected = Series(["1 days"])
tm.assert_series_equal(ser, expected)
def test_astype_unicode(self):
@@ -167,7 +167,7 @@ def test_astype_unicode(self):
former_encoding = None
if sys.getdefaultencoding() == "utf-8":
- test_series.append(Series(["野菜食べないとやばい".encode("utf-8")]))
+ test_series.append(Series(["野菜食べないとやばい".encode()]))
for s in test_series:
res = s.astype("unicode")
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index 708118e950686..b12ebd58e6a7b 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -66,12 +66,11 @@ def test_from_csv(self, datetime_series, string_series):
tm.assert_series_equal(check_series, series)
def test_to_csv(self, datetime_series):
- import io
with tm.ensure_clean() as path:
datetime_series.to_csv(path, header=False)
- with io.open(path, newline=None) as f:
+ with open(path, newline=None) as f:
lines = f.readlines()
assert lines[1] != "\n"
diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
index 162778e372426..0144e4257efe0 100644
--- a/pandas/tests/series/test_missing.py
+++ b/pandas/tests/series/test_missing.py
@@ -453,7 +453,8 @@ def test_fillna_downcast(self):
def test_fillna_int(self):
s = Series(np.random.randint(-100, 100, 50))
- s.fillna(method="ffill", inplace=True)
+ return_value = s.fillna(method="ffill", inplace=True)
+ assert return_value is None
tm.assert_series_equal(s.fillna(method="ffill", inplace=False), s)
def test_categorical_nan_equality(self):
@@ -680,7 +681,8 @@ def test_dropna_empty(self):
s = Series([], dtype=object)
assert len(s.dropna()) == 0
- s.dropna(inplace=True)
+ return_value = s.dropna(inplace=True)
+ assert return_value is None
assert len(s) == 0
# invalid axis
@@ -729,7 +731,8 @@ def test_dropna_no_nan(self):
assert result is not s
s2 = s.copy()
- s2.dropna(inplace=True)
+ return_value = s2.dropna(inplace=True)
+ assert return_value is None
tm.assert_series_equal(s2, s)
def test_dropna_intervals(self):
@@ -775,7 +778,8 @@ def test_pad_nan(self):
[np.nan, 1.0, np.nan, 3.0, np.nan], ["z", "a", "b", "c", "d"], dtype=float
)
- x.fillna(method="pad", inplace=True)
+ return_value = x.fillna(method="pad", inplace=True)
+ assert return_value is None
expected = Series(
[np.nan, 1.0, 1.0, 3.0, 3.0], ["z", "a", "b", "c", "d"], dtype=float
@@ -799,7 +803,8 @@ def test_dropna_preserve_name(self, datetime_series):
assert result.name == datetime_series.name
name = datetime_series.name
ts = datetime_series.copy()
- ts.dropna(inplace=True)
+ return_value = ts.dropna(inplace=True)
+ assert return_value is None
assert ts.name == name
def test_series_fillna_limit(self):
diff --git a/pandas/tests/series/test_operators.py b/pandas/tests/series/test_operators.py
index e1c9682329271..aee947e738525 100644
--- a/pandas/tests/series/test_operators.py
+++ b/pandas/tests/series/test_operators.py
@@ -536,3 +536,44 @@ def test_invert(self):
ser = tm.makeStringSeries()
ser.name = "series"
tm.assert_series_equal(-(ser < 0), ~(ser < 0))
+
+ @pytest.mark.parametrize(
+ "source, target",
+ [
+ ([1, 2, 3], [-1, -2, -3]),
+ ([1, 2, None], [-1, -2, None]),
+ ([-1, 0, 1], [1, 0, -1]),
+ ],
+ )
+ def test_unary_minus_nullable_int(
+ self, any_signed_nullable_int_dtype, source, target
+ ):
+ dtype = any_signed_nullable_int_dtype
+ s = pd.Series(source, dtype=dtype)
+ result = -s
+ expected = pd.Series(target, dtype=dtype)
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "source", [[1, 2, 3], [1, 2, None], [-1, 0, 1]],
+ )
+ def test_unary_plus_nullable_int(self, any_signed_nullable_int_dtype, source):
+ dtype = any_signed_nullable_int_dtype
+ expected = pd.Series(source, dtype=dtype)
+ result = +expected
+ tm.assert_series_equal(result, expected)
+
+ @pytest.mark.parametrize(
+ "source, target",
+ [
+ ([1, 2, 3], [1, 2, 3]),
+ ([1, -2, None], [1, 2, None]),
+ ([-1, 0, 1], [1, 0, 1]),
+ ],
+ )
+ def test_abs_nullable_int(self, any_signed_nullable_int_dtype, source, target):
+ dtype = any_signed_nullable_int_dtype
+ s = pd.Series(source, dtype=dtype)
+ result = abs(s)
+ expected = pd.Series(target, dtype=dtype)
+ tm.assert_series_equal(result, expected)
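
Illustration (not part of the patch): unary -, +, and abs() now work on nullable integer dtypes, propagating missing values. Sketch:

    import pandas as pd

    s = pd.Series([1, -2, None], dtype="Int64")
    -s      # Int64: [-1, 2, <NA>]
    abs(s)  # Int64: [1, 2, <NA>]
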
diff --git a/pandas/tests/series/test_subclass.py b/pandas/tests/series/test_subclass.py
index a596ed49c1df2..86330b7cc6993 100644
--- a/pandas/tests/series/test_subclass.py
+++ b/pandas/tests/series/test_subclass.py
@@ -51,3 +51,11 @@ def test_explode(self):
s = tm.SubclassedSeries([[1, 2, 3], "foo", [], [3, 4]])
result = s.explode()
assert isinstance(result, tm.SubclassedSeries)
+
+ def test_equals(self):
+ # https://github.com/pandas-dev/pandas/pull/34402
+ # allow subclass in both directions
+ s1 = pd.Series([1, 2, 3])
+ s2 = tm.SubclassedSeries([1, 2, 3])
+ assert s1.equals(s2)
+ assert s2.equals(s1)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index a080bf0feaebc..6102f43f4db6a 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -6,7 +6,9 @@
from numpy.random import RandomState
import pytest
-from pandas._libs import algos as libalgos, groupby as libgroupby, hashtable as ht
+from pandas._libs import algos as libalgos, hashtable as ht
+from pandas._libs.groupby import group_var_float32, group_var_float64
+from pandas.compat import IS64
from pandas.compat.numpy import np_array_datetime64_compat
import pandas.util._test_decorators as td
@@ -28,7 +30,6 @@
IntervalIndex,
Series,
Timestamp,
- compat,
)
import pandas._testing as tm
import pandas.core.algorithms as algos
@@ -251,6 +252,19 @@ def test_object_factorize(self, writable):
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_numpy_array_equal(uniques, expected_uniques)
+ def test_datetime64_factorize(self, writable):
+ # GH35650 Verify whether read-only datetime64 array can be factorized
+ data = np.array([np.datetime64("2020-01-01T00:00:00.000")])
+ data.setflags(write=writable)
+ expected_codes = np.array([0], dtype=np.intp)
+ expected_uniques = np.array(
+ ["2020-01-01T00:00:00.000000000"], dtype="datetime64[ns]"
+ )
+
+ codes, uniques = pd.factorize(data)
+ tm.assert_numpy_array_equal(codes, expected_codes)
+ tm.assert_numpy_array_equal(uniques, expected_uniques)
+
def test_deprecate_order(self):
# gh 19727 - check warning is raised for deprecated keyword, order.
# Test not valid once order keyword is removed.
@@ -289,7 +303,7 @@ def test_parametrized_factorize_na_value_default(self, data):
],
)
def test_parametrized_factorize_na_value(self, data, na_value):
- codes, uniques = algos._factorize_array(data, na_value=na_value)
+ codes, uniques = algos.factorize_array(data, na_value=na_value)
expected_uniques = data[[1, 3]]
expected_codes = np.array([-1, 0, -1, 1], dtype=np.intp)
tm.assert_numpy_array_equal(codes, expected_codes)
@@ -326,73 +340,47 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
tm.assert_extension_array_equal(uniques, expected_uniques)
@pytest.mark.parametrize(
- "data, dropna, expected_codes, expected_uniques",
+ "data, expected_codes, expected_uniques",
[
(
["a", None, "b", "a"],
- True,
- np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
- np.array(["a", "b"], dtype=object),
- ),
- (
- ["a", np.nan, "b", "a"],
- True,
- np.array([0, -1, 1, 0], dtype=np.dtype("intp")),
- np.array(["a", "b"], dtype=object),
- ),
- (
- ["a", None, "b", "a"],
- False,
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b", np.nan], dtype=object),
),
(
["a", np.nan, "b", "a"],
- False,
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
np.array(["a", "b", np.nan], dtype=object),
),
],
)
- def test_object_factorize_dropna(
- self, data, dropna, expected_codes, expected_uniques
+ def test_object_factorize_na_sentinel_none(
+ self, data, expected_codes, expected_uniques
):
- codes, uniques = algos.factorize(data, dropna=dropna)
+ codes, uniques = algos.factorize(data, na_sentinel=None)
tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)
@pytest.mark.parametrize(
- "data, dropna, expected_codes, expected_uniques",
+ "data, expected_codes, expected_uniques",
[
(
[1, None, 1, 2],
- True,
- np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
- np.array([1, 2], dtype="O"),
- ),
- (
- [1, np.nan, 1, 2],
- True,
- np.array([0, -1, 0, 1], dtype=np.dtype("intp")),
- np.array([1, 2], dtype=np.float64),
- ),
- (
- [1, None, 1, 2],
- False,
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2, np.nan], dtype="O"),
),
(
[1, np.nan, 1, 2],
- False,
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
np.array([1, 2, np.nan], dtype=np.float64),
),
],
)
- def test_int_factorize_dropna(self, data, dropna, expected_codes, expected_uniques):
- codes, uniques = algos.factorize(data, dropna=dropna)
+ def test_int_factorize_na_sentinel_none(
+ self, data, expected_codes, expected_uniques
+ ):
+ codes, uniques = algos.factorize(data, na_sentinel=None)
tm.assert_numpy_array_equal(uniques, expected_uniques)
tm.assert_numpy_array_equal(codes, expected_codes)
@@ -813,7 +801,6 @@ def test_i8(self):
tm.assert_numpy_array_equal(result, expected)
def test_large(self):
-
s = pd.date_range("20000101", periods=2000000, freq="s").values
result = algos.isin(s, s[0:2])
expected = np.zeros(len(s), dtype=bool)
@@ -853,6 +840,23 @@ def test_same_nan_is_in(self):
result = algos.isin(comps, values)
tm.assert_numpy_array_equal(expected, result)
+ def test_same_nan_is_in_large(self):
+ # https://github.com/pandas-dev/pandas/issues/22205
+ s = np.tile(1.0, 1_000_001)
+ s[0] = np.nan
+ result = algos.isin(s, [np.nan, 1])
+ expected = np.ones(len(s), dtype=bool)
+ tm.assert_numpy_array_equal(result, expected)
+
+ def test_same_nan_is_in_large_series(self):
+ # https://github.com/pandas-dev/pandas/issues/22205
+ s = np.tile(1.0, 1_000_001)
+ series = pd.Series(s)
+ s[0] = np.nan
+ result = series.isin([np.nan, 1])
+ expected = pd.Series(np.ones(len(s), dtype=bool))
+ tm.assert_series_equal(result, expected)
+
def test_same_object_is_in(self):
# GH 22160
# there could be special treatment for nans
@@ -956,7 +960,7 @@ def test_isin_int_df_string_search(self):
@pytest.mark.xfail(reason="problem related with issue #34125")
def test_isin_nan_df_string_search(self):
"""Comparing df with nan value (np.nan,2) with a string at isin() ("NaN")
- -> should not match values because np.nan is not equal str NaN """
+ -> should not match values because np.nan is not equal str NaN"""
df = pd.DataFrame({"values": [np.nan, 2]})
result = df.isin(["NaN"])
expected_false = pd.DataFrame({"values": [False, False]})
@@ -1149,7 +1153,7 @@ def test_dropna(self):
)
# 32-bit linux has a different ordering
- if not compat.is_platform_32bit():
+ if IS64:
result = Series([10.3, 5.0, 5.0, None]).value_counts(dropna=False)
expected = Series([2, 1, 1], index=[5.0, 10.3, np.nan])
tm.assert_series_equal(result, expected)
@@ -1182,7 +1186,7 @@ def test_value_counts_uint64(self):
result = algos.value_counts(arr)
# 32-bit linux has a different ordering
- if not compat.is_platform_32bit():
+ if IS64:
tm.assert_series_equal(result, expected)
@@ -1493,7 +1497,7 @@ def test_group_var_constant(self):
class TestGroupVarFloat64(GroupVarTestMixin):
__test__ = True
- algo = staticmethod(libgroupby.group_var_float64)
+ algo = staticmethod(group_var_float64)
dtype = np.float64
rtol = 1e-5
@@ -1516,7 +1520,7 @@ def test_group_var_large_inputs(self):
class TestGroupVarFloat32(GroupVarTestMixin):
__test__ = True
- algo = staticmethod(libgroupby.group_var_float32)
+ algo = staticmethod(group_var_float32)
dtype = np.float32
rtol = 1e-2
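
Illustration (not part of the patch): the factorize tests above reflect the replacement of the short-lived dropna keyword by na_sentinel=None, which assigns missing values a real code instead of -1. Sketch:

    import pandas as pd

    codes, uniques = pd.factorize(["a", None, "b", "a"], na_sentinel=None)
    # codes -> [0, 2, 1, 0]; uniques -> ["a", "b", nan]
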
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index bcfed2d0d3a10..f7f3f1fa0c13d 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -6,10 +6,11 @@
import numpy as np
import pytest
-from pandas.compat.numpy import _np_version_under1p17
+from pandas.compat.numpy import np_version_under1p17
import pandas as pd
from pandas import Series, Timestamp
+import pandas._testing as tm
from pandas.core import ops
import pandas.core.common as com
@@ -71,7 +72,7 @@ def test_random_state():
# Check BitGenerators
# GH32503
- if not _np_version_under1p17:
+ if not np_version_under1p17:
assert (
com.random_state(npr.MT19937(3)).uniform()
== npr.RandomState(npr.MT19937(3)).uniform()
@@ -157,3 +158,12 @@ def test_version_tag():
raise ValueError(
"No git tags exist, please sync tags between upstream and your repo"
)
+
+
+@pytest.mark.parametrize(
+ "obj", [(obj,) for obj in pd.__dict__.values() if callable(obj)]
+)
+def test_serializable(obj):
+ # GH 35611
+ unpickled = tm.round_trip_pickle(obj)
+ assert type(obj) == type(unpickled)
diff --git a/pandas/tests/test_downstream.py b/pandas/tests/test_downstream.py
index e718a6b759963..b32c5e91af295 100644
--- a/pandas/tests/test_downstream.py
+++ b/pandas/tests/test_downstream.py
@@ -90,7 +90,7 @@ def test_statsmodels():
def test_scikit_learn(df):
sklearn = import_module("sklearn") # noqa
- from sklearn import svm, datasets
+ from sklearn import datasets, svm
digits = datasets.load_digits()
clf = svm.SVC(gamma=0.001, C=100.0)
diff --git a/pandas/tests/test_expressions.py b/pandas/tests/test_expressions.py
index 2368e93ddc256..da7f8b9b4a721 100644
--- a/pandas/tests/test_expressions.py
+++ b/pandas/tests/test_expressions.py
@@ -35,7 +35,7 @@
)
-@pytest.mark.skipif(not expr._USE_NUMEXPR, reason="not using numexpr")
+@pytest.mark.skipif(not expr.USE_NUMEXPR, reason="not using numexpr")
class TestExpressions:
def setup_method(self, method):
diff --git a/pandas/tests/test_flags.py b/pandas/tests/test_flags.py
new file mode 100644
index 0000000000000..f6e3ae4980afb
--- /dev/null
+++ b/pandas/tests/test_flags.py
@@ -0,0 +1,48 @@
+import pytest
+
+import pandas as pd
+
+
+class TestFlags:
+ def test_equality(self):
+ a = pd.DataFrame().set_flags(allows_duplicate_labels=True).flags
+ b = pd.DataFrame().set_flags(allows_duplicate_labels=False).flags
+
+ assert a == a
+ assert b == b
+ assert a != b
+ assert a != 2
+
+ def test_set(self):
+ df = pd.DataFrame().set_flags(allows_duplicate_labels=True)
+ a = df.flags
+ a.allows_duplicate_labels = False
+ assert a.allows_duplicate_labels is False
+ a["allows_duplicate_labels"] = True
+ assert a.allows_duplicate_labels is True
+
+ def test_repr(self):
+ a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=True).flags)
+ assert a == ""
+ a = repr(pd.DataFrame({"A"}).set_flags(allows_duplicate_labels=False).flags)
+ assert a == ""
+
+ def test_obj_ref(self):
+ df = pd.DataFrame()
+ flags = df.flags
+ del df
+ with pytest.raises(ValueError, match="object has been deleted"):
+ flags.allows_duplicate_labels = True
+
+ def test_getitem(self):
+ df = pd.DataFrame()
+ flags = df.flags
+ assert flags["allows_duplicate_labels"] is True
+ flags["allows_duplicate_labels"] = False
+ assert flags["allows_duplicate_labels"] is False
+
+ with pytest.raises(KeyError):
+ flags["a"]
+
+ with pytest.raises(ValueError):
+ flags["a"] = 10
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 1ba73292dc0b4..274860b3fdb5c 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -63,8 +63,8 @@ def setup_method(self, method):
).sum()
# use Int64Index, to make sure things work
- self.ymd.index.set_levels(
- [lev.astype("i8") for lev in self.ymd.index.levels], inplace=True
+ self.ymd.index = self.ymd.index.set_levels(
+ [lev.astype("i8") for lev in self.ymd.index.levels]
)
self.ymd.index.set_names(["year", "month", "day"], inplace=True)
@@ -1846,7 +1846,7 @@ def test_multilevel_index_loc_order(self, dim, keys, expected):
# GH 22797
# Try to respect order of keys given for MultiIndex.loc
kwargs = {dim: [["c", "a", "a", "b", "b"], [1, 1, 2, 1, 2]]}
- df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs,)
+ df = pd.DataFrame(np.arange(25).reshape(5, 5), **kwargs)
exp_index = MultiIndex.from_arrays(expected)
if dim == "index":
res = df.loc[keys, :]
diff --git a/pandas/tests/test_nanops.py b/pandas/tests/test_nanops.py
index 0d60e6e8a978f..c45e4508c6153 100644
--- a/pandas/tests/test_nanops.py
+++ b/pandas/tests/test_nanops.py
@@ -285,7 +285,7 @@ def test_nansum(self, skipna):
def test_nanmean(self, skipna):
self.check_funs(
- nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False,
+ nanops.nanmean, np.mean, skipna, allow_obj=False, allow_date=False
)
def test_nanmean_overflow(self):
diff --git a/pandas/tests/test_sorting.py b/pandas/tests/test_sorting.py
index 98297474243e4..deb7434694d01 100644
--- a/pandas/tests/test_sorting.py
+++ b/pandas/tests/test_sorting.py
@@ -60,6 +60,7 @@ def test_int64_overflow(self):
assert left[k] == v
assert len(left) == len(right)
+ @pytest.mark.arm_slow
def test_int64_overflow_moar(self):
# GH9096
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
index d9396d70f9112..c792a48d3ef08 100644
--- a/pandas/tests/test_strings.py
+++ b/pandas/tests/test_strings.py
@@ -29,6 +29,8 @@ def assert_series_or_index_equal(left, right):
("decode", ("UTF-8",), {}),
("encode", ("UTF-8",), {}),
("endswith", ("a",), {}),
+ ("endswith", ("a",), {"na": True}),
+ ("endswith", ("a",), {"na": False}),
("extract", ("([a-z]*)",), {"expand": False}),
("extract", ("([a-z]*)",), {"expand": True}),
("extractall", ("([a-z]*)",), {}),
@@ -58,6 +60,8 @@ def assert_series_or_index_equal(left, right):
("split", (" ",), {"expand": False}),
("split", (" ",), {"expand": True}),
("startswith", ("a",), {}),
+ ("startswith", ("a",), {"na": True}),
+ ("startswith", ("a",), {"na": False}),
# translating unicode points of "a" to "d"
("translate", ({97: 100},), {}),
("wrap", (2,), {}),
@@ -838,15 +842,23 @@ def test_contains_for_object_category(self):
expected = Series([True, False, False, True, False])
tm.assert_series_equal(result, expected)
- def test_startswith(self):
- values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"])
+ @pytest.mark.parametrize("dtype", [None, "category"])
+ @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
+ @pytest.mark.parametrize("na", [True, False])
+ def test_startswith(self, dtype, null_value, na):
+ # add category dtype parametrizations for GH-36241
+ values = Series(
+ ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
+ dtype=dtype,
+ )
result = values.str.startswith("foo")
exp = Series([False, np.nan, True, False, False, np.nan, True])
tm.assert_series_equal(result, exp)
- result = values.str.startswith("foo", na=True)
- tm.assert_series_equal(result, exp.fillna(True).astype(bool))
+ result = values.str.startswith("foo", na=na)
+ exp = Series([False, na, True, False, False, na, True])
+ tm.assert_series_equal(result, exp)
# mixed
mixed = np.array(
@@ -867,15 +879,23 @@ def test_startswith(self):
)
tm.assert_series_equal(rs, xp)
- def test_endswith(self):
- values = Series(["om", np.nan, "foo_nom", "nom", "bar_foo", np.nan, "foo"])
+ @pytest.mark.parametrize("dtype", [None, "category"])
+ @pytest.mark.parametrize("null_value", [None, np.nan, pd.NA])
+ @pytest.mark.parametrize("na", [True, False])
+ def test_endswith(self, dtype, null_value, na):
+ # add category dtype parametrizations for GH-36241
+ values = Series(
+ ["om", null_value, "foo_nom", "nom", "bar_foo", null_value, "foo"],
+ dtype=dtype,
+ )
result = values.str.endswith("foo")
exp = Series([False, np.nan, False, False, True, np.nan, True])
tm.assert_series_equal(result, exp)
- result = values.str.endswith("foo", na=False)
- tm.assert_series_equal(result, exp.fillna(False).astype(bool))
+ result = values.str.endswith("foo", na=na)
+ exp = Series([False, na, False, False, True, na, True])
+ tm.assert_series_equal(result, exp)
# mixed
mixed = np.array(
@@ -3552,6 +3572,10 @@ def test_string_array(any_string_method):
assert result.dtype == "boolean"
result = result.astype(object)
+ elif expected.dtype == "bool":
+ assert result.dtype == "boolean"
+ result = result.astype("bool")
+
elif expected.dtype == "float" and expected.isna().any():
assert result.dtype == "Int64"
result = result.astype("float")
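
The new parametrizations exercise the na= fill value for startswith/endswith across object and category dtypes and several null sentinels. The behavior being locked in, in isolation:

    import numpy as np
    import pandas as pd

    s = pd.Series(["om", np.nan, "foo_nom"])
    s.str.startswith("foo")            # missing values propagate as NaN
    s.str.startswith("foo", na=False)  # missing values are filled with False
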
diff --git a/pandas/tests/tools/test_to_datetime.py b/pandas/tests/tools/test_to_datetime.py
index d2049892705ea..819474e1f32e7 100644
--- a/pandas/tests/tools/test_to_datetime.py
+++ b/pandas/tests/tools/test_to_datetime.py
@@ -1217,10 +1217,10 @@ def test_unit_mixed(self, cache):
@pytest.mark.parametrize("cache", [True, False])
def test_unit_rounding(self, cache):
- # GH 14156: argument will incur floating point errors but no
- # premature rounding
+ # GH 14156 & GH 20445: argument will incur floating point errors
+ # but no premature rounding
result = pd.to_datetime(1434743731.8770001, unit="s", cache=cache)
- expected = pd.Timestamp("2015-06-19 19:55:31.877000093")
+ expected = pd.Timestamp("2015-06-19 19:55:31.877000192")
assert result == expected
@pytest.mark.parametrize("cache", [True, False])
@@ -1454,6 +1454,8 @@ def test_to_datetime_unit(self):
]
+ [NaT]
)
+ # GH20455 argument will incur floating point errors but no premature rounding
+ result = result.round("ms")
tm.assert_series_equal(result, expected)
s = pd.concat(
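
The updated expectation reflects that 1434743731.8770001 cannot be represented exactly as a double: the nearest representable value converts to ...877000192 ns, and to_datetime now preserves it rather than rounding early.

    import pandas as pd

    pd.to_datetime(1434743731.8770001, unit="s")
    # Timestamp('2015-06-19 19:55:31.877000192')
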
diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py
index 263887a8ea36e..450076f2824ad 100644
--- a/pandas/tests/tools/test_to_numeric.py
+++ b/pandas/tests/tools/test_to_numeric.py
@@ -649,3 +649,61 @@ def test_failure_to_convert_uint64_string_to_NaN():
ser = Series([32, 64, np.nan])
result = to_numeric(pd.Series(["32", "64", "uint64"]), errors="coerce")
tm.assert_series_equal(result, ser)
+
+
+@pytest.mark.parametrize(
+ "strrep",
+ [
+ "243.164",
+ "245.968",
+ "249.585",
+ "259.745",
+ "265.742",
+ "272.567",
+ "279.196",
+ "280.366",
+ "275.034",
+ "271.351",
+ "272.889",
+ "270.627",
+ "280.828",
+ "290.383",
+ "308.153",
+ "319.945",
+ "336.0",
+ "344.09",
+ "351.385",
+ "356.178",
+ "359.82",
+ "361.03",
+ "367.701",
+ "380.812",
+ "387.98",
+ "391.749",
+ "391.171",
+ "385.97",
+ "385.345",
+ "386.121",
+ "390.996",
+ "399.734",
+ "413.073",
+ "421.532",
+ "430.221",
+ "437.092",
+ "439.746",
+ "446.01",
+ "451.191",
+ "460.463",
+ "469.779",
+ "472.025",
+ "479.49",
+ "474.864",
+ "467.54",
+ "471.978",
+ ],
+)
+def test_precision_float_conversion(strrep):
+ # GH 31364
+ result = to_numeric(strrep)
+
+ assert result == float(strrep)
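
Each parametrized string is expected to parse to exactly the value Python's float() produces, i.e. to_numeric must not lose precision relative to the round-to-nearest double:

    from pandas import to_numeric

    assert to_numeric("390.996") == float("390.996")
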
diff --git a/pandas/tests/tools/test_to_timedelta.py b/pandas/tests/tools/test_to_timedelta.py
index 1e193f22a6698..f68d83f7f4d58 100644
--- a/pandas/tests/tools/test_to_timedelta.py
+++ b/pandas/tests/tools/test_to_timedelta.py
@@ -166,3 +166,16 @@ def test_to_timedelta_ignore_strings_unit(self):
arr = np.array([1, 2, "error"], dtype=object)
result = pd.to_timedelta(arr, unit="ns", errors="ignore")
tm.assert_numpy_array_equal(result, arr)
+
+ def test_to_timedelta_nullable_int64_dtype(self):
+ # GH 35574
+ expected = Series([timedelta(days=1), timedelta(days=2)])
+ result = to_timedelta(Series([1, 2], dtype="Int64"), unit="days")
+
+ tm.assert_series_equal(result, expected)
+
+ # IntegerArray Series with nulls
+ expected = Series([timedelta(days=1), None])
+ result = to_timedelta(Series([1, None], dtype="Int64"), unit="days")
+
+ tm.assert_series_equal(result, expected)
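
A usage sketch of the nullable-integer path this test covers; NA slots come back as NaT:

    import pandas as pd

    pd.to_timedelta(pd.Series([1, None], dtype="Int64"), unit="days")
    # 0   1 days
    # 1      NaT
    # dtype: timedelta64[ns]
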
diff --git a/pandas/tests/tseries/offsets/data/dateoffset_0_15_2.pickle b/pandas/tests/tseries/offsets/data/dateoffset_0_15_2.pickle
deleted file mode 100644
index ce561526a5e12..0000000000000
--- a/pandas/tests/tseries/offsets/data/dateoffset_0_15_2.pickle
+++ /dev/null
@@ -1,183 +0,0 @@
-(dp0
-S'YearBegin'
-p1
-ccopy_reg
-_reconstructor
-p2
-(cpandas.tseries.offsets
-YearBegin
-p3
-c__builtin__
-object
-p4
-Ntp5
-Rp6
-(dp7
-S'normalize'
-p8
-I00
-sS'kwds'
-p9
-(dp10
-sS'n'
-p11
-I1
-sS'_offset'
-p12
-cdatetime
-timedelta
-p13
-(I1
-I0
-I0
-tp14
-Rp15
-sS'month'
-p16
-I1
-sS'_use_relativedelta'
-p17
-I00
-sbsS'Week'
-p18
-g2
-(cpandas.tseries.offsets
-Week
-p19
-g4
-Ntp20
-Rp21
-(dp22
-g8
-I00
-sS'_inc'
-p23
-g13
-(I7
-I0
-I0
-tp24
-Rp25
-sg9
-(dp26
-sS'weekday'
-p27
-Nsg11
-I1
-sbsS'MonthBegin'
-p28
-g2
-(cpandas.tseries.offsets
-MonthBegin
-p29
-g4
-Ntp30
-Rp31
-(dp32
-g8
-I00
-sg12
-g13
-(I1
-I0
-I0
-tp33
-Rp34
-sg17
-I00
-sg9
-(dp35
-sg11
-I1
-sbsS'Day'
-p36
-g2
-(cpandas.tseries.offsets
-Day
-p37
-g4
-Ntp38
-Rp39
-(dp40
-g8
-I00
-sg12
-g13
-(I1
-I0
-I0
-tp41
-Rp42
-sg17
-I00
-sg9
-(dp43
-sg11
-I1
-sbsS'DateOffset'
-p44
-g2
-(cpandas.tseries.offsets
-DateOffset
-p45
-g4
-Ntp46
-Rp47
-(dp48
-g8
-I00
-sg12
-(idateutil.relativedelta
-relativedelta
-p49
-(dp50
-S'_has_time'
-p51
-I0
-sS'hour'
-p52
-NsS'seconds'
-p53
-I0
-sS'months'
-p54
-I0
-sS'year'
-p55
-NsS'days'
-p56
-I0
-sS'years'
-p57
-I1
-sS'hours'
-p58
-I0
-sS'second'
-p59
-NsS'microsecond'
-p60
-Nsg16
-NsS'microseconds'
-p61
-I0
-sS'leapdays'
-p62
-I0
-sS'minutes'
-p63
-I0
-sS'day'
-p64
-NsS'minute'
-p65
-Nsg27
-Nsbsg17
-I01
-sg9
-(dp66
-g57
-I1
-ssg11
-I1
-sbs.
\ No newline at end of file
diff --git a/pandas/tests/tseries/offsets/test_offsets.py b/pandas/tests/tseries/offsets/test_offsets.py
index 784c04f225630..3a0a292d360d4 100644
--- a/pandas/tests/tseries/offsets/test_offsets.py
+++ b/pandas/tests/tseries/offsets/test_offsets.py
@@ -14,7 +14,6 @@
import pandas._libs.tslibs.offsets as liboffsets
from pandas._libs.tslibs.offsets import ApplyTypeError, _get_offset, _offset_map
from pandas._libs.tslibs.period import INVALID_FREQ_ERR_MSG
-import pandas.compat as compat
from pandas.compat.numpy import np_datetime64_compat
from pandas.errors import PerformanceWarning
@@ -635,22 +634,6 @@ def test_add_empty_datetimeindex(self, offset_types, tz_naive_fixture):
result = offset_s + dta
tm.assert_equal(result, dta)
- def test_pickle_v0_15_2(self, datapath):
- offsets = {
- "DateOffset": DateOffset(years=1),
- "MonthBegin": MonthBegin(1),
- "Day": Day(1),
- "YearBegin": YearBegin(1),
- "Week": Week(1),
- }
-
- pickle_path = datapath("tseries", "offsets", "data", "dateoffset_0_15_2.pickle")
- # This code was executed once on v0.15.2 to generate the pickle:
- # with open(pickle_path, 'wb') as f: pickle.dump(offsets, f)
- #
- result = read_pickle(pickle_path)
- tm.assert_dict_equal(offsets, result)
-
def test_pickle_roundtrip(self, offset_types):
off = self._get_offset(offset_types)
res = tm.round_trip_pickle(off)
@@ -664,6 +647,15 @@ def test_pickle_roundtrip(self, offset_types):
# Make sure nothing got lost from _params (which __eq__ is based on)
assert getattr(off, attr) == getattr(res, attr)
+ def test_pickle_dateoffset_odd_inputs(self):
+ # GH#34511
+ off = DateOffset(months=12)
+ res = tm.round_trip_pickle(off)
+ assert off == res
+
+ base_dt = datetime(2020, 1, 1)
+ assert base_dt + off == base_dt + res
+
def test_onOffset_deprecated(self, offset_types):
# GH#30340 use idiomatic naming
off = self._get_offset(offset_types)
@@ -744,10 +736,7 @@ def test_repr(self):
assert repr(self.offset) == "<BusinessDay>"
assert repr(self.offset2) == "<2 * BusinessDays>"
- if compat.PY37:
- expected = ""
- else:
- expected = ""
+ expected = ""
assert repr(self.offset + timedelta(1)) == expected
def test_with_offset(self):
@@ -2636,10 +2625,7 @@ def test_repr(self):
assert repr(self.offset) == "<CustomBusinessDay>"
assert repr(self.offset2) == "<2 * CustomBusinessDays>"
- if compat.PY37:
- expected = ""
- else:
- expected = ""
+ expected = ""
assert repr(self.offset + timedelta(1)) == expected
def test_with_offset(self):
@@ -3663,14 +3649,19 @@ def test_offset(self, case):
@pytest.mark.parametrize("case", offset_cases)
def test_apply_index(self, case):
+ # https://github.com/pandas-dev/pandas/issues/34580
offset, cases = case
s = DatetimeIndex(cases.keys())
+ exp = DatetimeIndex(cases.values())
+
with tm.assert_produces_warning(None):
# GH#22535 check that we don't get a FutureWarning from adding
# an integer array to PeriodIndex
result = offset + s
+ tm.assert_index_equal(result, exp)
- exp = DatetimeIndex(cases.values())
+ with tm.assert_produces_warning(FutureWarning):
+ result = offset.apply_index(s)
tm.assert_index_equal(result, exp)
on_offset_cases = [
@@ -4310,12 +4301,6 @@ def test_all_offset_classes(self, tup):
# ---------------------------------------------------------------------
-def test_get_offset_day_error():
- # subclass of _BaseOffset must override _day_opt attribute, or we should
- # get a NotImplementedError
-
- with pytest.raises(NotImplementedError):
- DateOffset()._get_offset_day(datetime.now())
def test_valid_default_arguments(offset_types):
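
The reworked test pins the deprecation: apply_index still returns the same values but now emits a FutureWarning, with offset + index as the replacement spelling. A sketch, assuming MonthEnd as a representative offset:

    import pandas as pd
    from pandas.tseries.offsets import MonthEnd

    dti = pd.date_range("2020-01-15", periods=3, freq="D")
    dti + MonthEnd(1)               # preferred spelling, no warning
    # MonthEnd(1).apply_index(dti)  # same result, now a FutureWarning
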
diff --git a/pandas/tests/tseries/offsets/test_offsets_properties.py b/pandas/tests/tseries/offsets/test_offsets_properties.py
index 81465e733da85..0fa9081d606b0 100644
--- a/pandas/tests/tseries/offsets/test_offsets_properties.py
+++ b/pandas/tests/tseries/offsets/test_offsets_properties.py
@@ -85,6 +85,7 @@
# Offset-specific behaviour tests
+@pytest.mark.arm_slow
@given(gen_random_datetime, gen_yqm_offset)
def test_on_offset_implementations(dt, offset):
assume(not offset.normalize)
@@ -95,34 +96,6 @@ def test_on_offset_implementations(dt, offset):
assert offset.is_on_offset(dt) == (compare == dt)
-@pytest.mark.xfail(
- reason="res_v2 below is incorrect, needs to use the "
- "commented-out version with tz_localize. "
- "But with that fix in place, hypothesis then "
- "has errors in timezone generation."
-)
-@given(gen_yqm_offset, gen_date_range)
-def test_apply_index_implementations(offset, rng):
- # offset.apply_index(dti)[i] should match dti[i] + offset
- assume(offset.n != 0) # TODO: test for that case separately
-
- # rng = pd.date_range(start='1/1/2000', periods=100000, freq='T')
- ser = pd.Series(rng)
-
- res = rng + offset
- res_v2 = offset.apply_index(rng)
- # res_v2 = offset.apply_index(rng.tz_localize(None)).tz_localize(rng.tz)
- assert (res == res_v2).all()
-
- assert res[0] == rng[0] + offset
- assert res[-1] == rng[-1] + offset
- res2 = ser + offset
- # apply_index is only for indexes, not series, so no res2_v2
- assert res2.iloc[0] == ser.iloc[0] + offset
- assert res2.iloc[-1] == ser.iloc[-1] + offset
- # TODO: Check randomly assorted entries, not just first/last
-
-
@given(gen_yqm_offset)
def test_shift_across_dst(offset):
# GH#18319 check that 1) timezone is correctly normalized and
diff --git a/pandas/tests/tseries/offsets/test_ticks.py b/pandas/tests/tseries/offsets/test_ticks.py
index 10c239c683bc0..cc23f5f3201da 100644
--- a/pandas/tests/tseries/offsets/test_ticks.py
+++ b/pandas/tests/tseries/offsets/test_ticks.py
@@ -64,6 +64,7 @@ def test_tick_add_sub(cls, n, m):
assert left - right == expected
+@pytest.mark.arm_slow
@pytest.mark.parametrize("cls", tick_classes)
@settings(deadline=None)
@example(n=2, m=3)
diff --git a/pandas/tests/tslibs/test_api.py b/pandas/tests/tslibs/test_api.py
index 840a8c2fb68b1..eca444c9ceb34 100644
--- a/pandas/tests/tslibs/test_api.py
+++ b/pandas/tests/tslibs/test_api.py
@@ -16,8 +16,8 @@ def test_namespace():
"offsets",
"parsing",
"period",
- "resolution",
"strptime",
+ "vectorized",
"timedeltas",
"timestamps",
"timezones",
@@ -32,16 +32,22 @@ def test_namespace():
"is_null_datetimelike",
"nat_strings",
"OutOfBoundsDatetime",
+ "OutOfBoundsTimedelta",
"Period",
"IncompatibleFrequency",
"Resolution",
"Tick",
"Timedelta",
+ "dt64arr_to_periodarr",
"Timestamp",
+ "is_date_array_normalized",
+ "ints_to_pydatetime",
+ "normalize_i8_timestamps",
+ "get_resolution",
"delta_to_nanoseconds",
"ints_to_pytimedelta",
"localize_pydatetime",
- "tz_convert_single",
+ "tz_convert_from_utc_single",
"to_offset",
]
diff --git a/pandas/tests/tslibs/test_ccalendar.py b/pandas/tests/tslibs/test_ccalendar.py
index aab86d3a2df69..1ff700fdc23a3 100644
--- a/pandas/tests/tslibs/test_ccalendar.py
+++ b/pandas/tests/tslibs/test_ccalendar.py
@@ -1,10 +1,13 @@
from datetime import date, datetime
+from hypothesis import given, strategies as st
import numpy as np
import pytest
from pandas._libs.tslibs import ccalendar
+import pandas as pd
+
@pytest.mark.parametrize(
"date_tuple,expected",
@@ -48,3 +51,15 @@ def test_dt_correct_iso_8601_year_week_and_day(input_date_tuple, expected_iso_tu
expected_from_date_isocalendar = date(*input_date_tuple).isocalendar()
assert result == expected_from_date_isocalendar
assert result == expected_iso_tuple
+
+
+@given(
+ st.datetimes(
+ min_value=pd.Timestamp.min.to_pydatetime(warn=False),
+ max_value=pd.Timestamp.max.to_pydatetime(warn=False),
+ )
+)
+def test_isocalendar(dt):
+ expected = dt.isocalendar()
+ result = ccalendar.get_iso_calendar(dt.year, dt.month, dt.day)
+ assert result == expected
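
The hypothesis test asserts that the Cython get_iso_calendar agrees with the stdlib across the full Timestamp range; as a reminder of the reference semantics it is checked against:

    from datetime import datetime

    # ISO 8601 triple: (ISO year, ISO week 1..53, ISO weekday 1..7)
    datetime(2020, 1, 1).isocalendar()   # (2020, 1, 3): week 1, Wednesday
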
diff --git a/pandas/tests/tslibs/test_conversion.py b/pandas/tests/tslibs/test_conversion.py
index fd8c9df026674..87cd97f853f4d 100644
--- a/pandas/tests/tslibs/test_conversion.py
+++ b/pandas/tests/tslibs/test_conversion.py
@@ -4,7 +4,13 @@
import pytest
from pytz import UTC
-from pandas._libs.tslibs import conversion, iNaT, timezones, tzconversion
+from pandas._libs.tslibs import (
+ OutOfBoundsTimedelta,
+ conversion,
+ iNaT,
+ timezones,
+ tzconversion,
+)
from pandas import Timestamp, date_range
import pandas._testing as tm
@@ -12,41 +18,52 @@
def _compare_utc_to_local(tz_didx):
def f(x):
- return tzconversion.tz_convert_single(x, UTC, tz_didx.tz)
+ return tzconversion.tz_convert_from_utc_single(x, tz_didx.tz)
- result = tzconversion.tz_convert(tz_didx.asi8, UTC, tz_didx.tz)
+ result = tzconversion.tz_convert_from_utc(tz_didx.asi8, tz_didx.tz)
expected = np.vectorize(f)(tz_didx.asi8)
tm.assert_numpy_array_equal(result, expected)
-def _compare_local_to_utc(tz_didx, utc_didx):
- def f(x):
- return tzconversion.tz_convert_single(x, tz_didx.tz, UTC)
+def _compare_local_to_utc(tz_didx, naive_didx):
+ # Check that tz_localize behaves the same vectorized and pointwise.
+ err1 = err2 = None
+ try:
+ result = tzconversion.tz_localize_to_utc(naive_didx.asi8, tz_didx.tz)
+ err1 = None
+ except Exception as err:
+ err1 = err
- result = tzconversion.tz_convert(utc_didx.asi8, tz_didx.tz, UTC)
- expected = np.vectorize(f)(utc_didx.asi8)
+ try:
+ expected = naive_didx.map(lambda x: x.tz_localize(tz_didx.tz)).asi8
+ except Exception as err:
+ err2 = err
- tm.assert_numpy_array_equal(result, expected)
+ if err1 is not None:
+ assert type(err1) == type(err2)
+ else:
+ assert err2 is None
+ tm.assert_numpy_array_equal(result, expected)
def test_tz_convert_single_matches_tz_convert_hourly(tz_aware_fixture):
tz = tz_aware_fixture
tz_didx = date_range("2014-03-01", "2015-01-10", freq="H", tz=tz)
- utc_didx = date_range("2014-03-01", "2015-01-10", freq="H")
+ naive_didx = date_range("2014-03-01", "2015-01-10", freq="H")
_compare_utc_to_local(tz_didx)
- _compare_local_to_utc(tz_didx, utc_didx)
+ _compare_local_to_utc(tz_didx, naive_didx)
@pytest.mark.parametrize("freq", ["D", "A"])
def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq):
tz = tz_aware_fixture
tz_didx = date_range("2000-01-01", "2020-01-01", freq=freq, tz=tz)
- utc_didx = date_range("2000-01-01", "2020-01-01", freq=freq)
+ naive_didx = date_range("2000-01-01", "2020-01-01", freq=freq)
_compare_utc_to_local(tz_didx)
- _compare_local_to_utc(tz_didx, utc_didx)
+ _compare_local_to_utc(tz_didx, naive_didx)
@pytest.mark.parametrize(
@@ -57,9 +74,15 @@ def test_tz_convert_single_matches_tz_convert(tz_aware_fixture, freq):
],
)
def test_tz_convert_corner(arr):
- result = tzconversion.tz_convert(
- arr, timezones.maybe_get_tz("US/Eastern"), timezones.maybe_get_tz("Asia/Tokyo")
- )
+ result = tzconversion.tz_convert_from_utc(arr, timezones.maybe_get_tz("Asia/Tokyo"))
+ tm.assert_numpy_array_equal(result, arr)
+
+
+def test_tz_convert_readonly():
+ # GH#35530
+ arr = np.array([0], dtype=np.int64)
+ arr.setflags(write=False)
+ result = tzconversion.tz_convert_from_utc(arr, UTC)
tm.assert_numpy_array_equal(result, arr)
@@ -80,6 +103,13 @@ def test_ensure_datetime64ns_bigendian():
tm.assert_numpy_array_equal(result, expected)
+def test_ensure_timedelta64ns_overflows():
+ arr = np.arange(10).astype("m8[Y]") * 100
+ msg = r"Out of bounds for nanosecond timedelta64\[Y\] 900"
+ with pytest.raises(OutOfBoundsTimedelta, match=msg):
+ conversion.ensure_timedelta64ns(arr)
+
+
class SubDatetime(datetime):
pass
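
The read-only case matters because Cython memoryview arguments reject immutable buffers unless declared const; the test reproduces the situation with a flagged NumPy array (a hedged reading of GH#35530):

    import numpy as np

    arr = np.array([0], dtype=np.int64)
    arr.setflags(write=False)   # e.g. a memory-mapped or shared buffer
    assert not arr.flags.writeable
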
diff --git a/pandas/tests/tslibs/test_libfrequencies.py b/pandas/tests/tslibs/test_libfrequencies.py
index 993f2f4c8ef10..83f28f6b5dc01 100644
--- a/pandas/tests/tslibs/test_libfrequencies.py
+++ b/pandas/tests/tslibs/test_libfrequencies.py
@@ -9,19 +9,19 @@
"obj,expected",
[
("W", "DEC"),
- (offsets.Week(), "DEC"),
+ (offsets.Week().freqstr, "DEC"),
("D", "DEC"),
- (offsets.Day(), "DEC"),
+ (offsets.Day().freqstr, "DEC"),
("Q", "DEC"),
- (offsets.QuarterEnd(startingMonth=12), "DEC"),
+ (offsets.QuarterEnd(startingMonth=12).freqstr, "DEC"),
("Q-JAN", "JAN"),
- (offsets.QuarterEnd(startingMonth=1), "JAN"),
+ (offsets.QuarterEnd(startingMonth=1).freqstr, "JAN"),
("A-DEC", "DEC"),
("Y-DEC", "DEC"),
- (offsets.YearEnd(), "DEC"),
+ (offsets.YearEnd().freqstr, "DEC"),
("A-MAY", "MAY"),
("Y-MAY", "MAY"),
- (offsets.YearEnd(month=5), "MAY"),
+ (offsets.YearEnd(month=5).freqstr, "MAY"),
],
)
def test_get_rule_month(obj, expected):
diff --git a/pandas/tests/tslibs/test_liboffsets.py b/pandas/tests/tslibs/test_liboffsets.py
index 206a604788c7e..6a514d2cc8713 100644
--- a/pandas/tests/tslibs/test_liboffsets.py
+++ b/pandas/tests/tslibs/test_liboffsets.py
@@ -5,6 +5,7 @@
import pytest
+from pandas._libs.tslibs.ccalendar import get_firstbday, get_lastbday
import pandas._libs.tslibs.offsets as liboffsets
from pandas._libs.tslibs.offsets import roll_qtrday
@@ -25,7 +26,7 @@ def day_opt(request):
)
def test_get_last_bday(dt, exp_week_day, exp_last_day):
assert dt.weekday() == exp_week_day
- assert liboffsets.get_lastbday(dt.year, dt.month) == exp_last_day
+ assert get_lastbday(dt.year, dt.month) == exp_last_day
@pytest.mark.parametrize(
@@ -37,7 +38,7 @@ def test_get_last_bday(dt, exp_week_day, exp_last_day):
)
def test_get_first_bday(dt, exp_week_day, exp_first_day):
assert dt.weekday() == exp_week_day
- assert liboffsets.get_firstbday(dt.year, dt.month) == exp_first_day
+ assert get_firstbday(dt.year, dt.month) == exp_first_day
@pytest.mark.parametrize(
diff --git a/pandas/tests/tslibs/test_parsing.py b/pandas/tests/tslibs/test_parsing.py
index dc7421ea63464..70fa724464226 100644
--- a/pandas/tests/tslibs/test_parsing.py
+++ b/pandas/tests/tslibs/test_parsing.py
@@ -148,14 +148,14 @@ def test_parsers_month_freq(date_str, expected):
],
)
def test_guess_datetime_format_with_parseable_formats(string, fmt):
- result = parsing._guess_datetime_format(string)
+ result = parsing.guess_datetime_format(string)
assert result == fmt
@pytest.mark.parametrize("dayfirst,expected", [(True, "%d/%m/%Y"), (False, "%m/%d/%Y")])
def test_guess_datetime_format_with_dayfirst(dayfirst, expected):
ambiguous_string = "01/01/2011"
- result = parsing._guess_datetime_format(ambiguous_string, dayfirst=dayfirst)
+ result = parsing.guess_datetime_format(ambiguous_string, dayfirst=dayfirst)
assert result == expected
@@ -169,7 +169,7 @@ def test_guess_datetime_format_with_dayfirst(dayfirst, expected):
],
)
def test_guess_datetime_format_with_locale_specific_formats(string, fmt):
- result = parsing._guess_datetime_format(string)
+ result = parsing.guess_datetime_format(string)
assert result == fmt
@@ -189,7 +189,7 @@ def test_guess_datetime_format_with_locale_specific_formats(string, fmt):
def test_guess_datetime_format_invalid_inputs(invalid_dt):
# A datetime string must include a year, month and a day for it to be
# guessable, in addition to being a string that looks like a datetime.
- assert parsing._guess_datetime_format(invalid_dt) is None
+ assert parsing.guess_datetime_format(invalid_dt) is None
@pytest.mark.parametrize(
@@ -205,7 +205,7 @@ def test_guess_datetime_format_invalid_inputs(invalid_dt):
)
def test_guess_datetime_format_no_padding(string, fmt):
# see gh-11142
- result = parsing._guess_datetime_format(string)
+ result = parsing.guess_datetime_format(string)
assert result == fmt
diff --git a/pandas/tests/tslibs/test_timezones.py b/pandas/tests/tslibs/test_timezones.py
index 03cc8fcb6e904..81b41f567976d 100644
--- a/pandas/tests/tslibs/test_timezones.py
+++ b/pandas/tests/tslibs/test_timezones.py
@@ -106,3 +106,15 @@ def test_infer_tz_mismatch(infer_setup, ordered):
with pytest.raises(AssertionError, match=msg):
timezones.infer_tzinfo(*args)
+
+
+def test_maybe_get_tz_invalid_types():
+ with pytest.raises(TypeError, match="<class 'float'>"):
+ timezones.maybe_get_tz(44.0)
+
+ with pytest.raises(TypeError, match="<class 'module'>"):
+ timezones.maybe_get_tz(pytz)
+
+ msg = "<class 'pandas._libs.tslibs.timestamps.Timestamp'>"
+ with pytest.raises(TypeError, match=msg):
+ timezones.maybe_get_tz(Timestamp.now("UTC"))
diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py
index d9fdf1491c328..f9259beab5d13 100644
--- a/pandas/tests/util/test_assert_extension_array_equal.py
+++ b/pandas/tests/util/test_assert_extension_array_equal.py
@@ -1,6 +1,7 @@
import numpy as np
import pytest
+from pandas import array
import pandas._testing as tm
from pandas.core.arrays.sparse import SparseArray
@@ -102,3 +103,11 @@ def test_assert_extension_array_equal_non_extension_array(side):
with pytest.raises(AssertionError, match=msg):
tm.assert_extension_array_equal(*args)
+
+
+@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
+def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype):
+ # https://github.com/pandas-dev/pandas/issues/35715
+ left = array([1, 2, 3], dtype="Int64")
+ right = array([1, 2, 3], dtype=right_dtype)
+ tm.assert_extension_array_equal(left, right, check_dtype=False)
diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py
index fe3e1ff906919..5174ff005b5fb 100644
--- a/pandas/tests/util/test_assert_frame_equal.py
+++ b/pandas/tests/util/test_assert_frame_equal.py
@@ -260,3 +260,26 @@ def test_assert_frame_equal_interval_dtype_mismatch():
with pytest.raises(AssertionError, match=msg):
tm.assert_frame_equal(left, right, check_dtype=True)
+
+
+@pytest.mark.parametrize("right_dtype", ["Int32", "int64"])
+def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype):
+ # https://github.com/pandas-dev/pandas/issues/35715
+ left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64")
+ right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype)
+ tm.assert_frame_equal(left, right, check_dtype=False)
+
+
+def test_allows_duplicate_labels():
+ left = pd.DataFrame()
+ right = pd.DataFrame().set_flags(allows_duplicate_labels=False)
+ tm.assert_frame_equal(left, left)
+ tm.assert_frame_equal(right, right)
+ tm.assert_frame_equal(left, right, check_flags=False)
+ tm.assert_frame_equal(right, left, check_flags=False)
+
+ with pytest.raises(AssertionError, match="<Flags"):
+ tm.assert_frame_equal(left, right)
diff --git a/pandas/tseries/frequencies.py b/pandas/tseries/frequencies.py
--- a/pandas/tseries/frequencies.py
+++ b/pandas/tseries/frequencies.py
@@ -... +... @@ def is_superperiod(source, target) -> bool:
def _maybe_coerce_freq(code) -> str:
- """ we might need to coerce a code to a rule_code
+ """we might need to coerce a code to a rule_code
and uppercase it
Parameters
diff --git a/pandas/tseries/holiday.py b/pandas/tseries/holiday.py
index 8ab37f787bd10..d8a3040919e7b 100644
--- a/pandas/tseries/holiday.py
+++ b/pandas/tseries/holiday.py
@@ -12,7 +12,7 @@
from pandas.tseries.offsets import Day, Easter
-def next_monday(dt):
+def next_monday(dt: datetime) -> datetime:
"""
If holiday falls on Saturday, use following Monday instead;
if holiday falls on Sunday, use Monday instead
@@ -24,7 +24,7 @@ def next_monday(dt):
return dt
-def next_monday_or_tuesday(dt):
+def next_monday_or_tuesday(dt: datetime) -> datetime:
"""
For second holiday of two adjacent ones!
If holiday falls on Saturday, use following Monday instead;
@@ -39,7 +39,7 @@ def next_monday_or_tuesday(dt):
return dt
-def previous_friday(dt):
+def previous_friday(dt: datetime) -> datetime:
"""
If holiday falls on Saturday or Sunday, use previous Friday instead.
"""
@@ -50,7 +50,7 @@ def previous_friday(dt):
return dt
-def sunday_to_monday(dt):
+def sunday_to_monday(dt: datetime) -> datetime:
"""
If holiday falls on Sunday, use day thereafter (Monday) instead.
"""
@@ -59,7 +59,7 @@ def sunday_to_monday(dt):
return dt
-def weekend_to_monday(dt):
+def weekend_to_monday(dt: datetime) -> datetime:
"""
If holiday falls on Sunday or Saturday,
use day thereafter (Monday) instead.
@@ -72,7 +72,7 @@ def weekend_to_monday(dt):
return dt
-def nearest_workday(dt):
+def nearest_workday(dt: datetime) -> datetime:
"""
If holiday falls on Saturday, use day before (Friday) instead;
if holiday falls on Sunday, use day thereafter (Monday) instead.
@@ -84,7 +84,7 @@ def nearest_workday(dt):
return dt
-def next_workday(dt):
+def next_workday(dt: datetime) -> datetime:
"""
returns next weekday used for observances
"""
@@ -95,7 +95,7 @@ def next_workday(dt):
return dt
-def previous_workday(dt):
+def previous_workday(dt: datetime) -> datetime:
"""
returns previous weekday used for observances
"""
@@ -106,14 +106,14 @@ def previous_workday(dt):
return dt
-def before_nearest_workday(dt):
+def before_nearest_workday(dt: datetime) -> datetime:
"""
returns previous workday after nearest workday
"""
return previous_workday(nearest_workday(dt))
-def after_nearest_workday(dt):
+def after_nearest_workday(dt: datetime) -> datetime:
"""
returns next workday after nearest workday
needed for Boxing day or multiple holidays in a series
@@ -428,9 +428,11 @@ def holidays(self, start=None, end=None, return_name=False):
# If we don't have a cache or the dates are outside the prior cache, we
# get them again
if self._cache is None or start < self._cache[0] or end > self._cache[1]:
- holidays = [rule.dates(start, end, return_name=True) for rule in self.rules]
- if holidays:
- holidays = concat(holidays)
+ pre_holidays = [
+ rule.dates(start, end, return_name=True) for rule in self.rules
+ ]
+ if pre_holidays:
+ holidays = concat(pre_holidays)
else:
holidays = Series(index=DatetimeIndex([]), dtype=object)
diff --git a/pandas/util/__init__.py b/pandas/util/__init__.py
index b5271dbc0443e..9f2bf156b7e37 100644
--- a/pandas/util/__init__.py
+++ b/pandas/util/__init__.py
@@ -1,30 +1,12 @@
from pandas.util._decorators import Appender, Substitution, cache_readonly # noqa
-from pandas import compat
from pandas.core.util.hashing import hash_array, hash_pandas_object # noqa
-# compatibility for import pandas; pandas.util.testing
-if compat.PY37:
+def __getattr__(name):
+ if name == "testing":
+ import pandas.util.testing
- def __getattr__(name):
- if name == "testing":
- import pandas.util.testing
-
- return pandas.util.testing
- else:
- raise AttributeError(f"module 'pandas.util' has no attribute '{name}'")
-
-
-else:
-
- class _testing:
- def __getattr__(self, item):
- import pandas.util.testing
-
- return getattr(pandas.util.testing, item)
-
- testing = _testing()
-
-
-del compat
+ return pandas.util.testing
+ else:
+ raise AttributeError(f"module 'pandas.util' has no attribute '{name}'")
diff --git a/pandas/util/_decorators.py b/pandas/util/_decorators.py
index 6135ccba1573d..f81bca7e85156 100644
--- a/pandas/util/_decorators.py
+++ b/pandas/util/_decorators.py
@@ -323,7 +323,8 @@ def wrapper(*args, **kwargs) -> Callable[..., Any]:
sig = inspect.Signature(params)
# https://github.com/python/typing/issues/598
- func.__signature__ = sig # type: ignore
+ # error: "F" has no attribute "__signature__"
+ func.__signature__ = sig # type: ignore[attr-defined]
return cast(F, wrapper)
return decorate
@@ -357,8 +358,12 @@ def decorator(decorated: F) -> F:
for docstring in docstrings:
if hasattr(docstring, "_docstring_components"):
+ # error: Item "str" of "Union[str, Callable[..., Any]]" has no
+ # attribute "_docstring_components" [union-attr]
+ # error: Item "function" of "Union[str, Callable[..., Any]]"
+ # has no attribute "_docstring_components" [union-attr]
docstring_components.extend(
- docstring._docstring_components # type: ignore
+ docstring._docstring_components # type: ignore[union-attr]
)
elif isinstance(docstring, str) or docstring.__doc__:
docstring_components.append(docstring)
@@ -373,7 +378,10 @@ def decorator(decorated: F) -> F:
]
)
- decorated._docstring_components = docstring_components # type: ignore
+ # error: "F" has no attribute "_docstring_components"
+ decorated._docstring_components = ( # type: ignore[attr-defined]
+ docstring_components
+ )
return decorated
return decorator
diff --git a/pandas/util/_doctools.py b/pandas/util/_doctools.py
index f413490764124..3a8a1a3144269 100644
--- a/pandas/util/_doctools.py
+++ b/pandas/util/_doctools.py
@@ -53,8 +53,8 @@ def plot(self, left, right, labels=None, vertical: bool = True):
vertical : bool, default True
If True, use vertical layout. If False, use horizontal layout.
"""
- import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
+ import matplotlib.pyplot as plt
if not isinstance(left, list):
left = [left]
diff --git a/pandas/util/_test_decorators.py b/pandas/util/_test_decorators.py
index 25394dc6775d8..0e8f6b933cd97 100644
--- a/pandas/util/_test_decorators.py
+++ b/pandas/util/_test_decorators.py
@@ -23,19 +23,18 @@ def test_foo():
For more information, refer to the ``pytest`` documentation on ``skipif``.
"""
+from contextlib import contextmanager
from distutils.version import LooseVersion
-from functools import wraps
import locale
from typing import Callable, Optional
import numpy as np
import pytest
-from pandas.compat import is_platform_32bit, is_platform_windows
+from pandas.compat import IS64, is_platform_windows
from pandas.compat._optional import import_optional_dependency
-from pandas.compat.numpy import _np_version
-from pandas.core.computation.expressions import _NUMEXPR_INSTALLED, _USE_NUMEXPR
+from pandas.core.computation.expressions import NUMEXPR_INSTALLED, USE_NUMEXPR
def safe_import(mod_name: str, min_version: Optional[str] = None):
@@ -94,7 +93,7 @@ def safe_import(mod_name: str, min_version: Optional[str] = None):
def _skip_if_no_mpl():
mod = safe_import("matplotlib")
if mod:
- mod.use("Agg", warn=True)
+ mod.use("Agg")
else:
return True
@@ -120,7 +119,9 @@ def _skip_if_no_scipy() -> bool:
)
-def skip_if_installed(package: str) -> Callable:
+# TODO: return type, _pytest.mark.structures.MarkDecorator is not public
+# https://github.com/pytest-dev/pytest/issues/7469
+def skip_if_installed(package: str):
"""
Skip a test if a package is installed.
@@ -134,7 +135,9 @@ def skip_if_installed(package: str) -> Callable:
)
-def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable:
+# TODO: return type, _pytest.mark.structures.MarkDecorator is not public
+# https://github.com/pytest-dev/pytest/issues/7469
+def skip_if_no(package: str, min_version: Optional[str] = None):
"""
Generic function to help skip tests when required packages are not
present on the testing system.
@@ -176,33 +179,33 @@ def skip_if_no(package: str, min_version: Optional[str] = None) -> Callable:
_skip_if_no_mpl(), reason="Missing matplotlib dependency"
)
skip_if_mpl = pytest.mark.skipif(not _skip_if_no_mpl(), reason="matplotlib is present")
-skip_if_32bit = pytest.mark.skipif(is_platform_32bit(), reason="skipping for 32 bit")
+skip_if_32bit = pytest.mark.skipif(not IS64, reason="skipping for 32 bit")
skip_if_windows = pytest.mark.skipif(is_platform_windows(), reason="Running on Windows")
skip_if_windows_python_3 = pytest.mark.skipif(
is_platform_windows(), reason="not used on win32"
)
skip_if_has_locale = pytest.mark.skipif(
- _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}",
+ _skip_if_has_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}"
)
skip_if_not_us_locale = pytest.mark.skipif(
- _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}",
+ _skip_if_not_us_locale(), reason=f"Specific locale is set {locale.getlocale()[0]}"
)
skip_if_no_scipy = pytest.mark.skipif(
_skip_if_no_scipy(), reason="Missing SciPy requirement"
)
skip_if_no_ne = pytest.mark.skipif(
- not _USE_NUMEXPR,
- reason=f"numexpr enabled->{_USE_NUMEXPR}, installed->{_NUMEXPR_INSTALLED}",
+ not USE_NUMEXPR,
+ reason=f"numexpr enabled->{USE_NUMEXPR}, installed->{NUMEXPR_INSTALLED}",
)
-def skip_if_np_lt(
- ver_str: str, reason: Optional[str] = None, *args, **kwds
-) -> Callable:
+# TODO: return type, _pytest.mark.structures.MarkDecorator is not public
+# https://github.com/pytest-dev/pytest/issues/7469
+def skip_if_np_lt(ver_str: str, *args, reason: Optional[str] = None):
if reason is None:
reason = f"NumPy {ver_str} or greater required"
return pytest.mark.skipif(
- _np_version < LooseVersion(ver_str), reason=reason, *args, **kwds
+ np.__version__ < LooseVersion(ver_str), *args, reason=reason
)
@@ -235,23 +238,36 @@ def documented_fixture(fixture):
def check_file_leaks(func) -> Callable:
"""
- Decorate a test function tot check that we are not leaking file descriptors.
+ Decorate a test function to check that we are not leaking file descriptors.
"""
- psutil = safe_import("psutil")
- if not psutil:
+ with file_leak_context():
return func
- @wraps(func)
- def new_func(*args, **kwargs):
+
+@contextmanager
+def file_leak_context():
+ """
+ ContextManager analogue to check_file_leaks.
+ """
+ psutil = safe_import("psutil")
+ if not psutil:
+ yield
+ else:
proc = psutil.Process()
flist = proc.open_files()
+ conns = proc.connections()
- func(*args, **kwargs)
+ yield
flist2 = proc.open_files()
- assert flist2 == flist
-
- return new_func
+ # on some builds open_files includes file position, which we _dont_
+ # expect to remain unchanged, so we need to compare excluding that
+ flist_ex = [(x.path, x.fd) for x in flist]
+ flist2_ex = [(x.path, x.fd) for x in flist2]
+ assert flist2_ex == flist_ex, (flist2, flist)
+
+ conns2 = proc.connections()
+ assert conns2 == conns, (conns2, conns)
def async_mark():
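
Refactoring the decorator around a context manager means the same leak check can wrap any block, not just a whole test function. A hedged usage sketch (setup.cfg is just an illustrative file):

    from pandas.util._test_decorators import file_leak_context

    with file_leak_context():         # no-op when psutil is unavailable
        with open("setup.cfg") as fh:
            fh.read()                 # handle is closed, so the check passes
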
diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py
index bb6c6de441558..fa7201a5188a5 100644
--- a/pandas/util/_validators.py
+++ b/pandas/util/_validators.py
@@ -371,14 +371,13 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray:
ValueError if percentiles are not in given interval([0, 1]).
"""
q_arr = np.asarray(q)
- msg = (
- "percentiles should all be in the interval [0, 1]."
- f"Try {q_arr / 100.0} instead."
- )
+ # Don't change this to an f-string. The string formatting
+ # is too expensive for cases where we don't need it.
+ msg = "percentiles should all be in the interval [0, 1]. Try {} instead."
if q_arr.ndim == 0:
if not 0 <= q_arr <= 1:
- raise ValueError(msg)
+ raise ValueError(msg.format(q_arr / 100.0))
else:
if not all(0 <= qs <= 1 for qs in q_arr):
- raise ValueError(msg)
+ raise ValueError(msg.format(q_arr / 100.0))
return q_arr
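
The comment in the diff explains the change: an f-string would compute q_arr / 100.0 on every call, while a plain template defers that work to the error path. The pattern in isolation:

    q = 150
    msg = "percentiles should all be in the interval [0, 1]. Try {} instead."
    if not 0 <= q <= 1:
        print(msg.format(q / 100.0))   # formatting cost paid only on failure
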
diff --git a/pyproject.toml b/pyproject.toml
index aaebcff8e4c1e..8161e8ad752da 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,17 +4,15 @@
requires = [
"setuptools",
"wheel",
- "Cython>=0.29.16", # Note: sync with setup.py
- "numpy==1.15.4; python_version=='3.6' and platform_system!='AIX'",
- "numpy==1.15.4; python_version=='3.7' and platform_system!='AIX'",
+ "Cython>=0.29.21,<3", # Note: sync with setup.py
+ "numpy==1.16.5; python_version=='3.7' and platform_system!='AIX'",
"numpy==1.17.3; python_version>='3.8' and platform_system!='AIX'",
- "numpy==1.16.0; python_version=='3.6' and platform_system=='AIX'",
- "numpy==1.16.0; python_version=='3.7' and platform_system=='AIX'",
+ "numpy==1.16.5; python_version=='3.7' and platform_system=='AIX'",
"numpy==1.17.3; python_version>='3.8' and platform_system=='AIX'",
]
[tool.black]
-target-version = ['py36', 'py37', 'py38']
+target-version = ['py37', 'py38']
exclude = '''
(
asv_bench/env
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 44c975a3b3cfb..fb647c10f72bc 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,21 +1,21 @@
# This file is auto-generated from environment.yml, do not modify.
# See that file for comments about the need/usage of each dependency.
-numpy>=1.15
+numpy>=1.16.5
python-dateutil>=2.7.3
pytz
asv
-cython>=0.29.16
+cython>=0.29.21
black==19.10b0
cpplint
flake8<3.8.0
flake8-comprehensions>=3.1.0
flake8-rst>=0.6.0,<=0.7.0
-isort
-mypy==0.730
+isort>=5.2.1
+mypy==0.782
pycodestyle
gitpython
-gitdb2==2.0.6
+gitdb
sphinx
nbconvert>=5.4.1
nbsphinx
@@ -32,6 +32,7 @@ boto3
botocore>=1.11
hypothesis>=3.82
moto
+flask
pytest>=5.0.1
pytest-cov
pytest-xdist>=1.21
@@ -60,10 +61,10 @@ xlsxwriter
xlwt
odfpy
fastparquet>=0.3.2
-pyarrow>=0.13.1
+pyarrow>=0.15.0
python-snappy
pyqt5>=5.9.2
-tables>=3.4.3
+tables>=3.4.4
s3fs>=0.4.0
fsspec>=0.7.4
gcsfs>=0.6.0
@@ -72,5 +73,7 @@ xarray
cftime
pyreadstat
tabulate>=0.8.3
+natsort
git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master
-git+https://github.com/numpy/numpydoc
\ No newline at end of file
+git+https://github.com/numpy/numpydoc
+pyflakes>=2.2.0
\ No newline at end of file
diff --git a/scripts/tests/test_validate_unwanted_patterns.py b/scripts/tests/test_validate_unwanted_patterns.py
index b6cfa20cd7ca0..947666a730ee9 100644
--- a/scripts/tests/test_validate_unwanted_patterns.py
+++ b/scripts/tests/test_validate_unwanted_patterns.py
@@ -1,7 +1,6 @@
import io
import pytest
-
import validate_unwanted_patterns
diff --git a/scripts/validate_rst_title_capitalization.py b/scripts/validate_rst_title_capitalization.py
index 5de2a07381ae5..b654e27737359 100755
--- a/scripts/validate_rst_title_capitalization.py
+++ b/scripts/validate_rst_title_capitalization.py
@@ -138,6 +138,9 @@
"CategoricalDtype",
"UTC",
"Panel",
+ "False",
+ "Styler",
+ "os",
}
CAP_EXCEPTIONS_DICT = {word.lower(): word for word in CAPITALIZATION_EXCEPTIONS}
@@ -209,7 +212,7 @@ def find_titles(rst_file: str) -> Iterable[Tuple[str, int]]:
The corresponding line number of the heading.
"""
- with open(rst_file, "r") as fd:
+ with open(rst_file) as fd:
previous_line = ""
for i, line in enumerate(fd):
line = line[:-1]
@@ -247,10 +250,9 @@ def find_rst_files(source_paths: List[str]) -> Iterable[str]:
elif directory_address.endswith(".rst"):
yield directory_address
else:
- for filename in glob.glob(
+ yield from glob.glob(
pathname=f"{directory_address}/**/*.rst", recursive=True
- ):
- yield filename
+ )
def main(source_paths: List[str], output_format: str) -> int:
diff --git a/scripts/validate_unwanted_patterns.py b/scripts/validate_unwanted_patterns.py
index 193fef026a96b..b6ffab1482bbc 100755
--- a/scripts/validate_unwanted_patterns.py
+++ b/scripts/validate_unwanted_patterns.py
@@ -16,9 +16,40 @@
import sys
import token
import tokenize
-from typing import IO, Callable, FrozenSet, Iterable, List, Tuple
-
-PATHS_TO_IGNORE: Tuple[str, ...] = ("asv_bench/env",)
+from typing import IO, Callable, FrozenSet, Iterable, List, Set, Tuple
+
+PRIVATE_IMPORTS_TO_IGNORE: Set[str] = {
+ "_extension_array_shared_docs",
+ "_index_shared_docs",
+ "_interval_shared_docs",
+ "_merge_doc",
+ "_shared_docs",
+ "_apply_docs",
+ "_new_Index",
+ "_new_PeriodIndex",
+ "_doc_template",
+ "_agg_template",
+ "_pipe_template",
+ "_get_version",
+ "__main__",
+ "_transform_template",
+ "_arith_doc_FRAME",
+ "_flex_comp_doc_FRAME",
+ "_make_flex_doc",
+ "_op_descriptions",
+ "_IntegerDtype",
+ "_use_inf_as_na",
+ "_get_plot_backend",
+ "_matplotlib",
+ "_arrow_utils",
+ "_registry",
+ "_get_offset", # TODO: remove after get_offset deprecation enforced
+ "_test_parse_iso8601",
+ "_json_normalize", # TODO: remove after deprecation is enforced
+ "_testing",
+ "_test_decorators",
+ "__version__", # check np.__version__ in compat.numpy.function
+}
def _get_literal_string_prefix_len(token_string: str) -> int:
@@ -114,6 +145,88 @@ def bare_pytest_raises(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
)
+PRIVATE_FUNCTIONS_ALLOWED = {"sys._getframe"} # no known alternative
+
+
+def private_function_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
+ """
+ Checking that a private function is not used across modules.
+ Parameters
+ ----------
+ file_obj : IO
+ File-like object containing the Python code to validate.
+ Yields
+ ------
+ line_number : int
+ Line number of the private function that is used across modules.
+ msg : str
+ Explanation of the error.
+ """
+ contents = file_obj.read()
+ tree = ast.parse(contents)
+
+ imported_modules: Set[str] = set()
+
+ for node in ast.walk(tree):
+ if isinstance(node, (ast.Import, ast.ImportFrom)):
+ for module in node.names:
+ module_fqdn = module.name if module.asname is None else module.asname
+ imported_modules.add(module_fqdn)
+
+ if not isinstance(node, ast.Call):
+ continue
+
+ try:
+ module_name = node.func.value.id
+ function_name = node.func.attr
+ except AttributeError:
+ continue
+
+ # Exception section #
+
+ # (Debatable) Class case
+ if module_name[0].isupper():
+ continue
+ # (Debatable) Dunder methods case
+ elif function_name.startswith("__") and function_name.endswith("__"):
+ continue
+ elif module_name + "." + function_name in PRIVATE_FUNCTIONS_ALLOWED:
+ continue
+
+ if module_name in imported_modules and function_name.startswith("_"):
+ yield (node.lineno, f"Private function '{module_name}.{function_name}'")
+
+
+def private_import_across_module(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
+ """
+ Checking that a private function is not imported across modules.
+ Parameters
+ ----------
+ file_obj : IO
+ File-like object containing the Python code to validate.
+ Yields
+ ------
+ line_number : int
+ Line number of import statement, that imports the private function.
+ msg : str
+ Explenation of the error.
+ """
+ contents = file_obj.read()
+ tree = ast.parse(contents)
+
+ for node in ast.walk(tree):
+ if not (isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom)):
+ continue
+
+ for module in node.names:
+ module_name = module.name.split(".")[-1]
+ if module_name in PRIVATE_IMPORTS_TO_IGNORE:
+ continue
+
+ if module_name.startswith("_"):
+ yield (node.lineno, f"Import of internal function {repr(module_name)}")
+
+
def strings_to_concatenate(file_obj: IO[str]) -> Iterable[Tuple[int, str]]:
"""
This test case is necessary after 'Black' (https://github.com/psf/black),
@@ -293,6 +406,7 @@ def main(
source_path: str,
output_format: str,
file_extensions_to_check: str,
+ excluded_file_paths: str,
) -> bool:
"""
Main entry point of the script.
@@ -305,6 +419,10 @@ def main(
Source path representing path to a file/directory.
output_format : str
Output format of the error message.
+ file_extensions_to_check : str
+ Comma separated values of what file extensions to check.
+ excluded_file_paths : str
+ Comma separated values of what file paths to exclude during the check.
Returns
-------
@@ -325,10 +443,11 @@ def main(
FILE_EXTENSIONS_TO_CHECK: FrozenSet[str] = frozenset(
file_extensions_to_check.split(",")
)
+ PATHS_TO_IGNORE = frozenset(excluded_file_paths.split(","))
if os.path.isfile(source_path):
file_path = source_path
- with open(file_path, "r") as file_obj:
+ with open(file_path) as file_obj:
for line_number, msg in function(file_obj):
is_failed = True
print(
@@ -347,7 +466,7 @@ def main(
continue
file_path = os.path.join(subdir, file_name)
- with open(file_path, "r") as file_obj:
+ with open(file_path) as file_obj:
for line_number, msg in function(file_obj):
is_failed = True
print(
@@ -362,6 +481,8 @@ def main(
if __name__ == "__main__":
available_validation_types: List[str] = [
"bare_pytest_raises",
+ "private_function_across_module",
+ "private_import_across_module",
"strings_to_concatenate",
"strings_with_wrong_placed_whitespace",
]
@@ -387,7 +508,12 @@ def main(
parser.add_argument(
"--included-file-extensions",
default="py,pyx,pxd,pxi",
- help="Coma seperated file extensions to check.",
+ help="Comma separated file extensions to check.",
+ )
+ parser.add_argument(
+ "--excluded-file-paths",
+ default="asv_bench/env",
+ help="Comma separated file paths to exclude.",
)
args = parser.parse_args()
@@ -398,5 +524,6 @@ def main(
source_path=args.path,
output_format=args.format,
file_extensions_to_check=args.included_file_extensions,
+ excluded_file_paths=args.excluded_file_paths,
)
)
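
Both new checks parse each file once with ast and walk the tree for offending nodes. A hedged usage sketch against an in-memory file; _private_helper is a made-up name:

    import io

    from validate_unwanted_patterns import private_import_across_module

    bad = io.StringIO("from pandas.core.common import _private_helper\n")
    print(list(private_import_across_module(bad)))
    # [(1, "Import of internal function '_private_helper'")]
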
diff --git a/setup.cfg b/setup.cfg
index 49a57b7a525f0..e7d7df7ff19a2 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -54,13 +54,6 @@ exclude =
# sync minversion with setup.cfg & install.rst
minversion = 4.0.2
testpaths = pandas
-markers =
- single: mark a test as single cpu only
- slow: mark a test as slow
- network: mark a test as network
- db: tests requiring a database (mysql or postgres)
- high_memory: mark a test as a high-memory only
- clipboard: mark a pd.read_clipboard test
doctest_optionflags = NORMALIZE_WHITESPACE IGNORE_EXCEPTION_DETAIL ELLIPSIS
addopts = --strict-data-files
xfail_strict = True
@@ -75,6 +68,7 @@ omit =
*/tests/*
pandas/_typing.py
pandas/_version.py
+ pandas/_vendored/typing_extensions.py
plugins = Cython.Coverage
[coverage:report]
@@ -106,13 +100,13 @@ directory = coverage_html_report
# To be kept consistent with "Import Formatting" section in contributing.rst
[isort]
-known_pre_libs = pandas._config
+known_pre_libs = pandas._config,pandas._vendored
known_pre_core = pandas._libs,pandas._typing,pandas.util._*,pandas.compat,pandas.errors
known_dtypes = pandas.core.dtypes
known_post_core = pandas.tseries,pandas.io,pandas.plotting
sections = FUTURE,STDLIB,THIRDPARTY,PRE_LIBS,PRE_CORE,DTYPES,FIRSTPARTY,POST_CORE,LOCALFOLDER
known_first_party = pandas
-known_third_party = _pytest,announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,yaml,odf
+known_third_party = announce,dateutil,docutils,flake8,git,hypothesis,jinja2,lxml,matplotlib,numpy,numpydoc,pkg_resources,pyarrow,pytest,pytz,requests,scipy,setuptools,sphinx,sqlalchemy,validate_docstrings,validate_unwanted_patterns,yaml,odf
multi_line_output = 3
include_trailing_comma = True
force_grid_wrap = 0
@@ -120,7 +114,7 @@ combine_as_imports = True
line_length = 88
force_sort_within_sections = True
skip_glob = env,
-skip = pandas/__init__.py
+skip = pandas/__init__.py,pandas/_vendored/typing_extensions.py
[mypy]
ignore_missing_imports=True
@@ -129,14 +123,16 @@ check_untyped_defs=True
strict_equality=True
warn_redundant_casts = True
warn_unused_ignores = True
+show_error_codes = True
-[mypy-pandas.tests.*]
+[mypy-pandas._vendored.*]
check_untyped_defs=False
-
-[mypy-pandas.conftest]
ignore_errors=True
-[mypy-pandas.tests.tools.test_to_datetime]
+[mypy-pandas.tests.*]
+check_untyped_defs=False
+
+[mypy-pandas.conftest,pandas.tests.window.conftest]
ignore_errors=True
[mypy-pandas._testing]
@@ -145,7 +141,22 @@ check_untyped_defs=False
[mypy-pandas._version]
check_untyped_defs=False
-[mypy-pandas.core.arrays.interval]
+[mypy-pandas.compat.pickle_compat]
+check_untyped_defs=False
+
+[mypy-pandas.core.apply]
+check_untyped_defs=False
+
+[mypy-pandas.core.arrays.base]
+check_untyped_defs=False
+
+[mypy-pandas.core.arrays.datetimelike]
+check_untyped_defs=False
+
+[mypy-pandas.core.arrays.sparse.array]
+check_untyped_defs=False
+
+[mypy-pandas.core.arrays.string_]
check_untyped_defs=False
[mypy-pandas.core.base]
@@ -157,13 +168,13 @@ check_untyped_defs=False
[mypy-pandas.core.computation.expressions]
check_untyped_defs=False
-[mypy-pandas.core.computation.pytables]
+[mypy-pandas.core.computation.ops]
check_untyped_defs=False
-[mypy-pandas.core.computation.scope]
+[mypy-pandas.core.computation.pytables]
check_untyped_defs=False
-[mypy-pandas.core.dtypes.cast]
+[mypy-pandas.core.computation.scope]
check_untyped_defs=False
[mypy-pandas.core.frame]
@@ -172,6 +183,9 @@ check_untyped_defs=False
[mypy-pandas.core.generic]
check_untyped_defs=False
+[mypy-pandas.core.groupby.base]
+check_untyped_defs=False
+
[mypy-pandas.core.groupby.generic]
check_untyped_defs=False
@@ -184,19 +198,31 @@ check_untyped_defs=False
[mypy-pandas.core.indexes.base]
check_untyped_defs=False
+[mypy-pandas.core.indexes.category]
+check_untyped_defs=False
+
+[mypy-pandas.core.indexes.datetimelike]
+check_untyped_defs=False
+
[mypy-pandas.core.indexes.datetimes]
check_untyped_defs=False
+[mypy-pandas.core.indexes.extension]
+check_untyped_defs=False
+
[mypy-pandas.core.indexes.interval]
check_untyped_defs=False
[mypy-pandas.core.indexes.multi]
check_untyped_defs=False
-[mypy-pandas.core.internals.blocks]
+[mypy-pandas.core.indexes.period]
+check_untyped_defs=False
+
+[mypy-pandas.core.indexes.range]
check_untyped_defs=False
-[mypy-pandas.core.internals.concat]
+[mypy-pandas.core.internals.blocks]
check_untyped_defs=False
[mypy-pandas.core.internals.construction]
@@ -205,22 +231,31 @@ check_untyped_defs=False
[mypy-pandas.core.internals.managers]
check_untyped_defs=False
+[mypy-pandas.core.internals.ops]
+check_untyped_defs=False
+
[mypy-pandas.core.missing]
check_untyped_defs=False
[mypy-pandas.core.ops.docstrings]
check_untyped_defs=False
+[mypy-pandas.core.resample]
+check_untyped_defs=False
+
+[mypy-pandas.core.reshape.concat]
+check_untyped_defs=False
+
[mypy-pandas.core.reshape.merge]
check_untyped_defs=False
-[mypy-pandas.core.strings]
+[mypy-pandas.core.series]
check_untyped_defs=False
-[mypy-pandas.core.window.common]
+[mypy-pandas.core.strings]
check_untyped_defs=False
-[mypy-pandas.core.window.ewm]
+[mypy-pandas.core.window.common]
check_untyped_defs=False
[mypy-pandas.core.window.expanding]
@@ -232,22 +267,19 @@ check_untyped_defs=False
[mypy-pandas.io.clipboard]
check_untyped_defs=False
-[mypy-pandas.io.excel._base]
+[mypy-pandas.io.common]
check_untyped_defs=False
-[mypy-pandas.io.excel._openpyxl]
+[mypy-pandas.io.excel._base]
check_untyped_defs=False
[mypy-pandas.io.excel._util]
check_untyped_defs=False
-[mypy-pandas.io.excel._xlwt]
-check_untyped_defs=False
-
[mypy-pandas.io.formats.console]
check_untyped_defs=False
-[mypy-pandas.io.formats.css]
+[mypy-pandas.io.formats.csvs]
check_untyped_defs=False
[mypy-pandas.io.formats.excel]
@@ -286,8 +318,9 @@ check_untyped_defs=False
[mypy-pandas.plotting._matplotlib.core]
check_untyped_defs=False
-[mypy-pandas.plotting._matplotlib.misc]
+[mypy-pandas.plotting._misc]
check_untyped_defs=False
-[mypy-pandas.tseries.holiday]
+[mypy-pandas.util._decorators]
check_untyped_defs=False
+
diff --git a/setup.py b/setup.py
index e9d305d831653..8f447d5c38169 100755
--- a/setup.py
+++ b/setup.py
@@ -33,8 +33,8 @@ def is_platform_mac():
return sys.platform == "darwin"
-min_numpy_ver = "1.15.4"
-min_cython_ver = "0.29.16" # note: sync with pyproject.toml
+min_numpy_ver = "1.16.5"
+min_cython_ver = "0.29.21" # note: sync with pyproject.toml
try:
import Cython
@@ -99,7 +99,7 @@ def render_templates(cls, pxifiles):
# if .pxi.in is not updated, no need to output .pxi
continue
- with open(pxifile, "r") as f:
+ with open(pxifile) as f:
tmpl = f.read()
pyxcontent = tempita.sub(tmpl)
@@ -197,9 +197,9 @@ def build_extensions(self):
"Intended Audience :: Science/Research",
"Programming Language :: Python",
"Programming Language :: Python :: 3",
- "Programming Language :: Python :: 3.6",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
"Programming Language :: Cython",
"Topic :: Scientific/Engineering",
]
@@ -319,9 +319,9 @@ class CheckSDist(sdist_class):
"pandas/_libs/tslibs/conversion.pyx",
"pandas/_libs/tslibs/fields.pyx",
"pandas/_libs/tslibs/offsets.pyx",
- "pandas/_libs/tslibs/resolution.pyx",
"pandas/_libs/tslibs/parsing.pyx",
"pandas/_libs/tslibs/tzconversion.pyx",
+ "pandas/_libs/tslibs/vectorized.pyx",
"pandas/_libs/window/indexers.pyx",
"pandas/_libs/writers.pyx",
"pandas/io/sas/sas.pyx",
@@ -457,6 +457,9 @@ def run(self):
if sys.version_info[:2] == (3, 8): # GH 33239
extra_compile_args.append("-Wno-error=deprecated-declarations")
+ # https://github.com/pandas-dev/pandas/issues/35559
+ extra_compile_args.append("-Wno-error=unreachable-code")
+
# enable coverage by building cython files by setting the environment variable
# "PANDAS_CYTHON_COVERAGE" (with a Truthy value) or by running build_ext
# with `--with-cython-coverage` enabled
@@ -638,10 +641,6 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
"depends": tseries_depends,
"sources": ["pandas/_libs/tslibs/src/datetime/np_datetime.c"],
},
- "_libs.tslibs.resolution": {
- "pyxfile": "_libs/tslibs/resolution",
- "depends": tseries_depends,
- },
"_libs.tslibs.strptime": {
"pyxfile": "_libs/tslibs/strptime",
"depends": tseries_depends,
@@ -659,6 +658,7 @@ def srcpath(name=None, suffix=".pyx", subdir="src"):
"pyxfile": "_libs/tslibs/tzconversion",
"depends": tseries_depends,
},
+ "_libs.tslibs.vectorized": {"pyxfile": "_libs/tslibs/vectorized"},
"_libs.testing": {"pyxfile": "_libs/testing"},
"_libs.window.aggregations": {
"pyxfile": "_libs/window/aggregations",
@@ -745,7 +745,7 @@ def setup_package():
setuptools_kwargs = {
"install_requires": [
"python-dateutil >= 2.7.3",
- "pytz >= 2017.2",
+ "pytz >= 2017.3",
f"numpy >= {min_numpy_ver}",
],
"setup_requires": [f"numpy >= {min_numpy_ver}"],
@@ -769,11 +769,11 @@ def setup_package():
long_description=LONG_DESCRIPTION,
classifiers=CLASSIFIERS,
platforms="any",
- python_requires=">=3.6.1",
+ python_requires=">=3.7.1",
extras_require={
"test": [
# sync with setup.cfg minversion & install.rst
- "pytest>=4.0.2",
+ "pytest>=5.0.1",
"pytest-xdist",
"hypothesis>=3.58",
]
diff --git a/versioneer.py b/versioneer.py
index 5882349f65f0b..65c9523ba5573 100644
--- a/versioneer.py
+++ b/versioneer.py
@@ -349,7 +349,7 @@
import sys
-class VersioneerConfig(object):
+class VersioneerConfig:
pass
@@ -398,7 +398,7 @@ def get_config_from_root(root):
# the top of versioneer.py for instructions on writing your setup.cfg .
setup_cfg = os.path.join(root, "setup.cfg")
parser = configparser.SafeConfigParser()
- with open(setup_cfg, "r") as f:
+ with open(setup_cfg) as f:
parser.readfp(f)
VCS = parser.get("versioneer", "VCS") # mandatory
@@ -451,7 +451,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
stderr=(subprocess.PIPE if hide_stderr else None),
)
break
- except EnvironmentError:
+ except OSError:
e = sys.exc_info()[1]
if e.errno == errno.ENOENT:
continue
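(The `EnvironmentError` → `OSError` swaps throughout versioneer.py are likewise behavior-preserving: since Python 3.3, `EnvironmentError` and `IOError` are aliases of `OSError`. A minimal check:)

```python
# EnvironmentError and IOError have been aliases of OSError since Python 3.3,
# so "except OSError" catches exactly the same set of exceptions.
assert EnvironmentError is OSError and IOError is OSError

try:
    open("/nonexistent/path")
except OSError as exc:
    print(exc.errno)  # e.g. 2 (ENOENT)
```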
@@ -461,7 +461,7 @@ def run_command(commands, args, cwd=None, verbose=False, hide_stderr=False):
return None
else:
if verbose:
- print("unable to find command, tried %s" % (commands,))
+ print(f"unable to find command, tried {commands}")
return None
stdout = p.communicate()[0].strip().decode()
@@ -946,7 +946,7 @@ def git_get_keywords(versionfile_abs):
# _version.py.
keywords = {}
try:
- f = open(versionfile_abs, "r")
+ f = open(versionfile_abs)
for line in f.readlines():
if line.strip().startswith("git_refnames ="):
mo = re.search(r'=\s*"(.*)"', line)
@@ -957,7 +957,7 @@ def git_get_keywords(versionfile_abs):
if mo:
keywords["full"] = mo.group(1)
f.close()
- except EnvironmentError:
+ except OSError:
pass
return keywords
@@ -1072,9 +1072,8 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command):
if verbose:
fmt = "tag '%s' doesn't start with prefix '%s'"
print(fmt % (full_tag, tag_prefix))
- pieces["error"] = "tag '%s' doesn't start with prefix '%s'" % (
- full_tag,
- tag_prefix,
+ pieces["error"] = "tag '{}' doesn't start with prefix '{}'".format(
+ full_tag, tag_prefix,
)
return pieces
pieces["closest-tag"] = full_tag[len(tag_prefix) :]
@@ -1111,13 +1110,13 @@ def do_vcs_install(manifest_in, versionfile_source, ipy):
files.append(versioneer_file)
present = False
try:
- f = open(".gitattributes", "r")
+ f = open(".gitattributes")
for line in f.readlines():
if line.strip().startswith(versionfile_source):
if "export-subst" in line.strip().split()[1:]:
present = True
f.close()
- except EnvironmentError:
+ except OSError:
pass
if not present:
f = open(".gitattributes", "a+")
@@ -1171,7 +1170,7 @@ def versions_from_file(filename):
try:
with open(filename) as f:
contents = f.read()
- except EnvironmentError:
+ except OSError:
raise NotThisMethod("unable to read _version.py")
mo = re.search(
r"version_json = '''\n(.*)''' # END VERSION_JSON", contents, re.M | re.S
@@ -1187,7 +1186,7 @@ def write_to_version_file(filename, versions):
with open(filename, "w") as f:
f.write(SHORT_VERSION_PY % contents)
- print("set %s to '%s'" % (filename, versions["version"]))
+ print("set {} to '{}'".format(filename, versions["version"]))
def plus_or_dot(pieces):
@@ -1399,7 +1398,7 @@ def get_versions(verbose=False):
try:
ver = versions_from_file(versionfile_abs)
if verbose:
- print("got version from file %s %s" % (versionfile_abs, ver))
+ print(f"got version from file {versionfile_abs} {ver}")
return ver
except NotThisMethod:
pass
@@ -1619,11 +1618,7 @@ def do_setup():
root = get_root()
try:
cfg = get_config_from_root(root)
- except (
- EnvironmentError,
- configparser.NoSectionError,
- configparser.NoOptionError,
- ) as e:
+ except (OSError, configparser.NoSectionError, configparser.NoOptionError) as e:
if isinstance(e, (EnvironmentError, configparser.NoSectionError)):
print("Adding sample versioneer config to setup.cfg", file=sys.stderr)
with open(os.path.join(root, "setup.cfg"), "a") as f:
@@ -1648,9 +1643,9 @@ def do_setup():
ipy = os.path.join(os.path.dirname(cfg.versionfile_source), "__init__.py")
if os.path.exists(ipy):
try:
- with open(ipy, "r") as f:
+ with open(ipy) as f:
old = f.read()
- except EnvironmentError:
+ except OSError:
old = ""
if INIT_PY_SNIPPET not in old:
print(" appending to %s" % ipy)
@@ -1669,12 +1664,12 @@ def do_setup():
manifest_in = os.path.join(root, "MANIFEST.in")
simple_includes = set()
try:
- with open(manifest_in, "r") as f:
+ with open(manifest_in) as f:
for line in f:
if line.startswith("include "):
for include in line.split()[1:]:
simple_includes.add(include)
- except EnvironmentError:
+ except OSError:
pass
# That doesn't cover everything MANIFEST.in can do
# (https://docs.python.org/2/distutils/sourcedist.html#commands), so
@@ -1707,7 +1702,7 @@ def scan_setup_py():
found = set()
setters = False
errors = 0
- with open("setup.py", "r") as f:
+ with open("setup.py") as f:
for line in f.readlines():
if "import versioneer" in line:
found.add("import")
diff --git a/web/pandas/about/team.md b/web/pandas/about/team.md
index 8eb2edebec817..39f63202e1986 100644
--- a/web/pandas/about/team.md
+++ b/web/pandas/about/team.md
@@ -2,7 +2,7 @@
## Contributors
-_pandas_ is made with love by more than [1,500 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors).
+_pandas_ is made with love by more than [2,000 volunteer contributors](https://github.com/pandas-dev/pandas/graphs/contributors).
If you want to support pandas development, you can find information in the [donations page](../donate.html).
@@ -42,7 +42,7 @@ If you want to support pandas development, you can find information in the [dona
> or anyone willing to increase the diversity of our team.
> We have identified visible gaps and obstacles in sustaining diversity and inclusion in the open-source communities and we are proactive in increasing
> the diversity of our team.
-> We have a [code of conduct]({base_url}/community/coc.html) to ensure a friendly and welcoming environment.
+> We have a [code of conduct](../community/coc.html) to ensure a friendly and welcoming environment.
> Please send an email to [pandas-code-of-conduct-committee](mailto:pandas-coc@googlegroups.com), if you think we can do a
> better job at achieving this goal.
diff --git a/web/pandas/community/ecosystem.md b/web/pandas/community/ecosystem.md
index 715a84c1babc6..515d23afb93ec 100644
--- a/web/pandas/community/ecosystem.md
+++ b/web/pandas/community/ecosystem.md
@@ -42,6 +42,13 @@ datasets into feature matrices for machine learning using reusable
feature engineering "primitives". Users can contribute their own
primitives in Python and share them with the rest of the community.
+### [Compose](https://github.com/FeatureLabs/compose)
+
+Compose is a machine learning tool for labeling data and prediction engineering.
+It allows you to structure the labeling process by parameterizing
+prediction problems and transforming time-driven relational data into
+target values with cutoff times that can be used for supervised learning.
+
## Visualization
### [Altair](https://altair-viz.github.io/)
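(To make the new Compose entry above concrete, here is a rough sketch of its labeling workflow. The `LabelMaker` arguments and the toy data are assumptions based on composeml's documentation, not anything exercised by this diff:)

```python
# Hypothetical prediction-engineering sketch with composeml; the data and
# the labeling function are invented for illustration.
import composeml as cp
import pandas as pd

transactions = pd.DataFrame(
    {
        "customer_id": [1, 1, 2, 2],
        "transaction_time": pd.to_datetime(
            ["2020-01-01", "2020-01-02", "2020-01-01", "2020-01-03"]
        ),
        "amount": [10.0, 25.0, 40.0, 5.0],
    }
)

def total_spent(df_slice):
    # Labeling function: aggregates each time-window slice into a target value.
    return df_slice["amount"].sum()

label_maker = cp.LabelMaker(
    target_entity="customer_id",
    time_index="transaction_time",
    labeling_function=total_spent,
    window_size="2d",
)
labels = label_maker.search(transactions, num_examples_per_instance=1)
```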
@@ -353,13 +360,23 @@ Cyberpandas provides an extension type for storing arrays of IP
Addresses. These arrays can be stored inside pandas' Series and
DataFrame.
+### [Pint-Pandas](https://github.com/hgrecco/pint-pandas)
+
+Pint-Pandas provides an extension type for storing numeric arrays with units.
+These arrays can be stored inside pandas' Series and DataFrame. Operations
+between Series and DataFrame columns which use pint's extension array are then
+units aware.
+
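(A minimal sketch of the unit-awareness described above, assuming pint-pandas' `pint[...]` extension dtype, which is registered when the package is imported:)

```python
# Unit-aware columns via pint-pandas' extension dtype; importing the package
# registers the "pint[...]" dtype with pandas.
import pandas as pd
import pint_pandas  # noqa: F401

df = pd.DataFrame(
    {
        "distance": pd.Series([1.0, 2.0, 3.0], dtype="pint[meter]"),
        "time": pd.Series([10.0, 20.0, 30.0], dtype="pint[second]"),
    }
)
speed = df["distance"] / df["time"]  # result dtype carries meter / second
```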
## Accessors
A directory of projects providing
`extension accessors`. This is for users to discover new accessors and for library
authors to coordinate on the namespace.
- | Library | Accessor | Classes |
- | ------------------------------------------------------------|----------|-----------------------|
- | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` |
- | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` |
+ | Library | Accessor | Classes |
+ | --------------------------------------------------------------|----------|-----------------------|
+ | [cyberpandas](https://cyberpandas.readthedocs.io/en/latest) | `ip` | `Series` |
+ | [pdvega](https://altair-viz.github.io/pdvega/) | `vgplot` | `Series`, `DataFrame` |
+ | [pandas_path](https://github.com/drivendataorg/pandas-path/) | `path` | `Index`, `Series` |
+ | [pint-pandas](https://github.com/hgrecco/pint-pandas) | `pint` | `Series`, `DataFrame` |
+ | [composeml](https://github.com/FeatureLabs/compose) | `slice` | `DataFrame` |
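(The accessors in this table all build on pandas' public registration API. A self-contained sketch of the mechanism; the `units` accessor and its method are hypothetical, invented purely to illustrate the namespace coordination the table documents:)

```python
# Registering a custom Series accessor with pandas' public extension API.
import pandas as pd

@pd.api.extensions.register_series_accessor("units")
class UnitsAccessor:
    def __init__(self, series: pd.Series) -> None:
        self._series = series

    def to_km(self) -> pd.Series:
        # Treats the underlying values as meters.
        return self._series / 1000

s = pd.Series([1500.0, 2500.0])
print(s.units.to_km())  # 1.5, 2.5
```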
diff --git a/web/pandas/config.yml b/web/pandas/config.yml
index 23575cc123050..9a178d26659c3 100644
--- a/web/pandas/config.yml
+++ b/web/pandas/config.yml
@@ -79,6 +79,13 @@ maintainers:
- datapythonista
- simonjayhawkins
- topper-123
+ - alimcmaster1
+ - bashtage
+ - charlesdong1991
+ - Dr-Irv
+ - dsaxton
+ - MarcoGorelli
+ - rhshadrach
emeritus:
- Wouter Overmeire
- Skipper Seabold
diff --git a/web/pandas/index.html b/web/pandas/index.html
index 83d0f48197033..75c797d6dd93d 100644
--- a/web/pandas/index.html
+++ b/web/pandas/index.html
@@ -63,7 +63,7 @@ With the support of:
{% if releases %}
Latest version: {{ releases[0].name }}
|