diff --git a/.github/workflows/comment-commands.yml b/.github/workflows/comment-commands.yml index 45f3e911377c1..62956f5825782 100644 --- a/.github/workflows/comment-commands.yml +++ b/.github/workflows/comment-commands.yml @@ -11,7 +11,7 @@ permissions: jobs: issue_assign: runs-on: ubuntu-22.04 - if: (!github.event.issue.pull_request) && trim(github.event.comment.body) == 'take' + if: (!github.event.issue.pull_request) && github.event.comment.body == 'take' concurrency: group: ${{ github.actor }}-issue-assign steps: diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 354402c572ade..32ca5573ac08a 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -152,7 +152,7 @@ jobs: run: echo "sdist_name=$(cd ./dist && ls -d */)" >> "$GITHUB_ENV" - name: Build wheels - uses: pypa/cibuildwheel@v2.21.3 + uses: pypa/cibuildwheel@v2.22.0 with: package-dir: ./dist/${{ startsWith(matrix.buildplat[1], 'macosx') && env.sdist_name || needs.build_sdist.outputs.sdist_file }} env: diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 772793702f8b8..9faa2a249613b 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -82,7 +82,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.Timestamp.min PR02" \ -i "pandas.Timestamp.resolution PR02" \ -i "pandas.Timestamp.tzinfo GL08" \ - -i "pandas.api.types.is_re_compilable PR07,SA01" \ -i "pandas.arrays.ArrowExtensionArray PR07,SA01" \ -i "pandas.arrays.IntegerArray SA01" \ -i "pandas.arrays.IntervalArray.length SA01" \ @@ -109,17 +108,11 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then -i "pandas.core.resample.Resampler.std SA01" \ -i "pandas.core.resample.Resampler.transform PR01,RT03,SA01" \ -i "pandas.core.resample.Resampler.var SA01" \ - -i "pandas.errors.DuplicateLabelError SA01" \ -i "pandas.errors.IntCastingNaNError SA01" \ - -i "pandas.errors.InvalidIndexError SA01" \ -i "pandas.errors.NullFrequencyError SA01" \ - -i "pandas.errors.NumExprClobberingError SA01" \ -i "pandas.errors.NumbaUtilError SA01" \ - -i "pandas.errors.OutOfBoundsTimedelta SA01" \ -i "pandas.errors.PerformanceWarning SA01" \ - -i "pandas.errors.PossibleDataLossError SA01" \ -i "pandas.errors.UndefinedVariableError PR01,SA01" \ - -i "pandas.errors.UnsortedIndexError SA01" \ -i "pandas.errors.ValueLabelTypeMismatch SA01" \ -i "pandas.infer_freq SA01" \ -i "pandas.io.json.build_table_schema PR07,RT03,SA01" \ diff --git a/doc/source/conf.py b/doc/source/conf.py index ddbda0aa3bf65..677ee6274b093 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -242,7 +242,6 @@ "external_links": [], "footer_start": ["pandas_footer", "sphinx-version"], "github_url": "https://github.com/pandas-dev/pandas", - "twitter_url": "https://twitter.com/pandas_dev", "analytics": { "plausible_analytics_domain": "pandas.pydata.org", "plausible_analytics_url": "https://views.scientific-python.org/js/script.js", @@ -258,6 +257,11 @@ # patch version doesn't compare as equal (e.g. 2.2.1 != 2.2.0 but it should be) "show_version_warning_banner": False, "icon_links": [ + { + "name": "X", + "url": "https://x.com/pandas_dev", + "icon": "fa-brands fa-square-x-twitter", + }, { "name": "Mastodon", "url": "https://fosstodon.org/@pandas_dev", diff --git a/doc/source/user_guide/reshaping.rst b/doc/source/user_guide/reshaping.rst index 3347f3a2534f4..8c5e98791a9ef 100644 --- a/doc/source/user_guide/reshaping.rst +++ b/doc/source/user_guide/reshaping.rst @@ -321,7 +321,7 @@ The missing value can be filled with a specific value with the ``fill_value`` ar .. image:: ../_static/reshaping_melt.png The top-level :func:`~pandas.melt` function and the corresponding :meth:`DataFrame.melt` -are useful to massage a :class:`DataFrame` into a format where one or more columns +are useful to reshape a :class:`DataFrame` into a format where one or more columns are *identifier variables*, while all other columns, considered *measured variables*, are "unpivoted" to the row axis, leaving just two non-identifier columns, "variable" and "value". The names of those columns can be customized diff --git a/doc/source/user_guide/window.rst b/doc/source/user_guide/window.rst index e25c4c2441920..0581951d5bfad 100644 --- a/doc/source/user_guide/window.rst +++ b/doc/source/user_guide/window.rst @@ -567,9 +567,9 @@ One must have :math:`0 < \alpha \leq 1`, and while it is possible to pass \alpha = \begin{cases} - \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ - \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ - 1 - \exp^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 + \frac{2}{s + 1}, & \text{for span}\ s \geq 1\\ + \frac{1}{1 + c}, & \text{for center of mass}\ c \geq 0\\ + 1 - e^{\frac{\log 0.5}{h}}, & \text{for half-life}\ h > 0 \end{cases} One must specify precisely one of **span**, **center of mass**, **half-life** diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 120ee978292d6..1de45f4c5c140 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -690,6 +690,7 @@ I/O - Bug in :meth:`DataFrame.from_records` where ``columns`` parameter with numpy structured array was not reordering and filtering out the columns (:issue:`59717`) - Bug in :meth:`DataFrame.to_dict` raises unnecessary ``UserWarning`` when columns are not unique and ``orient='tight'``. (:issue:`58281`) - Bug in :meth:`DataFrame.to_excel` when writing empty :class:`DataFrame` with :class:`MultiIndex` on both axes (:issue:`57696`) +- Bug in :meth:`DataFrame.to_excel` where the :class:`MultiIndex` index with a period level was not a date (:issue:`60099`) - Bug in :meth:`DataFrame.to_stata` when writing :class:`DataFrame` and ``byteorder=`big```. (:issue:`58969`) - Bug in :meth:`DataFrame.to_stata` when writing more than 32,000 value labels. (:issue:`60107`) - Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`) @@ -760,6 +761,7 @@ ExtensionArray - Bug in :meth:`.arrays.ArrowExtensionArray.__setitem__` which caused wrong behavior when using an integer array with repeated values as a key (:issue:`58530`) - Bug in :meth:`api.types.is_datetime64_any_dtype` where a custom :class:`ExtensionDtype` would return ``False`` for array-likes (:issue:`57055`) - Bug in comparison between object with :class:`ArrowDtype` and incompatible-dtyped (e.g. string vs bool) incorrectly raising instead of returning all-``False`` (for ``==``) or all-``True`` (for ``!=``) (:issue:`59505`) +- Bug in constructing pandas data structures when passing into ``dtype`` a string of the type followed by ``[pyarrow]`` while PyArrow is not installed would raise ``NameError`` rather than ``ImportError`` (:issue:`57928`) - Bug in various :class:`DataFrame` reductions for pyarrow temporal dtypes returning incorrect dtype when result was null (:issue:`59234`) Styler @@ -769,6 +771,7 @@ Styler Other ^^^^^ - Bug in :class:`DataFrame` when passing a ``dict`` with a NA scalar and ``columns`` that would always return ``np.nan`` (:issue:`57205`) +- Bug in :func:`comparison_op` where comparing a ``string`` dtype array with an ``object`` dtype array containing mixed types would raise a ``TypeError`` when PyArrow-based strings are enabled. (:issue:`60228`) - Bug in :func:`eval` on :class:`ExtensionArray` on including division ``/`` failed with a ``TypeError``. (:issue:`58748`) - Bug in :func:`eval` where the names of the :class:`Series` were not preserved when using ``engine="numexpr"``. (:issue:`10239`) - Bug in :func:`eval` with ``engine="numexpr"`` returning unexpected result for float division. (:issue:`59736`) @@ -788,6 +791,7 @@ Other - Bug in :meth:`Series.dt` methods in :class:`ArrowDtype` that were returning incorrect values. (:issue:`57355`) - Bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`) - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`) +- Bug in :meth:`Series.to_string` when series contains complex floats with exponents (:issue:`60405`) - Bug in :meth:`read_csv` where chained fsspec TAR file and ``compression="infer"`` fails with ``tarfile.ReadError`` (:issue:`60028`) - Bug in Dataframe Interchange Protocol implementation was returning incorrect results for data buffers' associated dtype, for string and datetime columns (:issue:`54781`) - Bug in ``Series.list`` methods not preserving the original :class:`Index`. (:issue:`58425`) diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index bf6d8ba8973d3..3af2856d2fbbf 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -72,6 +72,9 @@ class MaskedUInt16Engine(MaskedIndexEngine): ... class MaskedUInt8Engine(MaskedIndexEngine): ... class MaskedBoolEngine(MaskedUInt8Engine): ... +class StringObjectEngine(ObjectEngine): + def __init__(self, values: object, na_value) -> None: ... + class BaseMultiIndexCodesEngine: levels: list[np.ndarray] offsets: np.ndarray # np.ndarray[..., ndim=1] diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index 1506a76aa94a6..688f943760d1f 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -557,6 +557,31 @@ cdef class StringEngine(IndexEngine): raise KeyError(val) return str(val) +cdef class StringObjectEngine(ObjectEngine): + + cdef: + object na_value + bint uses_na + + def __init__(self, ndarray values, na_value): + super().__init__(values) + self.na_value = na_value + self.uses_na = na_value is C_NA + + cdef bint _checknull(self, object val): + if self.uses_na: + return val is C_NA + else: + return util.is_nan(val) + + cdef _check_type(self, object val): + if isinstance(val, str): + return val + elif self._checknull(val): + return self.na_value + else: + raise KeyError(val) + cdef class DatetimeEngine(Int64Engine): diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 193556b2697a9..1b7f04fe17238 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -201,6 +201,10 @@ class OutOfBoundsTimedelta(ValueError): Representation should be within a timedelta64[ns]. + See Also + -------- + date_range : Return a fixed frequency DatetimeIndex. + Examples -------- >>> pd.date_range(start="1/1/1700", freq="B", periods=100000) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index e0c93db0afb07..8f8372add8cf7 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -726,7 +726,14 @@ def _cmp_method(self, other, op) -> ArrowExtensionArray: other, (ArrowExtensionArray, np.ndarray, list, BaseMaskedArray) ) or isinstance(getattr(other, "dtype", None), CategoricalDtype): try: - result = pc_func(self._pa_array, self._box_pa(other)) + if pa.types.is_string(self._pa_array.type): + other_array = self._box_pa(other) + self_array = self._pa_array.cast(pa.large_string()) + if pa.types.is_string(other_array.type): + other_array = other_array.cast(pa.large_string()) + result = pc_func(self_array, other_array) + else: + result = pc_func(self._pa_array, self._box_pa(other)) except pa.ArrowNotImplementedError: # TODO: could this be wrong if other is object dtype? # in which case we need to operate pointwise? diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index f47ef095a8409..bbbf1d9ca60bd 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -1055,7 +1055,9 @@ def shift(self, periods: int = 1, fill_value: object = None) -> IntervalArray: from pandas import Index fill_value = Index(self._left, copy=False)._na_value - empty = IntervalArray.from_breaks([fill_value] * (empty_len + 1)) + empty = IntervalArray.from_breaks( + [fill_value] * (empty_len + 1), closed=self.closed + ) else: empty = self._from_sequence([fill_value] * empty_len, dtype=self.dtype) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 96b0aa16940a6..e5d1033de4457 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -2344,6 +2344,8 @@ def construct_from_string(cls, string: str) -> ArrowDtype: if string == "string[pyarrow]": # Ensure Registry.find skips ArrowDtype to use StringDtype instead raise TypeError("string[pyarrow] should be constructed by StringDtype") + if pa_version_under10p1: + raise ImportError("pyarrow>=10.0.1 is required for ArrowDtype") base_type = string[:-9] # get rid of "[pyarrow]" try: diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 6adb34ff0f777..918d107f2ce6c 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -190,12 +190,17 @@ def is_re_compilable(obj: object) -> bool: Parameters ---------- obj : The object to check + The object to check if the object can be compiled into a regex pattern instance. Returns ------- bool Whether `obj` can be compiled as a regex pattern. + See Also + -------- + api.types.is_re : Check if the object is a regex pattern instance. + Examples -------- >>> from pandas.api.types import is_re_compilable diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 039bdf9c36ee7..a6be17a654aa7 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -838,7 +838,7 @@ def pop(self, item: Hashable) -> Series | Any: return result @final - def squeeze(self, axis: Axis | None = None): + def squeeze(self, axis: Axis | None = None) -> Scalar | Series | DataFrame: """ Squeeze 1 dimensional axis objects into scalars. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d4ba7e01ebfa9..165fe109c4c94 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -876,7 +876,7 @@ def _engine( # ndarray[Any, Any]]" has no attribute "_ndarray" [union-attr] target_values = self._data._ndarray # type: ignore[union-attr] elif is_string_dtype(self.dtype) and not is_object_dtype(self.dtype): - return libindex.StringEngine(target_values) + return libindex.StringObjectEngine(target_values, self.dtype.na_value) # type: ignore[union-attr] # error: Argument 1 to "ExtensionEngine" has incompatible type # "ndarray[Any, Any]"; expected "ExtensionArray" @@ -5974,7 +5974,6 @@ def _should_fallback_to_positional(self) -> bool: def get_indexer_non_unique( self, target ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: - target = ensure_index(target) target = self._maybe_cast_listlike_indexer(target) if not self._should_compare(target) and not self._should_partial_index(target): diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index bfd8e3ccd2f7c..f4cb82816bbcf 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -51,9 +51,9 @@ def melt( """ Unpivot a DataFrame from wide to long format, optionally leaving identifiers set. - This function is useful to massage a DataFrame into a format where one + This function is useful to reshape a DataFrame into a format where one or more columns are identifier variables (`id_vars`), while all other - columns, considered measured variables (`value_vars`), are "unpivoted" to + columns are considered measured variables (`value_vars`), and are "unpivoted" to the row axis, leaving just two non-identifier columns, 'variable' and 'value'. diff --git a/pandas/core/series.py b/pandas/core/series.py index 35b576da87ed7..4fa8b86fa4c16 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -567,7 +567,7 @@ def __arrow_c_stream__(self, requested_schema=None): Export the pandas Series as an Arrow C stream PyCapsule. This relies on pyarrow to convert the pandas Series to the Arrow - format (and follows the default behaviour of ``pyarrow.Array.from_pandas`` + format (and follows the default behavior of ``pyarrow.Array.from_pandas`` in its handling of the index, i.e. to ignore it). This conversion is not necessarily zero-copy. @@ -2226,7 +2226,7 @@ def drop_duplicates( 5 hippo Name: animal, dtype: object - With the 'keep' parameter, the selection behaviour of duplicated values + With the 'keep' parameter, the selection behavior of duplicated values can be changed. The value 'first' keeps the first occurrence for each set of duplicated entries. The default value of keep is 'first'. @@ -3451,7 +3451,7 @@ def sort_values( 4 5.0 dtype: float64 - Sort values ascending order (default behaviour) + Sort values ascending order (default behavior) >>> s.sort_values(ascending=True) 1 1.0 @@ -4098,7 +4098,7 @@ def swaplevel( In the following example, we will swap the levels of the indices. Here, we will swap the levels column-wise, but levels can be swapped row-wise - in a similar manner. Note that column-wise is the default behaviour. + in a similar manner. Note that column-wise is the default behavior. By not supplying any arguments for i and j, we swap the last and second to last indices. diff --git a/pandas/errors/__init__.py b/pandas/errors/__init__.py index 68bd70603abae..70d839d817114 100644 --- a/pandas/errors/__init__.py +++ b/pandas/errors/__init__.py @@ -100,6 +100,11 @@ class UnsortedIndexError(KeyError): Subclass of `KeyError`. + See Also + -------- + DataFrame.sort_index : Sort a DataFrame by its index. + DataFrame.set_index : Set the DataFrame index using existing columns. + Examples -------- >>> df = pd.DataFrame( @@ -388,6 +393,19 @@ class DuplicateLabelError(ValueError): """ Error raised when an operation would introduce duplicate labels. + This error is typically encountered when performing operations on objects + with `allows_duplicate_labels=False` and the operation would result in + duplicate labels in the index. Duplicate labels can lead to ambiguities + in indexing and reduce data integrity. + + See Also + -------- + Series.set_flags : Return a new ``Series`` object with updated flags. + DataFrame.set_flags : Return a new ``DataFrame`` object with updated flags. + Series.reindex : Conform ``Series`` object to new index with optional filling logic. + DataFrame.reindex : Conform ``DataFrame`` object to new index with optional filling + logic. + Examples -------- >>> s = pd.Series([0, 1, 2], index=["a", "b", "c"]).set_flags( @@ -407,6 +425,16 @@ class InvalidIndexError(Exception): """ Exception raised when attempting to use an invalid index key. + This exception is triggered when a user attempts to access or manipulate + data in a pandas DataFrame or Series using an index key that is not valid + for the given object. This may occur in cases such as using a malformed + slice, a mismatched key for a ``MultiIndex``, or attempting to access an index + element that does not exist. + + See Also + -------- + MultiIndex : A multi-level, or hierarchical, index object for pandas objects. + Examples -------- >>> idx = pd.MultiIndex.from_product([["x", "y"], [0, 1]]) @@ -510,6 +538,11 @@ class NumExprClobberingError(NameError): to 'numexpr'. 'numexpr' is the default engine value for these methods if the numexpr package is installed. + See Also + -------- + eval : Evaluate a Python expression as a string using various backends. + DataFrame.query : Query the columns of a DataFrame with a boolean expression. + Examples -------- >>> df = pd.DataFrame({"abs": [1, 1, 1]}) @@ -633,6 +666,15 @@ class PossibleDataLossError(Exception): """ Exception raised when trying to open a HDFStore file when already opened. + This error is triggered when there is a potential risk of data loss due to + conflicting operations on an HDFStore file. It serves to prevent unintended + overwrites or data corruption by enforcing exclusive access to the file. + + See Also + -------- + HDFStore : Dict-like IO interface for storing pandas objects in PyTables. + HDFStore.open : Open an HDFStore file in the specified mode. + Examples -------- >>> store = pd.HDFStore("my-store", "a") # doctest: +SKIP diff --git a/pandas/io/formats/excel.py b/pandas/io/formats/excel.py index 6a3e215de3f96..5fde6577e9f95 100644 --- a/pandas/io/formats/excel.py +++ b/pandas/io/formats/excel.py @@ -37,6 +37,7 @@ DataFrame, Index, MultiIndex, + Period, PeriodIndex, ) import pandas.core.common as com @@ -803,6 +804,9 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: allow_fill=levels._can_hold_na, fill_value=levels._na_value, ) + # GH#60099 + if isinstance(values[0], Period): + values = values.to_timestamp() for i, span_val in spans.items(): mergestart, mergeend = None, None @@ -827,6 +831,10 @@ def _format_hierarchical_rows(self) -> Iterable[ExcelCell]: # Format hierarchical rows with non-merged values. for indexcolvals in zip(*self.df.index): for idx, indexcolval in enumerate(indexcolvals): + # GH#60099 + if isinstance(indexcolval, Period): + indexcolval = indexcolval.to_timestamp() + yield CssExcelCell( row=self.rowcounter + idx, col=gcolidx, diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 861f5885f80c6..4f87b1a30ca61 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1749,7 +1749,7 @@ def _trim_zeros_complex(str_complexes: ArrayLike, decimal: str = ".") -> list[st # The split will give [{"", "-"}, "xxx", "+/-", "xxx", "j", ""] # Therefore, the imaginary part is the 4th and 3rd last elements, # and the real part is everything before the imaginary part - trimmed = re.split(r"([j+-])", x) + trimmed = re.split(r"(?=10.0.1 is required"): + pd.Series([-1.5, 0.2, None], dtype="float32[pyarrow]") diff --git a/pandas/tests/frame/methods/test_shift.py b/pandas/tests/frame/methods/test_shift.py index a0f96ff111444..b52240c208493 100644 --- a/pandas/tests/frame/methods/test_shift.py +++ b/pandas/tests/frame/methods/test_shift.py @@ -757,3 +757,12 @@ def test_shift_with_offsets_freq_empty(self): df_shifted = DataFrame(index=shifted_dates) result = df.shift(freq=offset) tm.assert_frame_equal(result, df_shifted) + + def test_series_shift_interval_preserves_closed(self): + # GH#60389 + ser = Series( + [pd.Interval(1, 2, closed="right"), pd.Interval(2, 3, closed="right")] + ) + result = ser.shift(1) + expected = Series([np.nan, pd.Interval(1, 2, closed="right")]) + tm.assert_series_equal(result, expected) diff --git a/pandas/tests/indexes/string/test_indexing.py b/pandas/tests/indexes/string/test_indexing.py index 755b7109a5a04..d1a278af337b7 100644 --- a/pandas/tests/indexes/string/test_indexing.py +++ b/pandas/tests/indexes/string/test_indexing.py @@ -6,6 +6,51 @@ import pandas._testing as tm +def _isnan(val): + try: + return val is not pd.NA and np.isnan(val) + except TypeError: + return False + + +class TestGetLoc: + def test_get_loc(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + assert index.get_loc("b") == 1 + + def test_get_loc_raises(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="d"): + index.get_loc("d") + + def test_get_loc_invalid_value(self, any_string_dtype): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError, match="1"): + index.get_loc(1) + + def test_get_loc_non_unique(self, any_string_dtype): + index = Index(["a", "b", "a"], dtype=any_string_dtype) + result = index.get_loc("a") + expected = np.array([True, False, True]) + tm.assert_numpy_array_equal(result, expected) + + def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", "c"], dtype=any_string_dtype) + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + + def test_get_loc_missing(self, any_string_dtype, nulls_fixture): + index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype) + if any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture)) + ): + with pytest.raises(KeyError): + index.get_loc(nulls_fixture) + else: + assert index.get_loc(nulls_fixture) == 2 + + class TestGetIndexer: @pytest.mark.parametrize( "method,expected", @@ -41,23 +86,60 @@ def test_get_indexer_strings_raises(self, any_string_dtype): ["a", "b", "c", "d"], method="pad", tolerance=[2, 2, 2, 2] ) + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string): + # NaT and Decimal("NaN") from null_fixture are not supported for string dtype + index = Index(["a", "b", null], dtype=any_string_dtype) + result = index.get_indexer(["a", null, "c"]) + if using_infer_string: + expected = np.array([0, 2, -1], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected = np.array([0, -1, -1], dtype=np.intp) + else: + expected = np.array([0, 2, -1], dtype=np.intp) -class TestGetIndexerNonUnique: - @pytest.mark.xfail(reason="TODO(infer_string)", strict=False) - def test_get_indexer_non_unique_nas(self, any_string_dtype, nulls_fixture): - index = Index(["a", "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) + tm.assert_numpy_array_equal(result, expected) - expected_indexer = np.array([2], dtype=np.intp) - expected_missing = np.array([], dtype=np.intp) + +class TestGetIndexerNonUnique: + @pytest.mark.parametrize("null", [None, np.nan, float("nan"), pd.NA]) + def test_get_indexer_non_unique_nas( + self, any_string_dtype, null, using_infer_string + ): + index = Index(["a", "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + expected_indexer = np.array([0, -1], dtype=np.intp) + expected_missing = np.array([1], dtype=np.intp) + else: + expected_indexer = np.array([0, 2], dtype=np.intp) + expected_missing = np.array([], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) # actually non-unique - index = Index(["a", None, "b", None], dtype=any_string_dtype) - indexer, missing = index.get_indexer_non_unique([nulls_fixture]) - - expected_indexer = np.array([1, 3], dtype=np.intp) + index = Index(["a", null, "b", null], dtype=any_string_dtype) + indexer, missing = index.get_indexer_non_unique(["a", null]) + + if using_infer_string: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) + elif any_string_dtype == "string" and ( + (any_string_dtype.na_value is pd.NA and null is not pd.NA) + or (_isnan(any_string_dtype.na_value) and not _isnan(null)) + ): + pass + else: + expected_indexer = np.array([0, 1, 3], dtype=np.intp) tm.assert_numpy_array_equal(indexer, expected_indexer) tm.assert_numpy_array_equal(missing, expected_missing) diff --git a/pandas/tests/io/excel/test_style.py b/pandas/tests/io/excel/test_style.py index f70e65e34c584..71ef1201e523f 100644 --- a/pandas/tests/io/excel/test_style.py +++ b/pandas/tests/io/excel/test_style.py @@ -9,6 +9,9 @@ from pandas import ( DataFrame, + MultiIndex, + Timestamp, + period_range, read_excel, ) import pandas._testing as tm @@ -333,3 +336,26 @@ def test_styler_to_s3(s3_public_bucket, s3so): f"s3://{mock_bucket_name}/{target_file}", index_col=0, storage_options=s3so ) tm.assert_frame_equal(result, df) + + +@pytest.mark.parametrize("merge_cells", [True, False, "columns"]) +def test_format_hierarchical_rows_periodindex(merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + formatter = ExcelFormatter(df, merge_cells=merge_cells) + formatted_cells = formatter._format_hierarchical_rows() + + for cell in formatted_cells: + if cell.row != 0 and cell.col == 0: + assert isinstance( + cell.val, Timestamp + ), "Period should be converted to Timestamp" diff --git a/pandas/tests/io/excel/test_writers.py b/pandas/tests/io/excel/test_writers.py index 19fe9855dbb85..18948de72200a 100644 --- a/pandas/tests/io/excel/test_writers.py +++ b/pandas/tests/io/excel/test_writers.py @@ -23,6 +23,7 @@ MultiIndex, date_range, option_context, + period_range, ) import pandas._testing as tm @@ -335,6 +336,43 @@ def test_multiindex_interval_datetimes(self, tmp_excel): ) tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize("merge_cells", [True, False, "columns"]) + def test_excel_round_trip_with_periodindex(self, tmp_excel, merge_cells): + # GH#60099 + df = DataFrame( + {"A": [1, 2]}, + index=MultiIndex.from_arrays( + [ + period_range(start="2006-10-06", end="2006-10-07", freq="D"), + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + df.to_excel(tmp_excel, merge_cells=merge_cells) + result = pd.read_excel(tmp_excel, index_col=[0, 1]) + expected = DataFrame( + {"A": [1, 2]}, + MultiIndex.from_arrays( + [ + [ + pd.to_datetime("2006-10-06 00:00:00"), + pd.to_datetime("2006-10-07 00:00:00"), + ], + ["X", "Y"], + ], + names=["date", "category"], + ), + ) + time_format = ( + "datetime64[s]" if tmp_excel.endswith(".ods") else "datetime64[us]" + ) + expected.index = expected.index.set_levels( + expected.index.levels[0].astype(time_format), level=0 + ) + + tm.assert_frame_equal(result, expected) + @pytest.mark.parametrize( "engine,ext", diff --git a/pandas/tests/io/formats/test_to_string.py b/pandas/tests/io/formats/test_to_string.py index 5731f74a03852..af3cdf2d44af3 100644 --- a/pandas/tests/io/formats/test_to_string.py +++ b/pandas/tests/io/formats/test_to_string.py @@ -422,6 +422,24 @@ def test_to_string_complex_float_formatting(self): ) assert result == expected + def test_to_string_complex_float_formatting_with_exponents(self): + # GH #60393 + with option_context("display.precision", 6): + df = DataFrame( + { + "x": [ + (1.8816e-09 + 0j), + (1.8816e-09 + 3.39676e-09j), + ] + } + ) + result = df.to_string() + expected = ( + " x\n0 1.881600e-09+0.000000e+00j\n" + "1 1.881600e-09+3.396760e-09j" + ) + assert result == expected + def test_to_string_format_inf(self): # GH#24861 df = DataFrame( diff --git a/pandas/tests/io/parser/common/test_common_basic.py b/pandas/tests/io/parser/common/test_common_basic.py index 511db2c6a33d8..3680273f5e98a 100644 --- a/pandas/tests/io/parser/common/test_common_basic.py +++ b/pandas/tests/io/parser/common/test_common_basic.py @@ -15,6 +15,7 @@ from pandas._config import using_string_dtype +from pandas.compat import HAS_PYARROW from pandas.errors import ( EmptyDataError, ParserError, @@ -766,7 +767,7 @@ def test_dict_keys_as_names(all_parsers): tm.assert_frame_equal(result, expected) -@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)") +@pytest.mark.xfail(using_string_dtype() and HAS_PYARROW, reason="TODO(infer_string)") @xfail_pyarrow # UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 0 def test_encoding_surrogatepass(all_parsers): # GH39017 diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py index 2a57d5139b62c..93fef353457a7 100644 --- a/pandas/tests/series/methods/test_compare.py +++ b/pandas/tests/series/methods/test_compare.py @@ -138,3 +138,17 @@ def test_compare_datetime64_and_string(): tm.assert_series_equal(result_eq1, expected_eq) tm.assert_series_equal(result_eq2, expected_eq) tm.assert_series_equal(result_neq, expected_neq) + + +def test_comparison_string_mixed_object(): + # Issue https://github.com/pandas-dev/pandas/issues/60228 + pd.options.future.infer_string = True + + ser_string = pd.Series(["a", "b"], dtype="string") + ser_mixed = pd.Series([1, "b"]) + + result = ser_string == ser_mixed + expected = pd.Series([False, True], dtype="boolean") + tm.assert_series_equal(result, expected) + + pd.options.future.infer_string = False diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 3d1177c23c612..611b92eb022d6 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -1254,7 +1254,7 @@ def test_value_counts_nat(self): result_dt = algos.value_counts_internal(dt) tm.assert_series_equal(result_dt, exp_dt) - exp_td = Series({np.timedelta64(10000): 1}, name="count") + exp_td = Series([1], index=[np.timedelta64(10000)], name="count") result_td = algos.value_counts_internal(td) tm.assert_series_equal(result_td, exp_td)