Skip to content

Commit 9bcc977

Browse files
authored
Merge branch 'pandas-dev:main' into main
2 parents 57b5371 + c68c626 commit 9bcc977

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

48 files changed

+552
-181
lines changed

doc/source/getting_started/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ Computation
276276
========================= ================== =============================================================
277277
Dependency Minimum Version Notes
278278
========================= ================== =============================================================
279-
SciPy 1.14.1 Miscellaneous statistical functions
279+
SciPy 1.4.1 Miscellaneous statistical functions
280280
numba 0.50.1 Alternative execution engine for rolling operations
281281
(see :ref:`Enhancing Performance <enhancingperf.numba>`)
282282
xarray 0.15.1 pandas-like API for N-dimensional data

doc/source/user_guide/io.rst

+5
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,11 @@ dtype : Type name or dict of column -> type, default ``None``
186186
(unsupported with ``engine='python'``). Use ``str`` or ``object`` together
187187
with suitable ``na_values`` settings to preserve and
188188
not interpret dtype.
189+
.. versionadded:: 1.5.0
190+
191+
Support for defaultdict was added. Specify a defaultdict as input where
192+
the default determines the dtype of the columns which are not explicitly
193+
listed.
189194
engine : {``'c'``, ``'python'``, ``'pyarrow'``}
190195
Parser engine to use. The C and pyarrow engines are faster, while the python engine
191196
is currently more feature-complete. Multithreading is currently only supported by

doc/source/whatsnew/v1.5.0.rst

+31-9
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ Other enhancements
3939
- :meth:`to_numeric` now preserves float64 arrays when downcasting would generate values not representable in float32 (:issue:`43693`)
4040
- :meth:`Series.reset_index` and :meth:`DataFrame.reset_index` now support the argument ``allow_duplicates`` (:issue:`44410`)
4141
- :meth:`.GroupBy.min` and :meth:`.GroupBy.max` now support `Numba <https://numba.pydata.org/>`_ execution with the ``engine`` keyword (:issue:`45428`)
42+
- :func:`read_csv` now supports ``defaultdict`` as a ``dtype`` parameter (:issue:`41574`)
4243
- :meth:`DataFrame.rolling` and :meth:`Series.rolling` now support a ``step`` parameter with fixed-length windows (:issue:`15354`)
4344
- Implemented a ``bool``-dtype :class:`Index`, passing a bool-dtype array-like to ``pd.Index`` will now retain ``bool`` dtype instead of casting to ``object`` (:issue:`45061`)
4445
- Implemented a complex-dtype :class:`Index`, passing a complex-dtype array-like to ``pd.Index`` will now retain complex dtype instead of casting to ``object`` (:issue:`45845`)
@@ -83,32 +84,51 @@ did not have the same index as the input.
8384

8485
.. code-block:: ipython
8586
86-
In [3]: df.groupby('a', dropna=True).transform(lambda x: x.sum())
87+
In [3]: # Value in the last row should be np.nan
88+
df.groupby('a', dropna=True).transform('sum')
8789
Out[3]:
8890
b
8991
0 5
9092
1 5
93+
2 5
9194
92-
In [3]: df.groupby('a', dropna=True).transform(lambda x: x)
95+
In [3]: # Should have one additional row with the value np.nan
96+
df.groupby('a', dropna=True).transform(lambda x: x.sum())
9397
Out[3]:
9498
b
95-
0 2
96-
1 3
99+
0 5
100+
1 5
97101
98-
In [3]: df.groupby('a', dropna=True).transform('sum')
102+
In [3]: # The value in the last row is np.nan interpreted as an integer
103+
df.groupby('a', dropna=True).transform('ffill')
104+
Out[3]:
105+
b
106+
0 2
107+
1 3
108+
2 -9223372036854775808
109+
110+
In [3]: # Should have one additional row with the value np.nan
111+
df.groupby('a', dropna=True).transform(lambda x: x)
99112
Out[3]:
100113
b
101-
0 5
102-
1 5
103-
2 5
114+
0 2
115+
1 3
104116
105117
*New behavior*:
106118

107119
.. ipython:: python
108120
121+
df.groupby('a', dropna=True).transform('sum')
109122
df.groupby('a', dropna=True).transform(lambda x: x.sum())
123+
df.groupby('a', dropna=True).transform('ffill')
110124
df.groupby('a', dropna=True).transform(lambda x: x)
111-
df.groupby('a', dropna=True).transform('sum')
125+
126+
.. _whatsnew_150.notable_bug_fixes.visualization:
127+
128+
Visualization
129+
^^^^^^^^^^^^^
130+
131+
- Fix showing "None" as ylabel in :meth:`Series.plot` when not setting ylabel (:issue:`46129`)
112132

113133
.. _whatsnew_150.notable_bug_fixes.notable_bug_fix2:
114134

@@ -343,6 +363,7 @@ Other Deprecations
343363
- Deprecated treating all-bool ``object``-dtype columns as bool-like in :meth:`DataFrame.any` and :meth:`DataFrame.all` with ``bool_only=True``, explicitly cast to bool instead (:issue:`46188`)
344364
- Deprecated the behavior of :meth:`DataFrame.quantile`: the ``numeric_only`` argument will default to ``False`` in a future version, so datetime/timedelta columns will be included in the result (:issue:`7308`).
345365
- Deprecated :attr:`Timedelta.freq` and :attr:`Timedelta.is_populated` (:issue:`46430`)
366+
- Deprecated :attr:`Timedelta.delta` (:issue:`46476`)
346367
-
347368

348369
.. ---------------------------------------------------------------------------
@@ -384,6 +405,7 @@ Datetimelike
384405
- Bug in :class:`Timestamp` construction when passing datetime components as positional arguments and ``tzinfo`` as a keyword argument incorrectly raising (:issue:`31929`)
385406
- Bug in :meth:`Index.astype` when casting from object dtype to ``timedelta64[ns]`` dtype incorrectly casting ``np.datetime64("NaT")`` values to ``np.timedelta64("NaT")`` instead of raising (:issue:`45722`)
386407
- Bug in :meth:`SeriesGroupBy.value_counts` index when passing categorical column (:issue:`44324`)
408+
- Bug in :meth:`DatetimeIndex.tz_localize` localizing to UTC failing to make a copy of the underlying data (:issue:`46460`)
387409
-
388410

389411
Timedelta

pandas/_libs/interval.pyi

+6-6
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,6 @@ class _LengthDescriptor:
3232
def __get__(
3333
self, instance: Interval[_OrderableTimesT], owner: Any
3434
) -> Timedelta: ...
35-
@overload
36-
def __get__(self, instance: IntervalTree, owner: Any) -> np.ndarray: ...
3735

3836
class _MidDescriptor:
3937
@overload
@@ -42,8 +40,6 @@ class _MidDescriptor:
4240
def __get__(
4341
self, instance: Interval[_OrderableTimesT], owner: Any
4442
) -> _OrderableTimesT: ...
45-
@overload
46-
def __get__(self, instance: IntervalTree, owner: Any) -> np.ndarray: ...
4743

4844
class IntervalMixin:
4945
@property
@@ -54,8 +50,6 @@ class IntervalMixin:
5450
def open_left(self) -> bool: ...
5551
@property
5652
def open_right(self) -> bool: ...
57-
mid: _MidDescriptor
58-
length: _LengthDescriptor
5953
@property
6054
def is_empty(self) -> bool: ...
6155
def _check_closed_matches(self, other: IntervalMixin, name: str = ...) -> None: ...
@@ -67,6 +61,8 @@ class Interval(IntervalMixin, Generic[_OrderableT]):
6761
def right(self: Interval[_OrderableT]) -> _OrderableT: ...
6862
@property
6963
def closed(self) -> IntervalClosedType: ...
64+
mid: _MidDescriptor
65+
length: _LengthDescriptor
7066
def __init__(
7167
self,
7268
left: _OrderableT,
@@ -162,6 +158,10 @@ class IntervalTree(IntervalMixin):
162158
closed: IntervalClosedType = ...,
163159
leaf_size: int = ...,
164160
): ...
161+
@property
162+
def mid(self) -> np.ndarray: ...
163+
@property
164+
def length(self) -> np.ndarray: ...
165165
def get_indexer(self, target) -> npt.NDArray[np.intp]: ...
166166
def get_indexer_non_unique(
167167
self, target

pandas/_libs/parsers.pyx

+5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Copyright (c) 2012, Lambda Foundry, Inc.
22
# See LICENSE for the license
33
from base64 import decode
4+
from collections import defaultdict
45
from csv import (
56
QUOTE_MINIMAL,
67
QUOTE_NONE,
@@ -964,6 +965,8 @@ cdef class TextReader:
964965

965966
results = {}
966967
nused = 0
968+
is_default_dict_dtype = isinstance(self.dtype, defaultdict)
969+
967970
for i in range(self.table_width):
968971
if i < self.leading_cols:
969972
# Pass through leading columns always
@@ -994,6 +997,8 @@ cdef class TextReader:
994997
col_dtype = self.dtype[name]
995998
elif i in self.dtype:
996999
col_dtype = self.dtype[i]
1000+
elif is_default_dict_dtype:
1001+
col_dtype = self.dtype[name]
9971002
else:
9981003
if self.dtype.names:
9991004
# structured array

pandas/_libs/tslibs/dtypes.pxd

+3
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
1+
from numpy cimport int64_t
2+
13
from pandas._libs.tslibs.np_datetime cimport NPY_DATETIMEUNIT
24

35

46
cdef str npy_unit_to_abbrev(NPY_DATETIMEUNIT unit)
57
cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil
8+
cdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=*)
69

710
cdef dict attrname_to_abbrevs
811

pandas/_libs/tslibs/dtypes.pyx

+30
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,36 @@ cdef NPY_DATETIMEUNIT freq_group_code_to_npy_unit(int freq) nogil:
307307
return NPY_DATETIMEUNIT.NPY_FR_D
308308

309309

310+
cdef int64_t periods_per_day(NPY_DATETIMEUNIT reso=NPY_DATETIMEUNIT.NPY_FR_ns):
311+
"""
312+
How many of the given time units fit into a single day?
313+
"""
314+
cdef:
315+
int64_t day_units
316+
317+
if reso == NPY_DATETIMEUNIT.NPY_FR_ps:
318+
# pico is the smallest unit for which we don't overflow, so
319+
# we exclude femto and atto
320+
day_units = 24 * 3600 * 1_000_000_000_000
321+
elif reso == NPY_DATETIMEUNIT.NPY_FR_ns:
322+
day_units = 24 * 3600 * 1_000_000_000
323+
elif reso == NPY_DATETIMEUNIT.NPY_FR_us:
324+
day_units = 24 * 3600 * 1_000_000
325+
elif reso == NPY_DATETIMEUNIT.NPY_FR_ms:
326+
day_units = 24 * 3600 * 1_000
327+
elif reso == NPY_DATETIMEUNIT.NPY_FR_s:
328+
day_units = 24 * 3600
329+
elif reso == NPY_DATETIMEUNIT.NPY_FR_m:
330+
day_units = 24 * 60
331+
elif reso == NPY_DATETIMEUNIT.NPY_FR_h:
332+
day_units = 24
333+
elif reso == NPY_DATETIMEUNIT.NPY_FR_D:
334+
day_units = 1
335+
else:
336+
raise NotImplementedError(reso)
337+
return day_units
338+
339+
310340
cdef dict _reso_str_map = {
311341
Resolution.RESO_NS.value: "nanosecond",
312342
Resolution.RESO_US.value: "microsecond",

pandas/_libs/tslibs/timedeltas.pyx

+6
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,12 @@ cdef class _Timedelta(timedelta):
10141014
>>> td.delta
10151015
42
10161016
"""
1017+
# Deprecated GH#46476
1018+
warnings.warn(
1019+
"Timedelta.delta is deprecated and will be removed in a future version.",
1020+
FutureWarning,
1021+
stacklevel=1,
1022+
)
10171023
return self.value
10181024

10191025
@property

pandas/_libs/tslibs/tzconversion.pyx

+17-7
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,6 @@ cdef int64_t tz_localize_to_utc_single(
6464
return val - _tz_localize_using_tzinfo_api(val, tz, to_utc=True)
6565

6666
elif is_fixed_offset(tz):
67-
# TODO: in this case we should be able to use get_utcoffset,
68-
# that returns None for e.g. 'dateutil//usr/share/zoneinfo/Etc/GMT-9'
6967
_, deltas, _ = get_dst_info(tz)
7068
delta = deltas[0]
7169
return val - delta
@@ -121,9 +119,10 @@ timedelta-like}
121119
Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right
122120
int64_t *tdata
123121
int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins
124-
int64_t first_delta
122+
int64_t first_delta, delta
125123
int64_t shift_delta = 0
126-
ndarray[int64_t] trans, result, result_a, result_b, dst_hours
124+
ndarray[int64_t] trans, result_a, result_b, dst_hours
125+
int64_t[::1] result
127126
npy_datetimestruct dts
128127
bint infer_dst = False, is_dst = False, fill = False
129128
bint shift_forward = False, shift_backward = False
@@ -132,7 +131,7 @@ timedelta-like}
132131

133132
# Vectorized version of DstTzInfo.localize
134133
if is_utc(tz) or tz is None:
135-
return vals
134+
return vals.copy()
136135

137136
result = np.empty(n, dtype=np.int64)
138137

@@ -143,7 +142,18 @@ timedelta-like}
143142
result[i] = NPY_NAT
144143
else:
145144
result[i] = v - _tz_localize_using_tzinfo_api(v, tz, to_utc=True)
146-
return result
145+
return result.base # to return underlying ndarray
146+
147+
elif is_fixed_offset(tz):
148+
_, deltas, _ = get_dst_info(tz)
149+
delta = deltas[0]
150+
for i in range(n):
151+
v = vals[i]
152+
if v == NPY_NAT:
153+
result[i] = NPY_NAT
154+
else:
155+
result[i] = v - delta
156+
return result.base # to return underlying ndarray
147157

148158
# silence false-positive compiler warning
149159
ambiguous_array = np.empty(0, dtype=bool)
@@ -298,7 +308,7 @@ timedelta-like}
298308
stamp = _render_tstamp(val)
299309
raise pytz.NonExistentTimeError(stamp)
300310

301-
return result
311+
return result.base # .base to get underlying ndarray
302312

303313

304314
cdef inline Py_ssize_t bisect_right_i8(int64_t *data,

pandas/_libs/tslibs/vectorized.pyx

+10-16
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,9 @@ cnp.import_array()
2121
from .conversion cimport normalize_i8_stamp
2222

2323
from .dtypes import Resolution
24+
2425
from .ccalendar cimport DAY_NANOS
26+
from .dtypes cimport c_Resolution
2527
from .nattype cimport (
2628
NPY_NAT,
2729
c_NaT as NaT,
@@ -168,27 +170,19 @@ def ints_to_pydatetime(
168170

169171
# -------------------------------------------------------------------------
170172

171-
cdef:
172-
int RESO_US = Resolution.RESO_US.value
173-
int RESO_MS = Resolution.RESO_MS.value
174-
int RESO_SEC = Resolution.RESO_SEC.value
175-
int RESO_MIN = Resolution.RESO_MIN.value
176-
int RESO_HR = Resolution.RESO_HR.value
177-
int RESO_DAY = Resolution.RESO_DAY.value
178-
179173

180-
cdef inline int _reso_stamp(npy_datetimestruct *dts):
174+
cdef inline c_Resolution _reso_stamp(npy_datetimestruct *dts):
181175
if dts.us != 0:
182176
if dts.us % 1000 == 0:
183-
return RESO_MS
184-
return RESO_US
177+
return c_Resolution.RESO_MS
178+
return c_Resolution.RESO_US
185179
elif dts.sec != 0:
186-
return RESO_SEC
180+
return c_Resolution.RESO_SEC
187181
elif dts.min != 0:
188-
return RESO_MIN
182+
return c_Resolution.RESO_MIN
189183
elif dts.hour != 0:
190-
return RESO_HR
191-
return RESO_DAY
184+
return c_Resolution.RESO_HR
185+
return c_Resolution.RESO_DAY
192186

193187

194188
@cython.wraparound(False)
@@ -205,7 +199,7 @@ def get_resolution(const int64_t[:] stamps, tzinfo tz=None) -> Resolution:
205199
str typ
206200

207201
npy_datetimestruct dts
208-
int reso = RESO_DAY, curr_reso
202+
c_Resolution reso = c_Resolution.RESO_DAY, curr_reso
209203

210204
if is_utc(tz) or tz is None:
211205
use_utc = True

pandas/_typing.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -297,6 +297,9 @@ def closed(self) -> bool:
297297
else:
298298
TakeIndexer = Any
299299

300+
# Shared by functions such as drop and astype
301+
IgnoreRaise = Literal["ignore", "raise"]
302+
300303
# Windowing rank methods
301304
WindowingRankType = Literal["average", "min", "max"]
302305

@@ -311,7 +314,7 @@ def closed(self) -> bool:
311314

312315
# datetime and NaTType
313316
DatetimeNaTType = Union[datetime, "NaTType"]
314-
DateTimeErrorChoices = Literal["ignore", "raise", "coerce"]
317+
DateTimeErrorChoices = Union[IgnoreRaise, Literal["coerce"]]
315318

316319
# sort_index
317320
SortKind = Literal["quicksort", "mergesort", "heapsort", "stable"]

pandas/core/arrays/datetimes.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -456,9 +456,13 @@ def _generate_range(
456456
endpoint_tz = start.tz if start is not None else end.tz
457457

458458
if tz is not None and endpoint_tz is None:
459-
i8values = tzconversion.tz_localize_to_utc(
460-
i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent
461-
)
459+
460+
if not timezones.is_utc(tz):
461+
# short-circuit tz_localize_to_utc, which for UTC would make
462+
# an unnecessary copy but otherwise be a no-op.
463+
i8values = tzconversion.tz_localize_to_utc(
464+
i8values, tz, ambiguous=ambiguous, nonexistent=nonexistent
465+
)
462466

463467
# i8values is localized datetime64 array -> have to convert
464468
# start/end as well to compare
@@ -2126,6 +2130,8 @@ def _sequence_to_dt64ns(
21262130
if tz is not None:
21272131
# Convert tz-naive to UTC
21282132
tz = timezones.maybe_get_tz(tz)
2133+
# TODO: if tz is UTC, are there situations where we *don't* want a
2134+
# copy? tz_localize_to_utc always makes one.
21292135
data = tzconversion.tz_localize_to_utc(
21302136
data.view("i8"), tz, ambiguous=ambiguous
21312137
)

0 commit comments

Comments
 (0)