Skip to content

Commit 01b0ad5

Browse files
authored
PERF: DataFrame.insert (#42998)
1 parent e86bba6 commit 01b0ad5

File tree

4 files changed

+121
-24
lines changed

4 files changed

+121
-24
lines changed

asv_bench/benchmarks/indexing.py

+9
Original file line numberDiff line numberDiff line change
@@ -366,11 +366,20 @@ class InsertColumns:
366366
def setup(self):
367367
self.N = 10 ** 3
368368
self.df = DataFrame(index=range(self.N))
369+
self.df2 = DataFrame(np.random.randn(self.N, 2))
369370

370371
def time_insert(self):
    # Insert 100 new columns, each one at the front of the frame.
    for label in range(100):
        column = np.random.randn(self.N)
        self.df.insert(0, label, column, allow_duplicates=True)
373374

375+
def time_insert_middle(self):
    # Like time_insert, but the new column goes between existing columns
    # instead of at the front or back (both of which have fast-paths).
    for _ in range(100):
        column = np.random.randn(self.N)
        self.df2.insert(1, "colname", column, allow_duplicates=True)
382+
374383
def time_assign_with_setitem(self):
    # Add 100 columns through __setitem__ rather than DataFrame.insert.
    for label in range(100):
        column = np.random.randn(self.N)
        self.df[label] = column

pandas/_libs/internals.pyi

+7
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ import numpy as np
1010
from pandas._typing import (
1111
ArrayLike,
1212
T,
13+
npt,
1314
)
1415

1516
from pandas import Index
@@ -25,6 +26,12 @@ def get_blkno_placements(
2526
blknos: np.ndarray,
2627
group: bool = ...,
2728
) -> Iterator[tuple[int, BlockPlacement]]: ...
29+
# Stub for the Cython helper of the same name: takes the current blklocs and
# blknos arrays plus the insertion position and a block number, and returns
# the (blklocs, blknos) pair to use after the insertion.
def update_blklocs_and_blknos(
    blklocs: npt.NDArray[np.intp],
    blknos: npt.NDArray[np.intp],
    loc: int,
    nblocks: int,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ...
2835

2936
class BlockPlacement:
3037
def __init__(self, val: int | slice | np.ndarray): ...

pandas/_libs/internals.pyx

+62
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,39 @@ cdef class BlockPlacement:
210210

211211
return self._as_slice
212212

213+
cpdef BlockPlacement increment_above(self, Py_ssize_t loc):
    """
    Increment any entries of 'loc' or above by one.

    Parameters
    ----------
    loc : Py_ssize_t
        Entries greater than or equal to this value are incremented.

    Returns
    -------
    BlockPlacement
        ``self`` when the placement is a slice lying entirely below 'loc'
        (nothing to change); otherwise a new BlockPlacement.
    """
    cdef:
        slice nv, s = self._ensure_has_slice()
        Py_ssize_t start, stop, step
        ndarray newarr

    if s is not None:
        # see if we are either all-above or all-below, each of which
        # have fastpaths available.

        # the slice length (last element) is not needed here
        start, stop, step, _ = slice_get_indices_ex(s)

        if start < loc and stop <= loc:
            # We are entirely below, nothing to increment
            return self

        if start >= loc and stop >= loc:
            # We are entirely above, we can efficiently increment our slice
            nv = slice(start + 1, stop + 1, step)
            return BlockPlacement(nv)

    if loc == 0:
        # fastpath where we know everything is >= 0
        newarr = self.as_array + 1
        return BlockPlacement(newarr)

    # General case: copy so the existing array is not modified, then bump
    # only the entries at or above 'loc'.
    newarr = self.as_array.copy()
    newarr[newarr >= loc] += 1
    return BlockPlacement(newarr)
245+
213246
def tile_for_unstack(self, factor: int) -> np.ndarray:
214247
"""
215248
Find the new mgr_locs for the un-stacked version of a Block.
@@ -481,6 +514,35 @@ def get_blkno_placements(blknos, group: bool = True):
481514
yield blkno, BlockPlacement(indexer)
482515

483516

517+
cpdef update_blklocs_and_blknos(
    ndarray[intp_t] blklocs, ndarray[intp_t] blknos, Py_ssize_t loc, intp_t nblocks
):
    """
    Update blklocs and blknos when a new column is inserted at 'loc'.

    Returns a (blklocs, blknos) pair of newly allocated arrays, each one
    element longer than the inputs: position 'loc' holds 0 in blklocs and
    'nblocks' in blknos, entries below 'loc' keep their position and
    entries at or above 'loc' are moved up by one position.
    """
    cdef:
        Py_ssize_t pos
        cnp.npy_intp new_len = len(blklocs) + 1
        ndarray[intp_t] out_blklocs, out_blknos

    # equiv: out_blklocs = np.empty(new_len, dtype=np.intp)
    out_blklocs = cnp.PyArray_EMPTY(1, &new_len, cnp.NPY_INTP, 0)
    out_blknos = cnp.PyArray_EMPTY(1, &new_len, cnp.NPY_INTP, 0)

    # entries before the insertion point are copied unchanged
    for pos in range(loc):
        out_blklocs[pos] = blklocs[pos]
        out_blknos[pos] = blknos[pos]

    # the inserted column
    out_blklocs[loc] = 0
    out_blknos[loc] = nblocks

    # remaining entries are copied one position further along
    for pos in range(loc + 1, new_len):
        out_blklocs[pos] = blklocs[pos - 1]
        out_blknos[pos] = blknos[pos - 1]

    return out_blklocs, out_blknos
544+
545+
484546
@cython.freelist(64)
485547
cdef class SharedBlock:
486548
"""

pandas/core/internals/managers.py

+43-24
Original file line numberDiff line numberDiff line change
@@ -139,8 +139,8 @@ class BaseBlockManager(DataManager):
139139

140140
__slots__ = ()
141141

142-
_blknos: np.ndarray
143-
_blklocs: np.ndarray
142+
_blknos: npt.NDArray[np.intp]
143+
_blklocs: npt.NDArray[np.intp]
144144
blocks: tuple[Block, ...]
145145
axes: list[Index]
146146

@@ -156,7 +156,7 @@ def from_blocks(cls: type_t[T], blocks: list[Block], axes: list[Index]) -> T:
156156
raise NotImplementedError
157157

158158
@property
159-
def blknos(self):
159+
def blknos(self) -> npt.NDArray[np.intp]:
160160
"""
161161
Suppose we want to find the array corresponding to our i'th column.
162162
@@ -172,7 +172,7 @@ def blknos(self):
172172
return self._blknos
173173

174174
@property
175-
def blklocs(self):
175+
def blklocs(self) -> npt.NDArray[np.intp]:
176176
"""
177177
See blknos.__doc__
178178
"""
@@ -1151,23 +1151,8 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
11511151

11521152
block = new_block(values=value, ndim=self.ndim, placement=slice(loc, loc + 1))
11531153

1154-
for blkno, count in _fast_count_smallints(self.blknos[loc:]):
1155-
blk = self.blocks[blkno]
1156-
if count == len(blk.mgr_locs):
1157-
blk.mgr_locs = blk.mgr_locs.add(1)
1158-
else:
1159-
new_mgr_locs = blk.mgr_locs.as_array.copy()
1160-
new_mgr_locs[new_mgr_locs >= loc] += 1
1161-
blk.mgr_locs = BlockPlacement(new_mgr_locs)
1162-
1163-
# Accessing public blklocs ensures the public versions are initialized
1164-
if loc == self.blklocs.shape[0]:
1165-
# np.append is a lot faster, let's use it if we can.
1166-
self._blklocs = np.append(self._blklocs, 0)
1167-
self._blknos = np.append(self._blknos, len(self.blocks))
1168-
else:
1169-
self._blklocs = np.insert(self._blklocs, loc, 0)
1170-
self._blknos = np.insert(self._blknos, loc, len(self.blocks))
1154+
self._insert_update_mgr_locs(loc)
1155+
self._insert_update_blklocs_and_blknos(loc)
11711156

11721157
self.axes[0] = new_axis
11731158
self.blocks += (block,)
@@ -1184,6 +1169,38 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None:
11841169
stacklevel=5,
11851170
)
11861171

1172+
def _insert_update_mgr_locs(self, loc) -> None:
    """
    Shift block placements to make room for a Block inserted at 'loc'.

    Every block that owns at least one column at position 'loc' or later
    gets its mgr_locs replaced by ``increment_above(loc)``.
    """
    # Only blocks referenced by blknos[loc:] can need updating; the
    # per-block counts are not needed here.
    affected = _fast_count_smallints(self.blknos[loc:])
    for blkno, _ in affected:
        target = self.blocks[blkno]
        target._mgr_locs = target._mgr_locs.increment_above(loc)
1182+
def _insert_update_blklocs_and_blknos(self, loc) -> None:
    """
    When inserting a new Block at location 'loc', we update our
    _blklocs and _blknos.

    The entry added at position 'loc' is ``0`` in _blklocs and
    ``len(self.blocks)`` in _blknos; existing entries at or above 'loc'
    move up by one position.
    """
    new_blkno = len(self.blocks)

    # Accessing public blklocs ensures the public versions are initialized
    if loc == self.blklocs.shape[0]:
        # Inserting at the end: np.append is a lot faster than a general
        # insert, let's use it if we can.
        self._blklocs = np.append(self._blklocs, 0)
        self._blknos = np.append(self._blknos, new_blkno)
    elif loc == 0:
        # Inserting at the front: prepend with a single concatenate so the
        # result is an ordinary contiguous array rather than a reversed
        # (negative-stride) view of an appended array.
        self._blklocs = np.concatenate([[0], self._blklocs])
        self._blknos = np.concatenate([[new_blkno], self._blknos])
    else:
        # Inserting in the middle: use the dedicated Cython helper.
        new_blklocs, new_blknos = libinternals.update_blklocs_and_blknos(
            self.blklocs, self.blknos, loc, new_blkno
        )
        self._blklocs = new_blklocs
        self._blknos = new_blknos
1203+
11871204
def idelete(self, indexer) -> BlockManager:
11881205
"""
11891206
Delete selected locations, returning a new BlockManager.
@@ -2050,11 +2067,13 @@ def _merge_blocks(
20502067
return blocks
20512068

20522069

2053-
def _fast_count_smallints(arr: npt.NDArray[np.intp]):
    """
    Count occurrences of each distinct value in an array of small
    non-negative integers.

    Returns an iterator of (value, count) pairs in increasing order of
    value; values that do not occur are omitted.
    """
    occurrences = np.bincount(arr.astype(np.int_, copy=False))
    present = occurrences.nonzero()[0]
    # Note: list(zip(...)) outperforms list(np.c_[nz, counts[nz]]) here,
    # in one benchmark by a factor of 11
    return zip(present, occurrences[present])
20582077

20592078

20602079
def _preprocess_slice_or_indexer(

0 commit comments

Comments
 (0)