Skip to content

Commit 91d1c50

Browse files
meiermark authored and jreback committed
BUG: Index.str.partition not nan-safe (#23558) (#23618)
1 parent e2c4f04 commit 91d1c50

File tree

3 files changed

+69
-46
lines changed

3 files changed

+69
-46
lines changed

Diff for: doc/source/whatsnew/v0.24.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -1270,8 +1270,8 @@ Numeric
12701270
Strings
12711271
^^^^^^^
12721272

1273-
-
1274-
-
1273+
- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
1274+
- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`).
12751275
-
12761276

12771277
Interval

Diff for: pandas/_libs/lib.pyx

+2-2
Original file line numberDiff line numberDiff line change
@@ -2273,7 +2273,7 @@ def to_object_array_tuples(rows: list):
22732273

22742274
k = 0
22752275
for i in range(n):
2276-
tmp = len(rows[i])
2276+
tmp = 1 if checknull(rows[i]) else len(rows[i])
22772277
if tmp > k:
22782278
k = tmp
22792279

@@ -2287,7 +2287,7 @@ def to_object_array_tuples(rows: list):
22872287
except Exception:
22882288
# upcast any subclasses to tuple
22892289
for i in range(n):
2290-
row = tuple(rows[i])
2290+
row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
22912291
for j in range(len(row)):
22922292
result[i, j] = row[j]
22932293

Diff for: pandas/tests/test_strings.py

+65-42
Original file line numberDiff line numberDiff line change
@@ -2330,24 +2330,35 @@ def test_split_to_dataframe(self):
23302330
s.str.split('_', expand="not_a_boolean")
23312331

23322332
def test_split_to_multiindex_expand(self):
2333-
idx = Index(['nosplit', 'alsonosplit'])
2333+
# https://github.com/pandas-dev/pandas/issues/23677
2334+
2335+
idx = Index(['nosplit', 'alsonosplit', np.nan])
23342336
result = idx.str.split('_', expand=True)
23352337
exp = idx
23362338
tm.assert_index_equal(result, exp)
23372339
assert result.nlevels == 1
23382340

2339-
idx = Index(['some_equal_splits', 'with_no_nans'])
2341+
idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None])
23402342
result = idx.str.split('_', expand=True)
2341-
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
2342-
'with', 'no', 'nans')])
2343+
exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
2344+
('with', 'no', 'nans'),
2345+
[np.nan, np.nan, np.nan],
2346+
[None, None, None]])
23432347
tm.assert_index_equal(result, exp)
23442348
assert result.nlevels == 3
23452349

2346-
idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
2350+
idx = Index(['some_unequal_splits',
2351+
'one_of_these_things_is_not',
2352+
np.nan, None])
23472353
result = idx.str.split('_', expand=True)
2348-
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA
2349-
), ('one', 'of', 'these', 'things',
2350-
'is', 'not')])
2354+
exp = MultiIndex.from_tuples([('some', 'unequal', 'splits',
2355+
NA, NA, NA),
2356+
('one', 'of', 'these',
2357+
'things', 'is', 'not'),
2358+
(np.nan, np.nan, np.nan,
2359+
np.nan, np.nan, np.nan),
2360+
(None, None, None,
2361+
None, None, None)])
23512362
tm.assert_index_equal(result, exp)
23522363
assert result.nlevels == 6
23532364

@@ -2441,50 +2452,54 @@ def test_split_with_name(self):
24412452
tm.assert_index_equal(res, exp)
24422453

24432454
def test_partition_series(self):
2444-
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
2455+
# https://github.com/pandas-dev/pandas/issues/23558
2456+
2457+
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
24452458

24462459
result = values.str.partition('_', expand=False)
24472460
exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
2448-
('f', '_', 'g_h')])
2461+
('f', '_', 'g_h'), None])
24492462
tm.assert_series_equal(result, exp)
24502463

24512464
result = values.str.rpartition('_', expand=False)
24522465
exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
2453-
('f_g', '_', 'h')])
2466+
('f_g', '_', 'h'), None])
24542467
tm.assert_series_equal(result, exp)
24552468

24562469
# more than one char
2457-
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
2470+
values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None])
24582471
result = values.str.partition('__', expand=False)
24592472
exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
2460-
('f', '__', 'g__h')])
2473+
('f', '__', 'g__h'), None])
24612474
tm.assert_series_equal(result, exp)
24622475

24632476
result = values.str.rpartition('__', expand=False)
24642477
exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
2465-
('f__g', '__', 'h')])
2478+
('f__g', '__', 'h'), None])
24662479
tm.assert_series_equal(result, exp)
24672480

24682481
# None
2469-
values = Series(['a b c', 'c d e', NA, 'f g h'])
2482+
values = Series(['a b c', 'c d e', NA, 'f g h', None])
24702483
result = values.str.partition(expand=False)
24712484
exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
2472-
('f', ' ', 'g h')])
2485+
('f', ' ', 'g h'), None])
24732486
tm.assert_series_equal(result, exp)
24742487

24752488
result = values.str.rpartition(expand=False)
24762489
exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
2477-
('f g', ' ', 'h')])
2490+
('f g', ' ', 'h'), None])
24782491
tm.assert_series_equal(result, exp)
24792492

2480-
# Not splited
2481-
values = Series(['abc', 'cde', NA, 'fgh'])
2493+
# Not split
2494+
values = Series(['abc', 'cde', NA, 'fgh', None])
24822495
result = values.str.partition('_', expand=False)
2483-
exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')])
2496+
exp = Series([('abc', '', ''), ('cde', '', ''), NA,
2497+
('fgh', '', ''), None])
24842498
tm.assert_series_equal(result, exp)
24852499

24862500
result = values.str.rpartition('_', expand=False)
2487-
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')])
2501+
exp = Series([('', '', 'abc'), ('', '', 'cde'), NA,
2502+
('', '', 'fgh'), None])
24882503
tm.assert_series_equal(result, exp)
24892504

24902505
# unicode
@@ -2508,57 +2523,65 @@ def test_partition_series(self):
25082523
assert result == [v.rpartition('_') for v in values]
25092524

25102525
def test_partition_index(self):
2511-
values = Index(['a_b_c', 'c_d_e', 'f_g_h'])
2526+
# https://github.com/pandas-dev/pandas/issues/23558
2527+
2528+
values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None])
25122529

25132530
result = values.str.partition('_', expand=False)
2514-
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_',
2515-
'g_h')]))
2531+
exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
2532+
('f', '_', 'g_h'), np.nan, None]))
25162533
tm.assert_index_equal(result, exp)
25172534
assert result.nlevels == 1
25182535

25192536
result = values.str.rpartition('_', expand=False)
2520-
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), (
2521-
'f_g', '_', 'h')]))
2537+
exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
2538+
('f_g', '_', 'h'), np.nan, None]))
25222539
tm.assert_index_equal(result, exp)
25232540
assert result.nlevels == 1
25242541

25252542
result = values.str.partition('_')
2526-
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])
2543+
exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
2544+
('f', '_', 'g_h'), (np.nan, np.nan, np.nan),
2545+
(None, None, None)])
25272546
tm.assert_index_equal(result, exp)
25282547
assert isinstance(result, MultiIndex)
25292548
assert result.nlevels == 3
25302549

25312550
result = values.str.rpartition('_')
2532-
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])
2551+
exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
2552+
('f_g', '_', 'h'), (np.nan, np.nan, np.nan),
2553+
(None, None, None)])
25332554
tm.assert_index_equal(result, exp)
25342555
assert isinstance(result, MultiIndex)
25352556
assert result.nlevels == 3
25362557

25372558
def test_partition_to_dataframe(self):
2538-
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
2559+
# https://github.com/pandas-dev/pandas/issues/23558
2560+
2561+
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
25392562
result = values.str.partition('_')
2540-
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
2541-
1: ['_', '_', np.nan, '_'],
2542-
2: ['b_c', 'd_e', np.nan, 'g_h']})
2563+
exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
2564+
1: ['_', '_', np.nan, '_', None],
2565+
2: ['b_c', 'd_e', np.nan, 'g_h', None]})
25432566
tm.assert_frame_equal(result, exp)
25442567

25452568
result = values.str.rpartition('_')
2546-
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
2547-
1: ['_', '_', np.nan, '_'],
2548-
2: ['c', 'e', np.nan, 'h']})
2569+
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
2570+
1: ['_', '_', np.nan, '_', None],
2571+
2: ['c', 'e', np.nan, 'h', None]})
25492572
tm.assert_frame_equal(result, exp)
25502573

2551-
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
2574+
values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
25522575
result = values.str.partition('_', expand=True)
2553-
exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
2554-
1: ['_', '_', np.nan, '_'],
2555-
2: ['b_c', 'd_e', np.nan, 'g_h']})
2576+
exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
2577+
1: ['_', '_', np.nan, '_', None],
2578+
2: ['b_c', 'd_e', np.nan, 'g_h', None]})
25562579
tm.assert_frame_equal(result, exp)
25572580

25582581
result = values.str.rpartition('_', expand=True)
2559-
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
2560-
1: ['_', '_', np.nan, '_'],
2561-
2: ['c', 'e', np.nan, 'h']})
2582+
exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
2583+
1: ['_', '_', np.nan, '_', None],
2584+
2: ['c', 'e', np.nan, 'h', None]})
25622585
tm.assert_frame_equal(result, exp)
25632586

25642587
def test_partition_with_name(self):

0 commit comments

Comments
 (0)