BUG: Index.str.partition not nan-safe (#23558) (#23618)

meiermark · jreback · commit 91d1c50327c2 · 2018-11-18T13:32:50.000-05:00
diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst
@@ -1270,8 +1270,8 @@ Numeric
 Strings
 ^^^^^^^
 
--
--
+- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
+- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`).
 -
 
 Interval
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -2273,7 +2273,7 @@ def to_object_array_tuples(rows: list):
 
     k = 0
     for i in range(n):
-        tmp = len(rows[i])
+        tmp = 1 if checknull(rows[i]) else len(rows[i])
         if tmp > k:
             k = tmp
 
@@ -2287,7 +2287,7 @@ def to_object_array_tuples(rows: list):
     except Exception:
         # upcast any subclasses to tuple
         for i in range(n):
-            row = tuple(rows[i])
+            row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
             for j in range(len(row)):
                 result[i, j] = row[j]
 
diff --git a/pandas/tests/test_strings.py b/pandas/tests/test_strings.py
@@ -2330,24 +2330,35 @@ def test_split_to_dataframe(self):
             s.str.split('_', expand="not_a_boolean")
 
     def test_split_to_multiindex_expand(self):
-        idx = Index(['nosplit', 'alsonosplit'])
+        # https://github.com/pandas-dev/pandas/issues/23677
+
+        idx = Index(['nosplit', 'alsonosplit', np.nan])
         result = idx.str.split('_', expand=True)
         exp = idx
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 1
 
-        idx = Index(['some_equal_splits', 'with_no_nans'])
+        idx = Index(['some_equal_splits', 'with_no_nans', np.nan, None])
         result = idx.str.split('_', expand=True)
-        exp = MultiIndex.from_tuples([('some', 'equal', 'splits'), (
-            'with', 'no', 'nans')])
+        exp = MultiIndex.from_tuples([('some', 'equal', 'splits'),
+                                      ('with', 'no', 'nans'),
+                                      [np.nan, np.nan, np.nan],
+                                      [None, None, None]])
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 3
 
-        idx = Index(['some_unequal_splits', 'one_of_these_things_is_not'])
+        idx = Index(['some_unequal_splits',
+                     'one_of_these_things_is_not',
+                     np.nan, None])
         result = idx.str.split('_', expand=True)
-        exp = MultiIndex.from_tuples([('some', 'unequal', 'splits', NA, NA, NA
-                                       ), ('one', 'of', 'these', 'things',
-                                           'is', 'not')])
+        exp = MultiIndex.from_tuples([('some', 'unequal', 'splits',
+                                       NA, NA, NA),
+                                      ('one', 'of', 'these',
+                                       'things', 'is', 'not'),
+                                      (np.nan, np.nan, np.nan,
+                                       np.nan, np.nan, np.nan),
+                                      (None, None, None,
+                                       None, None, None)])
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 6
 
@@ -2441,50 +2452,54 @@ def test_split_with_name(self):
         tm.assert_index_equal(res, exp)
 
     def test_partition_series(self):
-        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        # https://github.com/pandas-dev/pandas/issues/23558
+
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
 
         result = values.str.partition('_', expand=False)
         exp = Series([('a', '_', 'b_c'), ('c', '_', 'd_e'), NA,
-                      ('f', '_', 'g_h')])
+                      ('f', '_', 'g_h'), None])
         tm.assert_series_equal(result, exp)
 
         result = values.str.rpartition('_', expand=False)
         exp = Series([('a_b', '_', 'c'), ('c_d', '_', 'e'), NA,
-                      ('f_g', '_', 'h')])
+                      ('f_g', '_', 'h'), None])
         tm.assert_series_equal(result, exp)
 
         # more than one char
-        values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h'])
+        values = Series(['a__b__c', 'c__d__e', NA, 'f__g__h', None])
         result = values.str.partition('__', expand=False)
         exp = Series([('a', '__', 'b__c'), ('c', '__', 'd__e'), NA,
-                      ('f', '__', 'g__h')])
+                      ('f', '__', 'g__h'), None])
         tm.assert_series_equal(result, exp)
 
         result = values.str.rpartition('__', expand=False)
         exp = Series([('a__b', '__', 'c'), ('c__d', '__', 'e'), NA,
-                      ('f__g', '__', 'h')])
+                      ('f__g', '__', 'h'), None])
         tm.assert_series_equal(result, exp)
 
         # None
-        values = Series(['a b c', 'c d e', NA, 'f g h'])
+        values = Series(['a b c', 'c d e', NA, 'f g h', None])
         result = values.str.partition(expand=False)
         exp = Series([('a', ' ', 'b c'), ('c', ' ', 'd e'), NA,
-                      ('f', ' ', 'g h')])
+                      ('f', ' ', 'g h'), None])
         tm.assert_series_equal(result, exp)
 
         result = values.str.rpartition(expand=False)
         exp = Series([('a b', ' ', 'c'), ('c d', ' ', 'e'), NA,
-                      ('f g', ' ', 'h')])
+                      ('f g', ' ', 'h'), None])
         tm.assert_series_equal(result, exp)
 
-        # Not splited
-        values = Series(['abc', 'cde', NA, 'fgh'])
+        # Not split
+        values = Series(['abc', 'cde', NA, 'fgh', None])
         result = values.str.partition('_', expand=False)
-        exp = Series([('abc', '', ''), ('cde', '', ''), NA, ('fgh', '', '')])
+        exp = Series([('abc', '', ''), ('cde', '', ''), NA,
+                      ('fgh', '', ''), None])
         tm.assert_series_equal(result, exp)
 
         result = values.str.rpartition('_', expand=False)
-        exp = Series([('', '', 'abc'), ('', '', 'cde'), NA, ('', '', 'fgh')])
+        exp = Series([('', '', 'abc'), ('', '', 'cde'), NA,
+                      ('', '', 'fgh'), None])
         tm.assert_series_equal(result, exp)
 
         # unicode
@@ -2508,57 +2523,65 @@ def test_partition_series(self):
         assert result == [v.rpartition('_') for v in values]
 
     def test_partition_index(self):
-        values = Index(['a_b_c', 'c_d_e', 'f_g_h'])
+        # https://github.com/pandas-dev/pandas/issues/23558
+
+        values = Index(['a_b_c', 'c_d_e', 'f_g_h', np.nan, None])
 
         result = values.str.partition('_', expand=False)
-        exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_',
-                                                                     'g_h')]))
+        exp = Index(np.array([('a', '_', 'b_c'), ('c', '_', 'd_e'),
+                              ('f', '_', 'g_h'), np.nan, None]))
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 1
 
         result = values.str.rpartition('_', expand=False)
-        exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'), (
-            'f_g', '_', 'h')]))
+        exp = Index(np.array([('a_b', '_', 'c'), ('c_d', '_', 'e'),
+                              ('f_g', '_', 'h'), np.nan, None]))
         tm.assert_index_equal(result, exp)
         assert result.nlevels == 1
 
         result = values.str.partition('_')
-        exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'), ('f', '_', 'g_h')])
+        exp = Index([('a', '_', 'b_c'), ('c', '_', 'd_e'),
+                     ('f', '_', 'g_h'), (np.nan, np.nan, np.nan),
+                     (None, None, None)])
         tm.assert_index_equal(result, exp)
         assert isinstance(result, MultiIndex)
         assert result.nlevels == 3
 
         result = values.str.rpartition('_')
-        exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'), ('f_g', '_', 'h')])
+        exp = Index([('a_b', '_', 'c'), ('c_d', '_', 'e'),
+                     ('f_g', '_', 'h'), (np.nan, np.nan, np.nan),
+                     (None, None, None)])
         tm.assert_index_equal(result, exp)
         assert isinstance(result, MultiIndex)
         assert result.nlevels == 3
 
     def test_partition_to_dataframe(self):
-        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        # https://github.com/pandas-dev/pandas/issues/23558
+
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
         result = values.str.partition('_')
-        exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
-                         1: ['_', '_', np.nan, '_'],
-                         2: ['b_c', 'd_e', np.nan, 'g_h']})
+        exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
+                         1: ['_', '_', np.nan, '_', None],
+                         2: ['b_c', 'd_e', np.nan, 'g_h', None]})
         tm.assert_frame_equal(result, exp)
 
         result = values.str.rpartition('_')
-        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
-                         1: ['_', '_', np.nan, '_'],
-                         2: ['c', 'e', np.nan, 'h']})
+        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
+                         1: ['_', '_', np.nan, '_', None],
+                         2: ['c', 'e', np.nan, 'h', None]})
         tm.assert_frame_equal(result, exp)
 
-        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h'])
+        values = Series(['a_b_c', 'c_d_e', NA, 'f_g_h', None])
         result = values.str.partition('_', expand=True)
-        exp = DataFrame({0: ['a', 'c', np.nan, 'f'],
-                         1: ['_', '_', np.nan, '_'],
-                         2: ['b_c', 'd_e', np.nan, 'g_h']})
+        exp = DataFrame({0: ['a', 'c', np.nan, 'f', None],
+                         1: ['_', '_', np.nan, '_', None],
+                         2: ['b_c', 'd_e', np.nan, 'g_h', None]})
         tm.assert_frame_equal(result, exp)
 
         result = values.str.rpartition('_', expand=True)
-        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g'],
-                         1: ['_', '_', np.nan, '_'],
-                         2: ['c', 'e', np.nan, 'h']})
+        exp = DataFrame({0: ['a_b', 'c_d', np.nan, 'f_g', None],
+                         1: ['_', '_', np.nan, '_', None],
+                         2: ['c', 'e', np.nan, 'h', None]})
         tm.assert_frame_equal(result, exp)
 
     def test_partition_with_name(self):

Original file line number	Diff line number	Diff line change
`@@ -1270,8 +1270,8 @@ Numeric`
`1270`	`1270`	`Strings`
`1271`	`1271`	`^^^^^^^`
`1272`	`1272`
`1273`		`--`
`1274`		`--`
	`1273`	+- Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`).
	`1274`	+- Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`).
`1275`	`1275`	`-`
`1276`	`1276`
`1277`	`1277`	`Interval`