From f069fc2750827cd907d874754b3ac203a94031df Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 12:03:13 +0200 Subject: [PATCH 01/15] tests for creating series string dtype More specifically the cases that seem to have an issue are when: - the series is empty - it's a single element series --- pandas/tests/series/test_constructors.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index ce0cf0d5c089e..cd2d71c8832b5 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -134,6 +134,14 @@ def test_constructor_no_data_index_order(self): result = pd.Series(index=['b', 'a', 'c']) assert result.index.tolist() == ['b', 'a', 'c'] + def test_constructor_no_data_string_type(self): + result = pd.Series(index=[1], dtype=str) + assert result.isna().all() + + def test_constructor_single_element_string_type(self): + result = pd.Series(13, index=[1], dtype=str) + assert result.values.tolist() == ['13'] + def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 ser = Series(['x', None], dtype=string_dtype) From 062786f944a562d2f860fc4b68f2ac54a829fb1e Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 12:28:05 +0200 Subject: [PATCH 02/15] Closes #22477 Add a check so that if the dtype is str it will create an empty array of object dtype and then fill in the values. Add a test for an empty series to check that it fills the series with NaN and not with 'n'. Also add a test for cases where no string values are given. 
--- pandas/core/dtypes/cast.py | 13 ++++++++----- pandas/core/series.py | 2 -- pandas/tests/series/test_constructors.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index c7c6f89eb13a4..47fb6997f2a9d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1217,11 +1217,14 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): dtype = dtype.dtype # coerce if we have nan for an integer dtype - # GH 22858: only cast to float if an index - # (passed here as length) is specified - if length and is_integer_dtype(dtype) and isna(value): - dtype = np.float64 - subarr = np.empty(length, dtype=dtype) + if is_integer_dtype(dtype) and isna(value): + dtype = np.dtype('float64') + if isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): + subarr = np.empty(length, dtype=object) + if not isna(value): + value = str(value) + else: + subarr = np.empty(length, dtype=dtype) subarr.fill(value) return subarr diff --git a/pandas/core/series.py b/pandas/core/series.py index 20e4720a3bde7..0006dd2fcfc0d 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -148,7 +148,6 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): - # we are called internally, so short-circuit if fastpath: @@ -4210,7 +4209,6 @@ def _sanitize_array(data, index, dtype=None, copy=False, """ sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """ - if dtype is not None: dtype = pandas_dtype(dtype) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index cd2d71c8832b5..5c02ce1e09529 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,7 +137,7 @@ def test_constructor_no_data_index_order(self): def test_constructor_no_data_string_type(self): result = 
pd.Series(index=[1], dtype=str) assert result.isna().all() - + def test_constructor_single_element_string_type(self): result = pd.Series(13, index=[1], dtype=str) assert result.values.tolist() == ['13'] From a522d7f909be8791a4922d7ca9c9bf9a20248b55 Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 15:29:42 +0200 Subject: [PATCH 03/15] undo changes to series.py --- pandas/core/series.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0006dd2fcfc0d..20e4720a3bde7 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -148,6 +148,7 @@ class Series(base.IndexOpsMixin, generic.NDFrame): def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): + # we are called internally, so short-circuit if fastpath: @@ -4209,6 +4210,7 @@ def _sanitize_array(data, index, dtype=None, copy=False, """ sanitize input data to an ndarray, copy if specified, coerce to the dtype if specified """ + if dtype is not None: dtype = pandas_dtype(dtype) From c8667dd97f3b5a4d0a2cec2dc235a3d6e6b5363e Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 15:31:06 +0200 Subject: [PATCH 04/15] comment issue number under test To allow the developers to remember why the specific test was added --- pandas/tests/series/test_constructors.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 5c02ce1e09529..fc24c6200ce4c 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -135,10 +135,12 @@ def test_constructor_no_data_index_order(self): assert result.index.tolist() == ['b', 'a', 'c'] def test_constructor_no_data_string_type(self): + # GH 22477 result = pd.Series(index=[1], dtype=str) assert result.isna().all() def test_constructor_single_element_string_type(self): + # GH 22477 result = pd.Series(13, index=[1], dtype=str) assert 
result.values.tolist() == ['13'] From 4717e36219c3f5a521bcb139ed3b3ccefee311e9 Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 16:21:04 +0200 Subject: [PATCH 05/15] add test for strings --- pandas/tests/series/test_constructors.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index fc24c6200ce4c..b4c8494fe1429 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -144,6 +144,11 @@ def test_constructor_single_element_string_type(self): result = pd.Series(13, index=[1], dtype=str) assert result.values.tolist() == ['13'] + def test_constructor_string_series_string_type(self): + # GH 22477 + result = pd.Series('entry', index=[1], dtype=str) + assert result.values.tolist() == ['entry'] + def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 ser = Series(['x', None], dtype=string_dtype) From bdad7243b2a478dd98b770788d6c7e74b15ea250 Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 16:24:19 +0200 Subject: [PATCH 06/15] add test for unicode elements: fails This is currently failing. 
--- pandas/tests/series/test_constructors.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index b4c8494fe1429..0e522e82462db 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -144,11 +144,16 @@ def test_constructor_single_element_string_type(self): result = pd.Series(13, index=[1], dtype=str) assert result.values.tolist() == ['13'] - def test_constructor_string_series_string_type(self): + def test_constructor_string_element_string_type(self): # GH 22477 result = pd.Series('entry', index=[1], dtype=str) assert result.values.tolist() == ['entry'] + def test_constructor_unicode_element_string_type(self): + # GH 22477 + result = pd.Series(u'ѐ', index=[1], dtype=str) + assert result.values.tolist() == [u'ѐ'] + def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 ser = Series(['x', None], dtype=string_dtype) From 7691c8299a3e2f6987ed54ba9261bbc048c20d7b Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 16:50:03 +0200 Subject: [PATCH 07/15] except unicode in is_datetime64_dtype is_datetime64_dtype is trying to check the type of unicodes but numpy does not support unicode and this line breaks. 
Catch UnicodeEncodeError and return False. The test for unicode still fails on Python 2. --- pandas/core/dtypes/common.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 94e9b72b001b1..4a1599d649c46 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -394,6 +394,8 @@ def is_datetime64_dtype(arr_or_dtype): tipo = _get_dtype_type(arr_or_dtype) except TypeError: return False + except UnicodeEncodeError: + return False return issubclass(tipo, np.datetime64) From 00a7ed8e0aa7334ecd676335f449e3d26598b209 Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sat, 1 Sep 2018 16:51:51 +0200 Subject: [PATCH 08/15] series with dtype accept unicode This was breaking for python 2. The fix is to use pandas text_type to return string type --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 47fb6997f2a9d..ea92b66ebb44b 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1222,7 +1222,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): if isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): subarr = np.empty(length, dtype=object) if not isna(value): - value = str(value) + value = text_type(value) else: subarr = np.empty(length, dtype=dtype) subarr.fill(value) From e9a290d71398526e7bc5b88fa5bb5b5047600852 Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Tue, 4 Sep 2018 00:46:59 +0100 Subject: [PATCH 09/15] fixes failure with python2 --- pandas/core/dtypes/cast.py | 4 ++-- pandas/tests/series/test_constructors.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index ea92b66ebb44b..4b14529914294 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -6,7 +6,7 @@ from pandas._libs import lib, tslib, tslibs from pandas._libs.tslibs import 
OutOfBoundsDatetime, Period, iNaT -from pandas.compat import PY3, string_types, text_type +from pandas.compat import PY3, string_types, text_type, to_str from .common import ( _INT64_DTYPE, _NS_DTYPE, _POSSIBLY_CAST_DTYPES, _TD_DTYPE, _string_dtypes, @@ -1222,7 +1222,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): if isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): subarr = np.empty(length, dtype=object) if not isna(value): - value = text_type(value) + value = to_str(value) else: subarr = np.empty(length, dtype=dtype) subarr.fill(value) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 0e522e82462db..82a531e2a19bc 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -151,8 +151,8 @@ def test_constructor_string_element_string_type(self): def test_constructor_unicode_element_string_type(self): # GH 22477 - result = pd.Series(u'ѐ', index=[1], dtype=str) - assert result.values.tolist() == [u'ѐ'] + result = pd.Series('ѐ', index=[1], dtype=str) + assert result.values.tolist() == ['ѐ'] def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 From aa6b4a94e47876062d73d8cd4841845d2fdba4ae Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sun, 11 Nov 2018 20:33:55 +0000 Subject: [PATCH 10/15] tweak tests as requested on pr parametrize tests and use iloc to check value --- pandas/tests/series/test_constructors.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 82a531e2a19bc..6eb6f63de40da 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -137,22 +137,19 @@ def test_constructor_no_data_index_order(self): def test_constructor_no_data_string_type(self): # GH 22477 result = pd.Series(index=[1], dtype=str) - 
assert result.isna().all() + assert np.isnan(result.iloc[0]) - def test_constructor_single_element_string_type(self): + @pytest.mark.parametrize('item', ['13']) + def test_constructor_single_element_string_type(self, item): # GH 22477 - result = pd.Series(13, index=[1], dtype=str) - assert result.values.tolist() == ['13'] + result = pd.Series(int(item), index=[1], dtype=str) + assert result.iloc[0] == item - def test_constructor_string_element_string_type(self): + @pytest.mark.parametrize('item', ['entry', 'ѐ']) + def test_constructor_string_element_string_type(self, item): # GH 22477 - result = pd.Series('entry', index=[1], dtype=str) - assert result.values.tolist() == ['entry'] - - def test_constructor_unicode_element_string_type(self): - # GH 22477 - result = pd.Series('ѐ', index=[1], dtype=str) - assert result.values.tolist() == ['ѐ'] + result = pd.Series(item, index=[1], dtype=str) + assert result.iloc[0] == item def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 From ee854d7859b5b40f45592148e20408d9d51f24ac Mon Sep 17 00:00:00 2001 From: Nikoleta Glynatsi Date: Sun, 11 Nov 2018 20:35:29 +0000 Subject: [PATCH 11/15] style tweak --- pandas/core/dtypes/common.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 4a1599d649c46..2756753855071 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -392,9 +392,7 @@ def is_datetime64_dtype(arr_or_dtype): return False try: tipo = _get_dtype_type(arr_or_dtype) - except TypeError: - return False - except UnicodeEncodeError: + except (TypeError, UnicodeEncodeError) as e: return False return issubclass(tipo, np.datetime64) From fdad0c5c201b556e893e1bbeda102100bb74aef1 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 18 Nov 2018 17:39:59 -0500 Subject: [PATCH 12/15] fixup --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/dtypes/common.py | 2 +- 
pandas/tests/series/test_constructors.py | 10 ++-------- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index d6f9bb66e1e28..35c02e369aa8d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1431,6 +1431,7 @@ Reshaping - Bug in :func:`merge_asof` where confusing error message raised when attempting to merge with missing values (:issue:`23189`) - Bug in :meth:`DataFrame.nsmallest` and :meth:`DataFrame.nlargest` for dataframes that have a :class:`MultiIndex` for columns (:issue:`23033`). - Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`) +- Bug in ``Series`` construction when passing no data and ``dtype=str`` (:issue:`22477`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index f05106f7fa473..33177ac452414 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -419,7 +419,7 @@ def is_datetime64_dtype(arr_or_dtype): return False try: tipo = _get_dtype_type(arr_or_dtype) - except (TypeError, UnicodeEncodeError) as e: + except (TypeError, UnicodeEncodeError): return False return issubclass(tipo, np.datetime64) diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index 6eb6f63de40da..f5a445e2cca9a 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -139,17 +139,11 @@ def test_constructor_no_data_string_type(self): result = pd.Series(index=[1], dtype=str) assert np.isnan(result.iloc[0]) - @pytest.mark.parametrize('item', ['13']) - def test_constructor_single_element_string_type(self, item): - # GH 22477 - result = pd.Series(int(item), index=[1], dtype=str) - assert result.iloc[0] == item - - @pytest.mark.parametrize('item', ['entry', 'ѐ']) + @pytest.mark.parametrize('item', ['entry', 'ѐ', 13]) def 
test_constructor_string_element_string_type(self, item): # GH 22477 result = pd.Series(item, index=[1], dtype=str) - assert result.iloc[0] == item + assert result.iloc[0] == str(item) def test_constructor_dtype_str_na_values(self, string_dtype): # https://github.com/pandas-dev/pandas/issues/21083 From 9711d3505f7c1acb80a00ffaab3ae96c61ee3865 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Sun, 18 Nov 2018 18:35:38 -0500 Subject: [PATCH 13/15] fix test --- pandas/tests/frame/test_constructors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index c71d5d9f977f6..3dc3677afa32b 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -807,7 +807,7 @@ def test_constructor_corner_shape(self): @pytest.mark.parametrize("data, index, columns, dtype, expected", [ (None, lrange(10), ['a', 'b'], object, np.object_), - (None, None, ['a', 'b'], 'int64', np.dtype('int64')), + (None, None, ['a', 'b'], 'int64', np.dtype('float64')), (None, lrange(10), ['a', 'b'], int, np.dtype('float64')), ({}, None, ['foo', 'bar'], None, np.object_), ({'b': 1}, lrange(10), list('abc'), int, np.dtype('float64')) From 27701e0b9ee8ab19fe1247bde94226bcf8bd31a0 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Mon, 19 Nov 2018 20:27:21 -0500 Subject: [PATCH 14/15] fixup --- pandas/core/dtypes/cast.py | 4 ++-- pandas/tests/frame/test_constructors.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 4b14529914294..91c2a7f4c66da 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1217,9 +1217,9 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): dtype = dtype.dtype # coerce if we have nan for an integer dtype - if is_integer_dtype(dtype) and isna(value): + if length and is_integer_dtype(dtype) and isna(value): dtype = np.dtype('float64') - if 
isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): + elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): subarr = np.empty(length, dtype=object) if not isna(value): value = to_str(value) diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 3dc3677afa32b..c71d5d9f977f6 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -807,7 +807,7 @@ def test_constructor_corner_shape(self): @pytest.mark.parametrize("data, index, columns, dtype, expected", [ (None, lrange(10), ['a', 'b'], object, np.object_), - (None, None, ['a', 'b'], 'int64', np.dtype('float64')), + (None, None, ['a', 'b'], 'int64', np.dtype('int64')), (None, lrange(10), ['a', 'b'], int, np.dtype('float64')), ({}, None, ['foo', 'bar'], None, np.object_), ({'b': 1}, lrange(10), list('abc'), int, np.dtype('float64')) From 0692db00f91ea5e9a19d2f3cd02238dae4bd2935 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 20 Nov 2018 08:16:47 -0500 Subject: [PATCH 15/15] fixup --- pandas/core/dtypes/cast.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 91c2a7f4c66da..3c5f8830441f7 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1216,15 +1216,17 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): if not isinstance(dtype, (np.dtype, type(np.dtype))): dtype = dtype.dtype - # coerce if we have nan for an integer dtype if length and is_integer_dtype(dtype) and isna(value): + # coerce if we have nan for an integer dtype dtype = np.dtype('float64') elif isinstance(dtype, np.dtype) and dtype.kind in ("U", "S"): - subarr = np.empty(length, dtype=object) + # we need to coerce to object dtype + # to allow numpy to take our string as a scalar value + dtype = object if not isna(value): value = to_str(value) - else: - subarr = np.empty(length, dtype=dtype) + + subarr = 
np.empty(length, dtype=dtype) subarr.fill(value) return subarr