BUG: unique() should preserve the dtype of the input

stuarteberg · stuarteberg · commit ab66315b200a · 2019-08-14T11:23:55.000-04:00
diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst
@@ -156,7 +156,7 @@ ExtensionArray
 Other
 ^^^^^
 - Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`)
--
+- The returned dtype of :func:`pd.unique` now matches the input dtype. (:issue:`27874`)
 -
 -
 
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -404,7 +404,7 @@ def unique(values):
 
     table = htable(len(values))
     uniques = table.unique(values)
-    uniques = _reconstruct_data(uniques, dtype, original)
+    uniques = _reconstruct_data(uniques, original.dtype, original)
     return uniques
 
 
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py
@@ -159,8 +159,8 @@ def test_memory_usage(self):
 class Ops:
     def _allow_na_ops(self, obj):
         """Whether to skip test cases including NaN"""
-        if isinstance(obj, Index) and (obj.is_boolean() or not obj._can_hold_na):
-            # don't test boolean / int64 index
+        if (isinstance(obj, Index) and obj.is_boolean()) or not obj._can_hold_na:
+            # don't test boolean / integer dtypes
             return False
         return True
 
@@ -187,7 +187,24 @@ def setup_method(self, method):
         types = ["bool", "int", "float", "dt", "dt_tz", "period", "string", "unicode"]
         self.indexes = [getattr(self, "{}_index".format(t)) for t in types]
         self.series = [getattr(self, "{}_series".format(t)) for t in types]
-        self.objs = self.indexes + self.series
+
+        # To test narrow dtypes, we use narrower *data* elements, not *index* elements
+        index = self.int_index
+        self.float32_series = Series(arr.astype(np.float32), index=index, name="a")
+
+        arr_int = np.random.choice(10, size=10, replace=False)
+        self.int8_series = Series(arr_int.astype(np.int8), index=index, name="a")
+        self.int16_series = Series(arr_int.astype(np.int16), index=index, name="a")
+        self.int32_series = Series(arr_int.astype(np.int32), index=index, name="a")
+
+        self.uint8_series = Series(arr_int.astype(np.uint8), index=index, name="a")
+        self.uint16_series = Series(arr_int.astype(np.uint16), index=index, name="a")
+        self.uint32_series = Series(arr_int.astype(np.uint32), index=index, name="a")
+
+        nrw_types = ["float32", "int8", "int16", "int32", "uint8", "uint16", "uint32"]
+        self.narrow_series = [getattr(self, "{}_series".format(t)) for t in nrw_types]
+
+        self.objs = self.indexes + self.series + self.narrow_series
 
     def check_ops_properties(self, props, filter=None, ignore_failures=False):
         for op in props:
@@ -385,6 +402,7 @@ def test_value_counts_unique_nunique(self):
             if isinstance(o, Index):
                 assert isinstance(result, o.__class__)
                 tm.assert_index_equal(result, orig)
+                assert result.dtype == orig.dtype
             elif is_datetime64tz_dtype(o):
                 # datetimetz Series returns array of Timestamp
                 assert result[0] == orig[0]
@@ -396,6 +414,7 @@ def test_value_counts_unique_nunique(self):
                 )
             else:
                 tm.assert_numpy_array_equal(result, orig.values)
+                assert result.dtype == orig.dtype
 
             assert o.nunique() == len(np.unique(o.values))
 
@@ -904,7 +923,7 @@ def test_fillna(self):
 
                 expected = [fill_value] * 2 + list(values[2:])
 
-                expected = klass(expected)
+                expected = klass(expected, dtype=orig.dtype)
                 o = klass(values)
 
                 # check values has the same dtype as the original

Original file line number	Diff line number	Diff line change
`@@ -156,7 +156,7 @@ ExtensionArray`
`156`	`156`	`Other`
`157`	`157`	`^^^^^`
`158`	`158`	- Bug in :meth:`Series.replace` and :meth:`DataFrame.replace` when replacing timezone-aware timestamps using a dict-like replacer (:issue:`27720`)
`159`		`--`
	`159`	+- The returned dtype of :func:`pd.unique` now matches the input dtype. (:issue:`27874`)
`160`	`160`	`-`
`161`	`161`	`-`
`162`	`162`