Skip to content

Commit 0f73f5f

Browse files
committed
Merge pull request #6378 from TomAugspurger/interpolate-ignore-good3
BUG: interpolate should preserve dtypes
2 parents c87a058 + 8d8d7a3 commit 0f73f5f

File tree

6 files changed

+110
-23
lines changed

6 files changed

+110
-23
lines changed

doc/source/release.rst

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ API Changes
6666
- ``df['col'] = value`` and ``df.loc[:,'col'] = value`` are now completely equivalent;
6767
previously the ``.loc`` would not necessarily coerce the dtype of the resultant series (:issue:`6149`)
6868
- ``dtypes`` and ``ftypes`` now return a series with ``dtype=object`` on empty containers (:issue:`5740`)
69-
69+
- The ``interpolate`` ``downcast`` keyword default has been changed from ``infer`` to
70+
``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`).
7071

7172
Experimental Features
7273
~~~~~~~~~~~~~~~~~~~~~
@@ -121,6 +122,7 @@ Bug Fixes
121122
- Bug in ``DataFrame.replace()`` when passing a nested ``dict`` that contained
122123
keys not in the values to be replaced (:issue:`6342`)
123124
- Bug in take with duplicate columns not consolidated (:issue:`6240`)
125+
- Bug in interpolate changing dtypes (:issue:`6290`)
124126

125127
pandas 0.13.1
126128
-------------

doc/source/v0.14.0.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ API changes
2929
df.iloc[:,2:3]
3030
df.iloc[:,1:3]
3131

32+
- The ``DataFrame.interpolate()`` ``downcast`` keyword default has been changed from ``infer`` to
33+
``None``. This is to preserve the original dtype unless explicitly requested otherwise (:issue:`6290`).
34+
3235
MultiIndexing Using Slicers
3336
~~~~~~~~~~~~~~~~~~~~~~~~~~~
3437

pandas/core/generic.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2435,7 +2435,7 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None,
24352435
return self._constructor(new_data).__finalize__(self)
24362436

24372437
def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
2438-
downcast='infer', **kwargs):
2438+
downcast=None, **kwargs):
24392439
"""
24402440
Interpolate values according to different methods.
24412441
@@ -2468,7 +2468,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
24682468
Maximum number of consecutive NaNs to fill.
24692469
inplace : bool, default False
24702470
Update the NDFrame in place if possible.
2471-
downcast : optional, 'infer' or None, defaults to 'infer'
2471+
downcast : optional, 'infer' or None, defaults to None
24722472
Downcast dtypes if possible.
24732473
24742474
Returns
@@ -2492,7 +2492,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
24922492
dtype: float64
24932493
24942494
"""
2495-
24962495
if self.ndim > 2:
24972496
raise NotImplementedError("Interpolate has not been implemented "
24982497
"on Panel and Panel 4D objects.")
@@ -2534,7 +2533,6 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False,
25342533
inplace=inplace,
25352534
downcast=downcast,
25362535
**kwargs)
2537-
25382536
if inplace:
25392537
if axis == 1:
25402538
self._update_inplace(new_data)

pandas/core/internals.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -805,13 +805,25 @@ def interpolate(self, method='pad', axis=0, index=None,
805805
values=None, inplace=False, limit=None,
806806
fill_value=None, coerce=False, downcast=None, **kwargs):
807807

808+
def check_int_bool(self, inplace):
809+
# Only FloatBlocks will contain NaNs.
810+
# timedelta subclasses IntBlock
811+
if (self.is_bool or self.is_integer) and not self.is_timedelta:
812+
if inplace:
813+
return self
814+
else:
815+
return self.copy()
816+
808817
# a fill na type method
809818
try:
810819
m = com._clean_fill_method(method)
811820
except:
812821
m = None
813822

814823
if m is not None:
824+
r = check_int_bool(self, inplace)
825+
if r is not None:
826+
return r
815827
return self._interpolate_with_fill(method=m,
816828
axis=axis,
817829
inplace=inplace,
@@ -826,6 +838,9 @@ def interpolate(self, method='pad', axis=0, index=None,
826838
m = None
827839

828840
if m is not None:
841+
r = check_int_bool(self, inplace)
842+
if r is not None:
843+
return r
829844
return self._interpolate(method=m,
830845
index=index,
831846
values=values,

pandas/tests/test_generic.py

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -459,7 +459,10 @@ def test_interpolate(self):
459459
self.assert_numpy_array_equal(time_interp, ord_ts)
460460

461461
# try time interpolation on a non-TimeSeries
462-
self.assertRaises(ValueError, self.series.interpolate, method='time')
462+
# Only raises ValueError if there are NaNs.
463+
non_ts = self.series.copy()
464+
non_ts[0] = np.NaN
465+
self.assertRaises(ValueError, non_ts.interpolate, method='time')
463466

464467
def test_interp_regression(self):
465468
_skip_if_no_scipy()
@@ -512,7 +515,7 @@ def test_interpolate_non_ts(self):
512515
def test_nan_interpolate(self):
513516
s = Series([0, 1, np.nan, 3])
514517
result = s.interpolate()
515-
expected = Series([0, 1, 2, 3])
518+
expected = Series([0., 1., 2., 3.])
516519
assert_series_equal(result, expected)
517520

518521
_skip_if_no_scipy()
@@ -522,20 +525,20 @@ def test_nan_interpolate(self):
522525
def test_nan_irregular_index(self):
523526
s = Series([1, 2, np.nan, 4], index=[1, 3, 5, 9])
524527
result = s.interpolate()
525-
expected = Series([1, 2, 3, 4], index=[1, 3, 5, 9])
528+
expected = Series([1., 2., 3., 4.], index=[1, 3, 5, 9])
526529
assert_series_equal(result, expected)
527530

528531
def test_nan_str_index(self):
529532
s = Series([0, 1, 2, np.nan], index=list('abcd'))
530533
result = s.interpolate()
531-
expected = Series([0, 1, 2, 2], index=list('abcd'))
534+
expected = Series([0., 1., 2., 2.], index=list('abcd'))
532535
assert_series_equal(result, expected)
533536

534537
def test_interp_quad(self):
535538
_skip_if_no_scipy()
536539
sq = Series([1, 4, np.nan, 16], index=[1, 2, 3, 4])
537540
result = sq.interpolate(method='quadratic')
538-
expected = Series([1, 4, 9, 16], index=[1, 2, 3, 4])
541+
expected = Series([1., 4., 9., 16.], index=[1, 2, 3, 4])
539542
assert_series_equal(result, expected)
540543

541544
def test_interp_scipy_basic(self):
@@ -545,18 +548,30 @@ def test_interp_scipy_basic(self):
545548
expected = Series([1., 3., 7.5, 12., 18.5, 25.])
546549
result = s.interpolate(method='slinear')
547550
assert_series_equal(result, expected)
551+
552+
result = s.interpolate(method='slinear', donwcast='infer')
553+
assert_series_equal(result, expected)
548554
# nearest
549555
expected = Series([1, 3, 3, 12, 12, 25])
550556
result = s.interpolate(method='nearest')
557+
assert_series_equal(result, expected.astype('float'))
558+
559+
result = s.interpolate(method='nearest', downcast='infer')
551560
assert_series_equal(result, expected)
552561
# zero
553562
expected = Series([1, 3, 3, 12, 12, 25])
554563
result = s.interpolate(method='zero')
564+
assert_series_equal(result, expected.astype('float'))
565+
566+
result = s.interpolate(method='zero', downcast='infer')
555567
assert_series_equal(result, expected)
556568
# quadratic
557569
expected = Series([1, 3., 6.769231, 12., 18.230769, 25.])
558570
result = s.interpolate(method='quadratic')
559571
assert_series_equal(result, expected)
572+
573+
result = s.interpolate(method='quadratic', downcast='infer')
574+
assert_series_equal(result, expected)
560575
# cubic
561576
expected = Series([1., 3., 6.8, 12., 18.2, 25.])
562577
result = s.interpolate(method='cubic')
@@ -585,7 +600,6 @@ def test_interp_multiIndex(self):
585600

586601
expected = s.copy()
587602
expected.loc[2] = 2
588-
expected = expected.astype(np.int64)
589603
result = s.interpolate()
590604
assert_series_equal(result, expected)
591605

@@ -595,15 +609,15 @@ def test_interp_multiIndex(self):
595609

596610
def test_interp_nonmono_raise(self):
597611
_skip_if_no_scipy()
598-
s = pd.Series([1, 2, 3], index=[0, 2, 1])
612+
s = Series([1, np.nan, 3], index=[0, 2, 1])
599613
with tm.assertRaises(ValueError):
600614
s.interpolate(method='krogh')
601615

602616
def test_interp_datetime64(self):
603617
_skip_if_no_scipy()
604618
df = Series([1, np.nan, 3], index=date_range('1/1/2000', periods=3))
605619
result = df.interpolate(method='nearest')
606-
expected = Series([1, 1, 3], index=date_range('1/1/2000', periods=3))
620+
expected = Series([1., 1., 3.], index=date_range('1/1/2000', periods=3))
607621
assert_series_equal(result, expected)
608622

609623
class TestDataFrame(tm.TestCase, Generic):
@@ -639,7 +653,7 @@ def test_get_numeric_data_preserve_dtype(self):
639653
def test_interp_basic(self):
640654
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [1, 4, 9, np.nan],
641655
'C': [1, 2, 3, 5], 'D': list('abcd')})
642-
expected = DataFrame({'A': [1, 2, 3, 4], 'B': [1, 4, 9, 9],
656+
expected = DataFrame({'A': [1., 2., 3., 4.], 'B': [1., 4., 9., 9.],
643657
'C': [1, 2, 3, 5], 'D': list('abcd')})
644658
result = df.interpolate()
645659
assert_frame_equal(result, expected)
@@ -648,8 +662,6 @@ def test_interp_basic(self):
648662
expected = df.set_index('C')
649663
expected.A.loc[3] = 3
650664
expected.B.loc[5] = 9
651-
expected[['A', 'B']] = expected[['A', 'B']].astype(np.int64)
652-
653665
assert_frame_equal(result, expected)
654666

655667
def test_interp_bad_method(self):
@@ -663,9 +675,14 @@ def test_interp_combo(self):
663675
'C': [1, 2, 3, 5], 'D': list('abcd')})
664676

665677
result = df['A'].interpolate()
678+
expected = Series([1., 2., 3., 4.])
679+
assert_series_equal(result, expected)
680+
681+
result = df['A'].interpolate(downcast='infer')
666682
expected = Series([1, 2, 3, 4])
667683
assert_series_equal(result, expected)
668684

685+
669686
def test_interp_nan_idx(self):
670687
df = DataFrame({'A': [1, 2, np.nan, 4], 'B': [np.nan, 2, 3, 4]})
671688
df = df.set_index('A')
@@ -722,13 +739,16 @@ def test_interp_alt_scipy(self):
722739
expected = df.copy()
723740
expected['A'].iloc[2] = 3
724741
expected['A'].iloc[5] = 6
742+
assert_frame_equal(result, expected)
743+
744+
result = df.interpolate(method='barycentric', downcast='infer')
725745
assert_frame_equal(result, expected.astype(np.int64))
726746

727747
result = df.interpolate(method='krogh')
728748
expectedk = df.copy()
729-
expectedk['A'].iloc[2] = 3
730-
expectedk['A'].iloc[5] = 6
731-
expectedk['A'] = expected['A'].astype(np.int64)
749+
# expectedk['A'].iloc[2] = 3
750+
# expectedk['A'].iloc[5] = 6
751+
expectedk['A'] = expected['A']
732752
assert_frame_equal(result, expectedk)
733753

734754
_skip_if_no_pchip()
@@ -786,9 +806,32 @@ def test_interp_raise_on_only_mixed(self):
786806

787807
def test_interp_inplace(self):
788808
df = DataFrame({'a': [1., 2., np.nan, 4.]})
789-
expected = DataFrame({'a': [1, 2, 3, 4]})
790-
df['a'].interpolate(inplace=True)
791-
assert_frame_equal(df, expected)
809+
expected = DataFrame({'a': [1., 2., 3., 4.]})
810+
result = df.copy()
811+
result['a'].interpolate(inplace=True)
812+
assert_frame_equal(result, expected)
813+
814+
result = df.copy()
815+
result['a'].interpolate(inplace=True, downcast='infer')
816+
assert_frame_equal(result, expected.astype('int'))
817+
818+
def test_interp_ignore_all_good(self):
819+
# GH
820+
df = DataFrame({'A': [1, 2, np.nan, 4],
821+
'B': [1, 2, 3, 4],
822+
'C': [1., 2., np.nan, 4.],
823+
'D': [1., 2., 3., 4.]})
824+
expected = DataFrame({'A': np.array([1, 2, 3, 4], dtype='float'),
825+
'B': np.array([1, 2, 3, 4], dtype='int'),
826+
'C': np.array([1., 2., 3, 4.], dtype='float'),
827+
'D': np.array([1., 2., 3., 4.], dtype='float')})
828+
829+
result = df.interpolate(downcast=None)
830+
assert_frame_equal(result, expected)
831+
832+
# all good
833+
result = df[['B', 'D']].interpolate(downcast=None)
834+
assert_frame_equal(result, df[['B', 'D']])
792835

793836
def test_no_order(self):
794837
_skip_if_no_scipy()
@@ -802,7 +845,7 @@ def test_spline(self):
802845
_skip_if_no_scipy()
803846
s = Series([1, 2, np.nan, 4, 5, np.nan, 7])
804847
result = s.interpolate(method='spline', order=1)
805-
expected = Series([1, 2, 3, 4, 5, 6, 7])
848+
expected = Series([1., 2., 3., 4., 5., 6., 7.])
806849
assert_series_equal(result, expected)
807850

808851
def test_metadata_propagation_indiv(self):

vb_suite/frame_methods.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,3 +403,29 @@ def test_unequal(name):
403403
frame_object_unequal = Benchmark('test_unequal("object_df")', setup)
404404
frame_nonunique_unequal = Benchmark('test_unequal("nonunique_cols")', setup)
405405

406+
#-----------------------------------------------------------------------------
407+
# interpolate
408+
# this is the worst case, where every column has NaNs.
409+
setup = common_setup + """
410+
df = DataFrame(randn(10000, 100))
411+
df.values[::2] = np.nan
412+
"""
413+
414+
frame_interpolate = Benchmark('df.interpolate()', setup,
415+
start_date=datetime(2014, 2, 7))
416+
417+
setup = common_setup + """
418+
df = DataFrame({'A': np.arange(0, 10000),
419+
'B': np.random.randint(0, 100, 10000),
420+
'C': randn(10000),
421+
'D': randn(10000)})
422+
df.loc[1::5, 'A'] = np.nan
423+
df.loc[1::5, 'C'] = np.nan
424+
"""
425+
426+
frame_interpolate_some_good = Benchmark('df.interpolate()', setup,
427+
start_date=datetime(2014, 2, 7))
428+
frame_interpolate_some_good_infer = Benchmark('df.interpolate(downcast="infer")',
429+
setup,
430+
start_date=datetime(2014, 2, 7))
431+

0 commit comments

Comments
 (0)