diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
index 32b548e5f32f1..0baa0a307c988 100644
--- a/doc/source/whatsnew/v0.22.0.txt
+++ b/doc/source/whatsnew/v0.22.0.txt
@@ -189,6 +189,7 @@ Other API Changes
 - The default NA value for :class:`UInt64Index` has changed from 0 to ``NaN``, which impacts methods that mask with NA, such as ``UInt64Index.where()`` (:issue:`18398`)
 - Refactored ``setup.py`` to use ``find_packages`` instead of explicitly listing out all subpackages (:issue:`18535`)
 - Rearranged the order of keyword arguments in :func:`read_excel()` to align with :func:`read_csv()` (:pr:`16672`)
+- :func:`pandas.merge` now raises a ``ValueError`` when trying to merge on incompatible data types (:issue:`9780`)
 
 .. _whatsnew_0220.deprecations:
 
diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py
index bad7088a126cf..455c6f42ac74a 100644
--- a/pandas/core/reshape/merge.py
+++ b/pandas/core/reshape/merge.py
@@ -27,6 +27,7 @@
     is_dtype_equal,
     is_bool,
     is_list_like,
+    is_datetimelike,
     _ensure_int64,
     _ensure_float64,
     _ensure_object,
@@ -962,6 +963,33 @@ def _maybe_coerce_merge_keys(self):
             elif lib.infer_dtype(lk) == lib.infer_dtype(rk):
                 pass
 
+            # Check if we are trying to merge on obviously
+            # incompatible dtypes GH 9780
+            elif is_numeric_dtype(lk) and not is_numeric_dtype(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+            elif not is_numeric_dtype(lk) and is_numeric_dtype(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+            elif is_datetimelike(lk) and not is_datetimelike(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+            elif not is_datetimelike(lk) and is_datetimelike(rk):
+                msg = ("You are trying to merge on {lk_dtype} and "
+                       "{rk_dtype} columns. If you wish to proceed "
+                       "you should use pd.concat".format(lk_dtype=lk.dtype,
+                                                         rk_dtype=rk.dtype))
+                raise ValueError(msg)
+
             # Houston, we have a problem!
             # let's coerce to object if the dtypes aren't
             # categorical, otherwise coerce to the category
diff --git a/pandas/tests/reshape/merge/test_merge.py b/pandas/tests/reshape/merge/test_merge.py
index 6f2d2ce2a8583..70b84f7a6225b 100644
--- a/pandas/tests/reshape/merge/test_merge.py
+++ b/pandas/tests/reshape/merge/test_merge.py
@@ -6,6 +6,7 @@
 from numpy import nan
 import numpy as np
 import random
+import re
 
 import pandas as pd
 from pandas.compat import lrange, lzip
@@ -1370,30 +1371,47 @@ def f():
         pytest.raises(NotImplementedError, f)
 
 
-@pytest.fixture
-def df():
-    return DataFrame(
-        {'A': ['foo', 'bar'],
-         'B': Series(['foo', 'bar']).astype('category'),
-         'C': [1, 2],
-         'D': [1.0, 2.0],
-         'E': Series([1, 2], dtype='uint64'),
-         'F': Series([1, 2], dtype='int32')})
-
-
 class TestMergeDtypes(object):
 
-    def test_different(self, df):
-
-        # we expect differences by kind
-        # to be ok, while other differences should return object
-
-        left = df
-        for col in df.columns:
-            right = DataFrame({'A': df[col]})
+    @pytest.mark.parametrize('right_vals', [
+        ['foo', 'bar'],
+        Series(['foo', 'bar']).astype('category'),
+        [1, 2],
+        [1.0, 2.0],
+        Series([1, 2], dtype='uint64'),
+        Series([1, 2], dtype='int32')
+    ]
+    )
+    def test_different(self, right_vals):
+
+        left = DataFrame({'A': ['foo', 'bar'],
+                          'B': Series(['foo', 'bar']).astype('category'),
+                          'C': [1, 2],
+                          'D': [1.0, 2.0],
+                          'E': Series([1, 2], dtype='uint64'),
+                          'F': Series([1, 2], dtype='int32')})
+        right = DataFrame({'A': right_vals})
+
+        # GH 9780
+        # We allow merging on object and categorical cols and cast
+        # categorical cols to object
+        if (is_categorical_dtype(right['A'].dtype) or
+           is_object_dtype(right['A'].dtype)):
             result = pd.merge(left, right, on='A')
             assert is_object_dtype(result.A.dtype)
 
+        # GH 9780
+        # We raise for merging on object col and int/float col and
+        # merging on categorical col and int/float col
+        else:
+            msg = ("You are trying to merge on "
+                   "{lk_dtype} and {rk_dtype} columns. "
+                   "If you wish to proceed you should use "
+                   "pd.concat".format(lk_dtype=left['A'].dtype,
+                                      rk_dtype=right['A'].dtype))
+            with tm.assert_raises_regex(ValueError, msg):
+                pd.merge(left, right, on='A')
+
     @pytest.mark.parametrize('d1', [np.int64, np.int32,
                                     np.int16, np.int8, np.uint8])
     @pytest.mark.parametrize('d2', [np.int64, np.float64,
@@ -1462,6 +1480,42 @@ def test_merge_on_ints_floats_warning(self):
             result = B.merge(A, left_on='Y', right_on='X')
             assert_frame_equal(result, expected[['Y', 'X']])
 
+    @pytest.mark.parametrize('df1_vals, df2_vals', [
+        ([0, 1, 2], ["0", "1", "2"]),
+        ([0.0, 1.0, 2.0], ["0", "1", "2"]),
+        ([0, 1, 2], [u"0", u"1", u"2"]),
+        (pd.date_range('1/1/2011', periods=2, freq='D'), ['2011-01-01',
+                                                          '2011-01-02']),
+        (pd.date_range('1/1/2011', periods=2, freq='D'), [0, 1]),
+        (pd.date_range('1/1/2011', periods=2, freq='D'), [0.0, 1.0]),
+        ([0, 1, 2], Series(['a', 'b', 'a']).astype('category')),
+        ([0.0, 1.0, 2.0], Series(['a', 'b', 'a']).astype('category')),
+    ])
+    def test_merge_incompat_dtypes(self, df1_vals, df2_vals):
+        # GH 9780
+        # Raise a ValueError when a user tries to merge on
+        # dtypes that are incompatible (e.g., obj and int/float)
+
+        df1 = DataFrame({'A': df1_vals})
+        df2 = DataFrame({'A': df2_vals})
+
+        msg = ("You are trying to merge on {lk_dtype} and "
+               "{rk_dtype} columns. If you wish to proceed "
+               "you should use pd.concat".format(lk_dtype=df1['A'].dtype,
+                                                 rk_dtype=df2['A'].dtype))
+        msg = re.escape(msg)
+        with tm.assert_raises_regex(ValueError, msg):
+            pd.merge(df1, df2, on=['A'])
+
+        # Check that error still raised when swapping order of dataframes
+        msg = ("You are trying to merge on {lk_dtype} and "
+               "{rk_dtype} columns. If you wish to proceed "
+               "you should use pd.concat".format(lk_dtype=df2['A'].dtype,
+                                                 rk_dtype=df1['A'].dtype))
+        msg = re.escape(msg)
+        with tm.assert_raises_regex(ValueError, msg):
+            pd.merge(df2, df1, on=['A'])
+
 
 @pytest.fixture
 def left():