From 3b50b52f657a1b406ef335ced9e86adc9d145c46 Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Mon, 21 Oct 2013 12:02:35 -0700 Subject: [PATCH 1/3] TST/PERF: Re-write assert_almost_equal() in cython #4398 Add a testing.pyx cython file, and port assert_almost_equal() from python to cython. On my machine this brings a modest gain to the suite of "not slow" tests (160s -> 140s), but on assert_almost_equal() heavy tests, like test_expressions.py, it shows a large improvement (14s -> 4s). --- doc/source/release.rst | 1 + pandas/src/testing.pyx | 86 ++++++++++++++++++++++++++++++++++++++++++ pandas/util/testing.py | 73 ++++------------------------------- setup.py | 10 ++++- 4 files changed, 103 insertions(+), 67 deletions(-) create mode 100644 pandas/src/testing.pyx diff --git a/doc/source/release.rst b/doc/source/release.rst index 6eeaa55280e43..7171b48f4097a 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -414,6 +414,7 @@ See :ref:`Internal Refactoring` compatible. (:issue:`5213`, :issue:`5214`) - Unity ``dropna`` for Series/DataFrame signature (:issue:`5250`), tests from :issue:`5234`, courtesy of @rockg + - Rewrite assert_almost_equal() in cython for performance (:issue:`4398`) .. _release.bug_fixes-0.13.0: diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx new file mode 100644 index 0000000000000..324728b84b1e0 --- /dev/null +++ b/pandas/src/testing.pyx @@ -0,0 +1,86 @@ +import numpy as np + +from pandas import compat +from pandas.core.common import isnull + +cdef bint isiterable(obj): + return hasattr(obj, '__iter__') + +cdef bint decimal_almost_equal(double desired, double actual, int decimal): + # Code from + # http://docs.scipy.org/doc/numpy/reference/generated + # /numpy.testing.assert_almost_equal.html + return abs(desired - actual) < (0.5 * 10.0 ** -decimal) + +cpdef assert_dict_equal(a, b, bint compare_keys=True): + a_keys = frozenset(a.keys()) + b_keys = frozenset(b.keys()) + + if compare_keys: + assert a_keys == b_keys + + for k in a_keys: + assert_almost_equal(a[k], b[k]) + + return True + +cpdef assert_almost_equal(a, b, bint check_less_precise=False): + cdef: + int decimal + Py_ssize_t i, na, nb + double fa, fb + + if isinstance(a, dict) or isinstance(b, dict): + return assert_dict_equal(a, b) + + if isinstance(a, compat.string_types): + assert a == b, "%r != %r" % (a, b) + return True + + if isiterable(a): + assert isiterable(b), "First object is iterable, second isn't" + na, nb = len(a), len(b) + assert na == nb, "%s != %s" % (na, nb) + if (isinstance(a, np.ndarray) and + isinstance(b, np.ndarray) and + np.array_equal(a, b)): + return True + else: + for i in xrange(na): + assert_almost_equal(a[i], b[i], check_less_precise) + return True + + if isnull(a): + assert isnull(b), "First object is null, second isn't" + return True + + if isinstance(a, (bool, float, int, np.float32)): + decimal = 5 + + # deal with differing dtypes + if check_less_precise: + dtype_a = np.dtype(type(a)) + dtype_b = np.dtype(type(b)) + if dtype_a.kind == 'f' and dtype_b == 'f': + if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4: + decimal = 3 + + if np.isinf(a): + assert np.isinf(b), "First object is inf, second isn't" + else: + fa, fb = a, b + + # case for zero + if abs(fa) < 1e-5: + if not decimal_almost_equal(fa, fb, decimal): + assert False, ( + '(very low values) expected %.5f but got %.5f' % (b, a) + ) + else: + if not decimal_almost_equal(1, fb / fa, decimal): + assert False, 'expected %.5f but got %.5f' % (b, a) + + else: + assert a == b, "%s != %s" % (a, b) + + return True diff --git a/pandas/util/testing.py b/pandas/util/testing.py index 7a37be30f7bf6..be6f593da2043 100644 --- a/pandas/util/testing.py +++ b/pandas/util/testing.py @@ -37,6 +37,8 @@ from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex +from pandas import _testing + from pandas.io.common import urlopen Index = index.Index @@ -50,6 +52,11 @@ K = 4 _RAISE_NETWORK_ERROR_DEFAULT = False +# NOTE: don't pass an NDFrame or index to this function - may not handle it +# well. +assert_almost_equal = _testing.assert_almost_equal + +assert_dict_equal = _testing.assert_dict_equal def randbool(size=(), p=0.5): return rand(*size) <= p @@ -374,75 +381,9 @@ def assert_attr_equal(attr, left, right): def isiterable(obj): return hasattr(obj, '__iter__') - -# NOTE: don't pass an NDFrame or index to this function - may not handle it -# well. -def assert_almost_equal(a, b, check_less_precise=False): - if isinstance(a, dict) or isinstance(b, dict): - return assert_dict_equal(a, b) - - if isinstance(a, compat.string_types): - assert a == b, "%r != %r" % (a, b) - return True - - if isiterable(a): - np.testing.assert_(isiterable(b)) - na, nb = len(a), len(b) - assert na == nb, "%s != %s" % (na, nb) - if isinstance(a, np.ndarray) and isinstance(b, np.ndarray) and\ - np.array_equal(a, b): - return True - else: - for i in range(na): - assert_almost_equal(a[i], b[i], check_less_precise) - return True - - err_msg = lambda a, b: 'expected %.5f but got %.5f' % (b, a) - - if isnull(a): - np.testing.assert_(isnull(b)) - return - - if isinstance(a, (bool, float, int, np.float32)): - decimal = 5 - - # deal with differing dtypes - if check_less_precise: - dtype_a = np.dtype(type(a)) - dtype_b = np.dtype(type(b)) - if dtype_a.kind == 'f' and dtype_b == 'f': - if dtype_a.itemsize <= 4 and dtype_b.itemsize <= 4: - decimal = 3 - - if np.isinf(a): - assert np.isinf(b), err_msg(a, b) - - # case for zero - elif abs(a) < 1e-5: - np.testing.assert_almost_equal( - a, b, decimal=decimal, err_msg=err_msg(a, b), verbose=False) - else: - np.testing.assert_almost_equal( - 1, a / b, decimal=decimal, err_msg=err_msg(a, b), verbose=False) - else: - assert a == b, "%s != %s" % (a, b) - - def is_sorted(seq): return assert_almost_equal(seq, np.sort(np.array(seq))) - -def assert_dict_equal(a, b, compare_keys=True): - a_keys = frozenset(a.keys()) - b_keys = frozenset(b.keys()) - - if compare_keys: - assert(a_keys == b_keys) - - for k in a_keys: - assert_almost_equal(a[k], b[k]) - - def assert_series_equal(left, right, check_dtype=True, check_index_type=False, check_series_type=False, diff --git a/setup.py b/setup.py index c326d14f552e0..635da56d7339f 100755 --- a/setup.py +++ b/setup.py @@ -304,7 +304,8 @@ class CheckSDist(sdist): 'pandas/index.pyx', 'pandas/algos.pyx', 'pandas/parser.pyx', - 'pandas/src/sparse.pyx'] + 'pandas/src/sparse.pyx', + 'pandas/src/testing.pyx'] def initialize_options(self): sdist.initialize_options(self) @@ -464,6 +465,13 @@ def pxd(name): extensions.extend([sparse_ext]) +testing_ext = Extension('pandas._testing', + sources=[srcpath('testing', suffix=suffix)], + include_dirs=[], + libraries=libraries) + +extensions.extend([testing_ext]) + #---------------------------------------------------------------------- # msgpack stuff here From 81ef690e4d8bfd68b2404f699a6f684a1be8a415 Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Mon, 21 Oct 2013 12:02:44 -0700 Subject: [PATCH 2/3] TST: Fix edge cases in assert_almost_equal() + tests #4398 Many of the edge cases were related to ordering of the items, but in some cases there were also issues with type checking. This fixes both of those issues and massively expands the testing for this function. --- pandas/src/testing.pyx | 68 ++++++++++++++++++++++++--- pandas/tests/test_tests.py | 96 ++++++++++++++++++++++++++++++++++++-- 2 files changed, 154 insertions(+), 10 deletions(-) diff --git a/pandas/src/testing.pyx b/pandas/src/testing.pyx index 324728b84b1e0..b324c6652d58f 100644 --- a/pandas/src/testing.pyx +++ b/pandas/src/testing.pyx @@ -3,9 +3,36 @@ import numpy as np from pandas import compat from pandas.core.common import isnull +cdef NUMERIC_TYPES = ( + bool, + int, + float, + np.bool, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float16, + np.float32, + np.float64, +) + +cdef bint is_comparable_as_number(obj): + return isinstance(obj, NUMERIC_TYPES) + cdef bint isiterable(obj): return hasattr(obj, '__iter__') +cdef bint has_length(obj): + return hasattr(obj, '__len__') + +cdef bint is_dictlike(obj): + return hasattr(obj, 'keys') and hasattr(obj, '__getitem__') + cdef bint decimal_almost_equal(double desired, double actual, int decimal): # Code from # http://docs.scipy.org/doc/numpy/reference/generated @@ -13,6 +40,10 @@ cdef bint decimal_almost_equal(double desired, double actual, int decimal): return abs(desired - actual) < (0.5 * 10.0 ** -decimal) cpdef assert_dict_equal(a, b, bint compare_keys=True): + assert is_dictlike(a) and is_dictlike(b), ( + "Cannot compare dict objects, one or both is not dict-like" + ) + a_keys = frozenset(a.keys()) b_keys = frozenset(b.keys()) @@ -33,14 +64,24 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False): if isinstance(a, dict) or isinstance(b, dict): return assert_dict_equal(a, b) - if isinstance(a, compat.string_types): + if (isinstance(a, compat.string_types) or + isinstance(b, compat.string_types)): assert a == b, "%r != %r" % (a, b) return True if isiterable(a): - assert isiterable(b), "First object is iterable, second isn't" + assert isiterable(b), ( + "First object is iterable, second isn't: %r != %r" % (a, b) + ) + assert has_length(a) and has_length(b), ( + "Can't compare objects without length, one or both is invalid: " + "(%r, %r)" % (a, b) + ) + na, nb = len(a), len(b) - assert na == nb, "%s != %s" % (na, nb) + assert na == nb, ( + "Length of two iterators not the same: %r != %r" % (na, nb) + ) if (isinstance(a, np.ndarray) and isinstance(b, np.ndarray) and np.array_equal(a, b)): @@ -49,12 +90,27 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False): for i in xrange(na): assert_almost_equal(a[i], b[i], check_less_precise) return True + elif isiterable(b): + assert False, ( + "Second object is iterable, first isn't: %r != %r" % (a, b) + ) if isnull(a): - assert isnull(b), "First object is null, second isn't" + assert isnull(b), ( + "First object is null, second isn't: %r != %r" % (a, b) + ) + return True + elif isnull(b): + assert isnull(a), ( + "First object is not null, second is null: %r != %r" % (a, b) + ) return True - if isinstance(a, (bool, float, int, np.float32)): + if is_comparable_as_number(a): + assert is_comparable_as_number(b), ( + "First object is numeric, second is not: %r != %r" % (a, b) + ) + decimal = 5 # deal with differing dtypes @@ -81,6 +137,6 @@ cpdef assert_almost_equal(a, b, bint check_less_precise=False): assert False, 'expected %.5f but got %.5f' % (b, a) else: - assert a == b, "%s != %s" % (a, b) + assert a == b, "%r != %r" % (a, b) return True diff --git a/pandas/tests/test_tests.py b/pandas/tests/test_tests.py index 1890c2607fc89..fa295838d47e9 100644 --- a/pandas/tests/test_tests.py +++ b/pandas/tests/test_tests.py @@ -4,6 +4,7 @@ import unittest import warnings import nose +import numpy as np import sys from pandas.util.testing import ( @@ -12,12 +13,99 @@ # let's get meta. -class TestUtilTesting(unittest.TestCase): +class TestAssertAlmostEqual(unittest.TestCase): _multiprocess_can_split_ = True - def test_assert_almost_equal(self): - # don't die because values are not ndarrays - assert_almost_equal(1.1,1.1,check_less_precise=True) + def _assert_almost_equal_both(self, a, b, **kwargs): + assert_almost_equal(a, b, **kwargs) + assert_almost_equal(b, a, **kwargs) + + def _assert_not_almost_equal_both(self, a, b, **kwargs): + self.assertRaises(AssertionError, assert_almost_equal, a, b, **kwargs) + self.assertRaises(AssertionError, assert_almost_equal, b, a, **kwargs) + + def test_assert_almost_equal_numbers(self): + self._assert_almost_equal_both(1.1, 1.1) + self._assert_almost_equal_both(1.1, 1.100001) + self._assert_almost_equal_both(np.int16(1), 1.000001) + self._assert_almost_equal_both(np.float64(1.1), 1.1) + self._assert_almost_equal_both(np.uint32(5), 5) + + self._assert_not_almost_equal_both(1.1, 1) + self._assert_not_almost_equal_both(1.1, True) + self._assert_not_almost_equal_both(1, 2) + self._assert_not_almost_equal_both(1.0001, np.int16(1)) + + def test_assert_almost_equal_numbers_with_zeros(self): + self._assert_almost_equal_both(0, 0) + self._assert_almost_equal_both(0.000001, 0) + + self._assert_not_almost_equal_both(0.001, 0) + self._assert_not_almost_equal_both(1, 0) + + def test_assert_almost_equal_numbers_with_mixed(self): + self._assert_not_almost_equal_both(1, 'abc') + self._assert_not_almost_equal_both(1, [1,]) + self._assert_not_almost_equal_both(1, object()) + + def test_assert_almost_equal_dicts(self): + self._assert_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 2}) + + self._assert_not_almost_equal_both({'a': 1, 'b': 2}, {'a': 1, 'b': 3}) + self._assert_not_almost_equal_both( + {'a': 1, 'b': 2}, {'a': 1, 'b': 2, 'c': 3} + ) + self._assert_not_almost_equal_both({'a': 1}, 1) + self._assert_not_almost_equal_both({'a': 1}, 'abc') + self._assert_not_almost_equal_both({'a': 1}, [1,]) + + def test_assert_almost_equal_dict_like_object(self): + class DictLikeObj(object): + def keys(self): + return ('a',) + + def __getitem__(self, item): + if item == 'a': + return 1 + + self._assert_almost_equal_both({'a': 1}, DictLikeObj()) + + self._assert_not_almost_equal_both({'a': 2}, DictLikeObj()) + + def test_assert_almost_equal_strings(self): + self._assert_almost_equal_both('abc', 'abc') + + self._assert_not_almost_equal_both('abc', 'abcd') + self._assert_not_almost_equal_both('abc', 'abd') + self._assert_not_almost_equal_both('abc', 1) + self._assert_not_almost_equal_both('abc', [1,]) + + def test_assert_almost_equal_iterables(self): + self._assert_almost_equal_both([1, 2, 3], [1, 2, 3]) + self._assert_almost_equal_both(np.array([1, 2, 3]), [1, 2, 3]) + + # Can't compare generators + self._assert_not_almost_equal_both(iter([1, 2, 3]), [1, 2, 3]) + + self._assert_not_almost_equal_both([1, 2, 3], [1, 2, 4]) + self._assert_not_almost_equal_both([1, 2, 3], [1, 2, 3, 4]) + self._assert_not_almost_equal_both([1, 2, 3], 1) + + def test_assert_almost_equal_null(self): + self._assert_almost_equal_both(None, None) + self._assert_almost_equal_both(None, np.NaN) + + self._assert_not_almost_equal_both(None, 0) + self._assert_not_almost_equal_both(np.NaN, 0) + + def test_assert_almost_equal_inf(self): + self._assert_almost_equal_both(np.inf, np.inf) + self._assert_almost_equal_both(np.inf, float("inf")) + + self._assert_not_almost_equal_both(np.inf, 0) + +class TestUtilTesting(unittest.TestCase): + _multiprocess_can_split_ = True def test_raise_with_traceback(self): with assertRaisesRegexp(LookupError, "error_text"): From 850220f75910e6bc98d7254323a7ef71962424ea Mon Sep 17 00:00:00 2001 From: Dan Birken Date: Mon, 21 Oct 2013 12:02:48 -0700 Subject: [PATCH 3/3] CLN: Make the name of the test file more accurate --- pandas/tests/{test_tests.py => test_testing.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/{test_tests.py => test_testing.py} (100%) diff --git a/pandas/tests/test_tests.py b/pandas/tests/test_testing.py similarity index 100% rename from pandas/tests/test_tests.py rename to pandas/tests/test_testing.py