diff --git a/bench/bench_existence.py b/bench/bench_existence.py
new file mode 100644
index 0000000000000..a8487cdcd76ec
--- /dev/null
+++ b/bench/bench_existence.py
@@ -0,0 +1,285 @@
+from __future__ import division
+
+import os
+import sys
+from itertools import cycle
+
+from timeit import Timer
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from bokeh.mpl import to_bokeh
+from numpy.random import randint
+
+from mpltools import style
+style.use('ggplot')
+
+class ExistenceBenchmarks(object):
+
+
+    def time_py_dict(look_for, look_in):
+        df_look_for = pd.DataFrame(look_for, columns=['data'])
+        dict_look_in = dict(zip(look_in, look_in))
+
+        def time_this():
+            result = df_look_for[[x in dict_look_in for x in df_look_for.data]]
+            return result.drop_duplicates().sort('data')
+
+        return time_this
+
+
+    def time_isin_list(look_for, look_in):
+        df_look_for = pd.DataFrame(look_for, columns=['data'])
+        list_look_in = list(look_in)
+
+        def time_this():
+            result = df_look_for[df_look_for.data.isin(list_look_in)]
+            return result.drop_duplicates().sort('data')
+
+        return time_this
+
+
+    def time_isin_dict(look_for, look_in):
+        df_look_for = pd.DataFrame(look_for, columns=['data'])
+        dict_look_in = dict(zip(look_in, look_in))
+
+        def time_this():
+            result = df_look_for[df_look_for.data.isin(dict_look_in)]
+            return result.drop_duplicates().sort('data')
+
+        return time_this
+
+
+    def time_isin_series(look_for, look_in):
+        series_look_in = pd.Series(look_in)
+        df_look_for = pd.DataFrame(look_for, columns=['data'])
+
+        def time_this():
+            result = df_look_for[df_look_for.data.isin(series_look_in)]
+            return result.drop_duplicates().sort('data')
+
+        return time_this
+
+
+    def time_join(look_for, look_in):
+        series_look_in = pd.Series(look_in, index=look_in)
+        series_look_in.name = 'series_data'
+        df_look_for = pd.DataFrame(look_for, columns=['data'], index=look_for)
+
+        def time_this():
+            result = df_look_for.join(series_look_in, how='inner')
+            return result.drop_duplicates()
+
+        return time_this
+
+
+    def time_join_no_dups(look_for, look_in):
+        series_look_in = pd.Series(look_in, index=look_in)
+        series_look_in.name = 'series_data'
+        df_look_for = pd.DataFrame(look_for, columns=['data'], index=look_for)
+
+        def time_this():
+            df_look_for.drop_duplicates(inplace=True)
+            series_look_in.drop_duplicates(inplace=True)
+            result = df_look_for.join(series_look_in, how='inner')
+            return result.sort('data')
+
+        return time_this
+
+
+    def time_query_in(look_for, look_in):
+        series_look_in = pd.Series(look_in)
+        series_look_in.name = 'data'
+        df_look_for = pd.DataFrame(look_for, columns=['data'])
+
+        def time_this():
+            # series_look_in is not visible to .query unless defined in local function scope.
+            s_look_in = series_look_in
+            result = df_look_for.query('data in @s_look_in')
+            return result.drop_duplicates().sort('data')
+
+        return time_this
+
+
+def run_bench(to_time, repeat, look_sets, x_axis, linestyle='-'):
+    func_results = []
+    markers = cycle(['o', 's', '+', '^', 'v', 'x', 'D', '*'])
+
+    for time_func_name in to_time:
+        marker=markers.next()
+        colors = cycle(['b', 'g', 'r', 'c', 'm', 'y', 'k'])
+        for set_name, look_set in look_sets:
+            color=colors.next()
+            plot_results = []
+            for look_for, look_in in look_set:
+                func = ExistenceBenchmarks.__dict__[time_func_name](look_for, look_in)
+                result = func()
+                t = Timer(func)
+                elapsed = t.timeit(number=repeat) / repeat
+                name = time_func_name.replace('time_', '') + ' ' + set_name + ' (%.1f%%)' % ((len(result) / len(look_for)) * 100)
+                func_results.append((name, look_for, look_in, elapsed))
+                plot_results.append(elapsed)
+            plt.plot(x_axis, plot_results, marker=marker, color=color, label=name, linestyle=linestyle)
+
+
+def test_timed(to_time):
+    look_for = randint(0, 10000, 5000)
+    look_in = randint(5000, 15000, 5000)
+
+    first_result = ExistenceBenchmarks.__dict__[to_time[0]](look_for, look_in)()
+
+    for time_func_name in to_time[1:]:
+        func = ExistenceBenchmarks.__dict__[time_func_name](look_for, look_in)
+        result = func()
+        if np.array_equal(first_result['data'].values, result['data'].values):
+            pass
+        else:
+            raise AssertionError("%s and %s have unmatched output." % (to_time[0], time_func_name))
+
+
+if __name__ == '__main__':
+
+    pandas_dir = os.path.dirname(os.path.abspath(os.path.dirname(__file__)))
+    static_path = os.path.join(pandas_dir, 'doc', 'source', '_static')
+    join_path = lambda p: os.path.join(static_path, p)
+
+    to_time = [key for key in ExistenceBenchmarks.__dict__ if key.startswith('time_')]
+
+
+    if len(sys.argv) != 2:
+        print 'usage: <--test, --run>'
+        print '\t--test : Ensure that all timed functions are returning identical output.'
+        print '\t--run : Generate plots for all timed functions.'
+        sys.exit()
+
+    if sys.argv[1] == '--test':
+        test_timed(to_time)
+
+    elif sys.argv[1] == '--run':
+        test_timed(to_time)
+
+        def save_plot(filename, subtitle):
+            fname = join_path(filename)
+            plt.axes().set_xscale('log')
+            x1,x2,y1,y2 = plt.axis()
+            # plt.axis((x1, x2, 0, y_limit))
+            plt.legend(loc=2, prop={'size':8})
+            plt.title('Existence Comparisons%s' % subtitle)
+            plt.xlabel('% Overlap of X Elements')
+            plt.ylabel('Time(s)')
+            plt.savefig(fname)
+            plt.clf()
+
+        def unordered(exp_range, repeat):
+            rng = [2**x for x in exp_range]
+
+            # 25% overlap
+            look_set_25 = \
+                [(randint(0, 100*i, 50*i), randint(75*i, 175*i, 50*i)) for i in rng]
+
+            look_set_50 = \
+                [(randint(0, 100*i, 50*i), randint(50*i, 150*i, 50*i)) for i in rng]
+
+            look_set_75 = \
+                [(randint(0, 100*i, 50*i), randint(25*i, 125*i, 50*i)) for i in rng]
+
+            look_set_100 = \
+                [(randint(0, 100*i, 50*i), randint(0*i, 100*i, 50*i)) for i in rng]
+
+            look_sets = []
+            look_sets.append(('25% overlap', look_set_25))
+            look_sets.append(('50% overlap', look_set_50))
+            look_sets.append(('75% overlap', look_set_75))
+            look_sets.append(('100% overlap', look_set_100))
+
+            x_axis = [100*i for i in rng]
+            run_bench(to_time, repeat, look_sets, x_axis, linestyle='-')
+
+
+        def from_ordered(exp_range, repeat):
+            rng = [2**x for x in exp_range]
+
+            # 25% overlap
+            look_set_25 = \
+                [(sorted(randint(0, 100*i, 50*i)), randint(75*i, 175*i, 50*i)) for i in rng]
+
+            look_set_50 = \
+                [(sorted(randint(0, 100*i, 50*i)), randint(50*i, 150*i, 50*i)) for i in rng]
+
+            look_set_75 = \
+                [(sorted(randint(0, 100*i, 50*i)), randint(25*i, 125*i, 50*i)) for i in rng]
+
+            look_set_100 = \
+                [(sorted(randint(0, 100*i, 50*i)), randint(0*i, 100*i, 50*i)) for i in rng]
+
+            look_sets = []
+            look_sets.append(('25% overlap, for-ordered', look_set_25))
+            look_sets.append(('50% overlap, for-ordered', look_set_50))
+            look_sets.append(('75% overlap, for-ordered', look_set_75))
+            look_sets.append(('100% overlap, for-ordered', look_set_100))
+
+            x_axis = [100*i for i in rng]
+            run_bench(to_time, repeat, look_sets, x_axis, linestyle='-.')
+
+
+        def both_ordered(exp_range, repeat):
+            rng = [2**x for x in exp_range]
+
+            # 25% overlap
+            look_set_25 = \
+                [(sorted(randint(0, 100*i, 50*i)), sorted(randint(75*i, 175*i, 50*i))) for i in rng]
+
+            look_set_50 = \
+                [(sorted(randint(0, 100*i, 50*i)), sorted(randint(50*i, 150*i, 50*i))) for i in rng]
+
+            look_set_75 = \
+                [(sorted(randint(0, 100*i, 50*i)), sorted(randint(25*i, 125*i, 50*i))) for i in rng]
+
+            look_set_100 = \
+                [(sorted(randint(0, 100*i, 50*i)), sorted(randint(0*i, 100*i, 50*i))) for i in rng]
+
+            look_sets = []
+            look_sets.append(('25% overlap, both-ordered', look_set_25))
+            look_sets.append(('50% overlap, both-ordered', look_set_50))
+            look_sets.append(('75% overlap, both-ordered', look_set_75))
+            look_sets.append(('100% overlap, both-ordered', look_set_100))
+
+            x_axis = [100*i for i in rng]
+            run_bench(to_time, repeat, look_sets, x_axis, linestyle=':')
+
+
+        plt.figure(figsize=(32, 24))
+        unordered(range(1, 10), 10)
+        from_ordered(range(1, 10), 10)
+        both_ordered(range(1, 10), 10)
+        save_plot('existence-perf-small.png', ': Small')
+
+        plt.figure(figsize=(32, 24))
+        unordered(range(10, 15), 3)
+        from_ordered(range(10, 15), 3)
+        both_ordered(range(10, 15), 3)
+        save_plot('existence-perf-large.png', ': Large')
+
+        plt.figure(figsize=(16, 12))
+        unordered(range(1, 10), 10)
+        save_plot('existence-perf-unordered-small.png', ': Unordered Small')
+
+        plt.figure(figsize=(16, 12))
+        from_ordered(range(1, 10), 10)
+        save_plot('existence-perf-from-ordered-small.png', ': From-Ordered Small')
+
+        plt.figure(figsize=(16, 12))
+        both_ordered(range(1, 10), 10)
+        save_plot('existence-perf-both-ordered-small.png', ': Both-Ordered Small')
+
+        plt.figure(figsize=(16, 12))
+        unordered(range(10, 15), 3)
+        save_plot('existence-perf-unordered-large.png', ': Unordered Large')
+
+        plt.figure(figsize=(16, 12))
+        from_ordered(range(10, 15), 3)
+        save_plot('existence-perf-from-ordered-large.png', ': From-Ordered Large')
+
+        plt.figure(figsize=(16, 12))
+        both_ordered(range(10, 15), 3)
+        save_plot('existence-perf-both-ordered-large.png', ': Both-Ordered Large')
\ No newline at end of file
diff --git a/doc/source/_static/existence-perf-large.png b/doc/source/_static/existence-perf-large.png
new file mode 100644
index 0000000000000..5c0766a2afb3c
Binary files /dev/null and b/doc/source/_static/existence-perf-large.png differ
diff --git a/doc/source/_static/existence-perf-small.png b/doc/source/_static/existence-perf-small.png
new file mode 100644
index 0000000000000..6150cc47037a5
Binary files /dev/null and b/doc/source/_static/existence-perf-small.png differ
diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst
index 00c76632ce17b..554543c0f51df 100644
--- a/doc/source/enhancingperf.rst
+++ b/doc/source/enhancingperf.rst
@@ -668,3 +668,171 @@ In general, :meth:`DataFrame.query`/:func:`pandas.eval` will
 evaluate the subexpressions that *can* be evaluated by ``numexpr`` and those
 that must be evaluated in Python space transparently to the user. This is done
 by inferring the result type of an expression from its arguments and operators.
+
+Existence (IsIn, Inner Join, Dict/Hash, Query)
+----------------------------------------------------
+
+Existence testing checks whether each item in one collection is present in
+another collection. In the case of a ``DataFrame``, we test each value of a
+column (or of the index) for membership in another collection of items.
+
+There are a number of ways to test for existence with pandas; the methods below
+are a few of them. In the snippets, ``lst`` is a python ``list``, ``dct`` a
+``dict``, ``series`` a ``Series`` holding the values to look up, and ``ID`` is a
+column of the ``DataFrame`` ``df``. The comments correspond to the legend in the
+plots further down.
+
+
+:meth:`DataFrame.isin`
+
+.. code-block:: python
+
+   # isin_list
+   df[df.index.isin(lst)]
+   # isin_dict
+   df[df.index.isin(dct)]
+   # isin_series
+   df[df.index.isin(series)]
+
+
+:meth:`DataFrame.query`
+
+.. code-block:: python
+
+   # The '@' symbol is used with `query` to reference local variables. Names
+   # without '@' will reference the DataFrame's columns or index.
+
+   # query_in list
+   df.query('index in @lst')
+   # query_in Series
+   df.query('index in @series')
+
+   # A list can be used with `query('.. == ..')` to test for existence,
+   # but other data structures such as a `pandas.Series` behave differently.
+
+   df.query('index == @lst')
+
+
+:meth:`Series.apply`
+
+.. code-block:: python
+
+   df[df.ID.apply(lambda x: x in lst)]
+
+
+:meth:`DataFrame.join`
+
+.. code-block:: python
+
+   # join
+   df.join(series, how='inner')
+
+   # this can actually be fast for small DataFrames
+   df[[x in dct for x in df.index]]
+
+   # isin_series, query_in Series, pydict,
+   # join and isin_list are included in the plots below.
+
+
+As seen below, using a ``Series`` is generally better than using pure python
+data structures for anything larger than very small datasets of around 1000
+records. The two fastest approaches are ``join(series)`` and ``isin(series)``;
+first, ``join``:
+
+.. code-block:: python
+
+   lst = range(1000000)
+   series = Series(lst, name='data')
+
+   df = DataFrame(lst, columns=['ID'])
+
+   df.join(series, how='inner')
+   # 100 loops, best of 3: 19.2 ms per loop
+
+list vs Series:
+
+.. code-block:: python
+
+   df[df.index.isin(lst)]
+   # 1 loops, best of 3: 1.06 s per loop
+
+   df[df.index.isin(series)]
+   # 1 loops, best of 3: 477 ms per loop
+
+Testing against ``df.index`` vs. a column makes no difference here:
+
+.. code-block:: python
+
+   df[df.ID.isin(series)]
+   # 1 loops, best of 3: 474 ms per loop
+
+   df[df.index.isin(series)]
+   # 1 loops, best of 3: 475 ms per loop
+
+The ``query`` ``'in'`` syntax has roughly the same performance as ``isin``:
+
+.. code-block:: python
+
+   df.query('index in @lst')
+   # 1 loops, best of 3: 1.04 s per loop
+
+   df.query('index in @series')
+   # 1 loops, best of 3: 451 ms per loop
+
+   df.query('index == @lst')
+   # 1 loops, best of 3: 1.03 s per loop
+
+
+For ``join``, the values being tested must be the index of the ``DataFrame`` and
+the index of the ``Series`` for the best performance, and the ``Series`` must
+have a ``name``. ``join`` defaults to a left join, so we need to specify
+``how='inner'`` to get existence semantics.
+
+.. code-block:: python
+
+   df.join(series, how='inner')
+   # 100 loops, best of 3: 19.7 ms per loop
+
+Smaller datasets:
+
+.. code-block:: python
+
+   df = DataFrame([1,2,3,4], columns=['ID'])
+   lst = range(10000)
+   dct = dict(zip(lst, lst))
+   series = Series(lst, name='data')
+
+   df.join(series, how='inner')
+   # 1000 loops, best of 3: 866 us per loop
+
+   df[df.ID.isin(dct)]
+   # 1000 loops, best of 3: 809 us per loop
+
+   df[df.ID.isin(lst)]
+   # 1000 loops, best of 3: 853 us per loop
+
+   df[df.ID.isin(series)]
+   # 100 loops, best of 3: 2.22 ms per loop
+
+For such small cases it is actually faster to use a list comprehension or
+``apply``:
+
+.. code-block:: python
+
+   df[[x in dct for x in df.ID]]
+   # 1000 loops, best of 3: 266 us per loop
+
+   df[df.ID.apply(lambda x: x in dct)]
+   # 1000 loops, best of 3: 364 us per loop
+
+
+Here is a visualization of some of the benchmarks above. You can see that,
+except for very small datasets, ``isin(Series)`` and ``join(Series)`` quickly
+become faster than the pure python data structures.
+
+.. image:: _static/existence-perf-small.png
+
+However, ``isin(Series)`` still scales relatively poorly, whereas ``join`` stays
+fast even for large datasets. There is some overhead in making the data the
+index of both the left and right datasets, but that cost is clearly outweighed
+by the gains of the join itself. For extremely large datasets you may start
+running into memory limits, since ``join`` does not perform any disk-based
+chunking.
+
+.. image:: _static/existence-perf-large.png
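+
+To make the index-setup overhead concrete, here is a minimal sketch of preparing
+both sides for an index-on-index inner join. The column and variable names
+(``ids``, ``look_in``) are illustrative only and are not part of the benchmark
+script above.
+
+.. code-block:: python
+
+   import numpy as np
+   from pandas import DataFrame, Series
+
+   # hypothetical data: 'ids' is the column whose values we test for existence
+   df = DataFrame({'ids': np.random.randint(0, 1000000, 500000)})
+   look_in = np.random.randint(500000, 1500000, 500000)
+
+   # the overhead: move the tested values into the index on both sides
+   left = df.set_index('ids', drop=False)
+   right = Series(look_in, index=look_in, name='look_in').drop_duplicates()
+
+   # an inner join on the two indexes keeps only the rows whose id
+   # also occurs in 'look_in'
+   exists = left.join(right, how='inner')
+
+Dropping duplicates on the right-hand side keeps the inner join from multiplying
+matching rows.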