Skip to content

Commit

Permalink
Merge pull request #350 from Yelp/norases_335_create_epsilon_first_ba…
Browse files Browse the repository at this point in the history
…ndit_endpoint

Implement Epsilon-First bandit endpoint
  • Loading branch information
norases committed Aug 1, 2014
2 parents 1672b4c + ec4c694 commit 902b21e
Show file tree
Hide file tree
Showing 18 changed files with 454 additions and 64 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
* Features

* Added multi-armed bandit endpoint. (#255)
* Implemented epsilon-greedy.
* Implemented epsilon-greedy. (#255)
* Implemented epsilon-first. (#335)
* Added support for the L-BFGS-B optimizer. (#296)

* Changes
Expand Down
3 changes: 1 addition & 2 deletions docs/bandit.rst
Original file line number Diff line number Diff line change
Expand Up @@ -59,12 +59,12 @@ There are many different policies for this problem:

We have implemented the following policies in our package:

* :mod:`~moe.bandit.epsilon_first.EpsilonFirst`
* :mod:`~moe.bandit.epsilon_greedy.EpsilonGreedy`

Other policies include:

* Weighted random choice
* `Epsilon-first`_
* `Epsilon-decreasing`_ \*
* `UCB-exp (Upper Confidence Bound)`_ \*
* `UCB-tuned`_ \*
Expand All @@ -73,7 +73,6 @@ Other policies include:

\* Regret bounded as :math:`t \rightarrow \infty`

.. _Epsilon-first: http://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
.. _Epsilon-decreasing: http://en.wikipedia.org/wiki/Multi-armed_bandit#Semi-uniform_strategies
.. _UCB-exp (Upper Confidence Bound): http://moodle.technion.ac.il/pluginfile.php/192340/mod_resource/content/0/UCB.pdf
.. _UCB-tuned: http://moodle.technion.ac.il/pluginfile.php/192340/mod_resource/content/0/UCB.pdf
Expand Down
2 changes: 2 additions & 0 deletions moe/bandit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
* :mod:`moe.bandit.constant`: some default configuration values for ``optimal_learning`` components
* :mod:`moe.bandit.data_containers`: :class:`~moe.bandit.data_containers.SampleArm`
and :class:`~moe.bandit.data_containers.HistoricalData` containers for passing data to the ``bandit`` library
* :mod:`moe.bandit.epsilon_first`: :class:`~moe.bandit.epsilon_first.EpsilonFirst`
object for allocating bandit arms and choosing the winning arm based on epsilon-first policy.
* :mod:`moe.bandit.epsilon_greedy`: :class:`~moe.bandit.epsilon_greedy.EpsilonGreedy`
object for allocating bandit arms and choosing the winning arm based on epsilon-greedy policy.
* :mod:`moe.bandit.epsilon`: a base :class:`~moe.bandit.epsilon.Epsilon`
Expand Down
13 changes: 11 additions & 2 deletions moe/bandit/constant.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,19 @@
}
}

DEFAULT_EPSILON = 0.05

# Epsilon subtypes: identifiers for the supported epsilon bandit policies.
EPSILON_SUBTYPE_FIRST = 'first'    # epsilon-first: explore for the first epsilon*T trials, then exploit
EPSILON_SUBTYPE_GREEDY = 'greedy'  # epsilon-greedy: explore with probability epsilon on every trial
# All recognized epsilon subtypes (used to validate/enumerate policies).
EPSILON_SUBTYPES = [
    EPSILON_SUBTYPE_FIRST,
    EPSILON_SUBTYPE_GREEDY,
]

# Default Hyperparameters
DEFAULT_EPSILON = 0.05       # default exploration fraction (epsilon-first) / probability (epsilon-greedy)
DEFAULT_TOTAL_SAMPLES = 100  # default total number of trials T; used by epsilon-first only
# Map each epsilon subtype to the default values of its hyperparameters.
EPSILON_SUBTYPES_TO_DEFAULT_HYPERPARAMETER_INFOS = {
    EPSILON_SUBTYPE_FIRST: {'epsilon': DEFAULT_EPSILON,
                            'total_samples': DEFAULT_TOTAL_SAMPLES},
    EPSILON_SUBTYPE_GREEDY: {'epsilon': DEFAULT_EPSILON},
}
38 changes: 35 additions & 3 deletions moe/bandit/epsilon.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,17 @@
See :class:`moe.bandit.interfaces.bandit_interface` for further details on bandit.
"""

import copy

import numpy

from moe.bandit.constant import DEFAULT_EPSILON
from moe.bandit.interfaces.bandit_interface import BanditInterface


class Epsilon(BanditInterface):

r"""Implementation of the constructor of Epsilon. Abstract method allocate_arms implemented in subclass.
r"""Implementation of the constructor and common methods of Epsilon. Abstract method allocate_arms implemented in subclass.
A class to encapsulate the computation of bandit epsilon.
Epsilon is the sole hyperparameter in this class. Subclasses may contain other hyperparameters.
Expand All @@ -31,7 +32,7 @@ def __init__(
"""Construct an Epsilon object.
:param historical_info: a dictionary of arms sampled
:type historical_info: dictionary of (String(), SingleArm()) pairs
:type historical_info: dictionary of (String(), SampleArm()) pairs (see :class:`moe.bandit.data_containers.SampleArm` for more details)
:param subtype: subtype of the epsilon bandit algorithm (default: None)
:type subtype: String()
:param epsilon: epsilon hyperparameter for the epsilon bandit algorithm (default: :const:`~moe.bandit.constant.DEFAULT_EPSILON`)
Expand All @@ -41,3 +42,34 @@ def __init__(
self._historical_info = copy.deepcopy(historical_info)
self._subtype = subtype
self._epsilon = epsilon

@staticmethod
def get_winning_arm_names(arms_sampled):
    r"""Compute the set of winning arm names based on the given ``arms_sampled``.

    An arm "wins" when its average payoff, ``(win - loss) / total``, ties the best
    average payoff over all sampled arms; an arm never pulled (``total == 0``)
    counts as payoff 0.

    Implementers of this interface will never override this method.

    :param arms_sampled: a dictionary of arm name to :class:`moe.bandit.data_containers.SampleArm`
    :type arms_sampled: dictionary of (String(), SampleArm()) pairs
    :return: the set of names of the winning arms
    :rtype: frozenset(String())
    :raise: ValueError when ``arms_sampled`` is empty.

    """
    if not arms_sampled:
        raise ValueError('arms_sampled is empty!')

    # Average payoff per arm; arms never pulled default to a payoff of 0.
    avg_payoff_arm_name_list = []
    for arm_name, sampled_arm in arms_sampled.items():
        avg_payoff = numpy.float64(sampled_arm.win - sampled_arm.loss) / sampled_arm.total if sampled_arm.total > 0 else 0
        avg_payoff_arm_name_list.append((avg_payoff, arm_name))

    # The result is an (unordered) frozenset, so a single max() pass suffices;
    # no need to sort the whole list first.
    best_payoff, _ = max(avg_payoff_arm_name_list)
    # Keep every arm whose average payoff ties the best payoff.
    return frozenset(arm_name for avg_payoff, arm_name in avg_payoff_arm_name_list if avg_payoff == best_payoff)
118 changes: 118 additions & 0 deletions moe/bandit/epsilon_first.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# -*- coding: utf-8 -*-
"""Classes (Python) to compute the Bandit Epsilon-First arm allocation and choosing the arm to pull next.
See :class:`moe.bandit.epsilon.Epsilon` for further details on bandit.
"""
from moe.bandit.constant import DEFAULT_EPSILON, DEFAULT_TOTAL_SAMPLES, EPSILON_SUBTYPE_FIRST
from moe.bandit.epsilon import Epsilon


class EpsilonFirst(Epsilon):

    r"""Implementation of EpsilonFirst.

    A class to encapsulate the computation of bandit epsilon first.

    total_samples is the total number of samples (number to sample + number sampled)
    number sampled is calculated by summing up total from each arm sampled.
    total_samples is T from :doc:`bandit`.

    See superclass :class:`moe.bandit.epsilon.Epsilon` for further details.

    """

    def __init__(
            self,
            historical_info,
            epsilon=DEFAULT_EPSILON,
            total_samples=DEFAULT_TOTAL_SAMPLES,
    ):
        """Construct an EpsilonFirst object. See superclass :class:`moe.bandit.epsilon.Epsilon` for details.

        total_samples is the total number of samples (number to sample + number sampled)
        number sampled is calculated by summing up total from each arm sampled.
        total_samples is T from :doc:`bandit`.

        """
        super(EpsilonFirst, self).__init__(
            historical_info=historical_info,
            subtype=EPSILON_SUBTYPE_FIRST,
            epsilon=epsilon,
        )
        self._total_samples = total_samples

    def allocate_arms(self):
        r"""Compute the allocation to each arm given ``historical_info``, running bandit ``subtype`` endpoint with hyperparameters in ``hyperparameter_info``.

        Computes the allocation to each arm based on the given subtype, historical info, and hyperparameter info.

        Works with k-armed bandits (k >= 1).

        The Algorithm: http://en.wikipedia.org/wiki/Multi-armed_bandit#Approximate_solutions

        This method starts with a pure exploration phase, followed by a pure exploitation phase.
        If we have a total of T trials, the first :math:`\epsilon` T trials, we only explore.
        After that, we only exploit (t = :math:`\epsilon` T, :math:`\epsilon` T + 1, ..., T).

        This method will pull a random arm in the exploration phase.
        Then this method will pull the optimal arm (best expected return) in the exploitation phase.

        In case of a tie in the exploitation phase, the method will split the allocation among the optimal arms.

        For example, if we have three arms, two arms (arm1 and arm2) with an average payoff of 0.5
        (``{win:10, lose:10, total:20}``)
        and a new arm (arm3, average payoff is 0 and total is 0).

        Let the epsilon :math:`\epsilon` be 0.1.

        The allocation depends on which phase we are in:

        *Case 1: T = 50*

        Recall that T = number to sample + number sampled. number sampled :math:`= 20 + 20 + 0 = 40`.
        So we are on trial #41. We explore the first :math:`\epsilon T = 0.1 * 50 = 5` trials
        and thus we are in the exploitation phase. We split the allocation between the optimal arms arm1 and arm2.

        ``{arm1: 0.5, arm2: 0.5, arm3: 0.0}``

        *Case 2: T = 500*

        We explore the first :math:`\epsilon T = 0.1 * 500 = 50` trials.
        Since we are on trial #41, we are in the exploration phase. We choose arms randomly:

        ``{arm1: 0.33, arm2: 0.33, arm3: 0.33}``

        :return: the dictionary of (arm, allocation) key-value pairs
        :rtype: a dictionary of (String(), float64) pairs
        :raise: ValueError when ``sample_arms`` are empty.

        """
        arms_sampled = self._historical_info.arms_sampled
        num_arms = self._historical_info.num_arms

        if not arms_sampled:
            raise ValueError('sample_arms is empty!')

        # Number of trials already taken = sum of ``total`` over all arms.
        num_sampled = sum(sampled_arm.total for sampled_arm in arms_sampled.values())

        # Exploration phase, trials 1, 2, ..., epsilon * T:
        # allocate equal probability to all arms.
        if num_sampled < self._total_samples * self._epsilon:
            equal_allocation = 1.0 / num_arms
            return {arm_name: equal_allocation for arm_name in arms_sampled}

        # Exploitation phase, trials epsilon * T + 1, ..., T:
        # split allocation among winning arms, all other arms get allocation of 0.
        winning_arm_names = self.get_winning_arm_names(arms_sampled)
        winning_arm_allocation = 1.0 / len(winning_arm_names)

        return {
            arm_name: winning_arm_allocation if arm_name in winning_arm_names else 0.0
            for arm_name in arms_sampled
        }
21 changes: 6 additions & 15 deletions moe/bandit/epsilon_greedy.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@
See :class:`moe.bandit.epsilon.Epsilon` for further details on this bandit.
"""
import numpy

from moe.bandit.constant import DEFAULT_EPSILON, EPSILON_SUBTYPE_GREEDY
from moe.bandit.epsilon import Epsilon

Expand Down Expand Up @@ -62,24 +60,17 @@ def allocate_arms(self):
:return: the dictionary of (arm, allocation) key-value pairs
:rtype: a dictionary of (String(), float64) pairs
:raise: ValueError when ``sample_arms`` are empty.
"""
arms_sampled = self._historical_info.arms_sampled
num_arms = self._historical_info.num_arms
if not arms_sampled:
raise ValueError('sample_arms are empty!')
avg_payoff_arm_name_list = []
for arm_name, sampled_arm in arms_sampled.iteritems():
avg_payoff = numpy.float64(sampled_arm.win - sampled_arm.loss) / sampled_arm.total if sampled_arm.total > 0 else 0
avg_payoff_arm_name_list.append((avg_payoff, arm_name))

best_payoff, _ = max(avg_payoff_arm_name_list)
# Filter out arms that have average payoff less than the best payoff
winning_arm_payoff_name_list = filter(lambda avg_payoff_arm_name: avg_payoff_arm_name[0] == best_payoff, avg_payoff_arm_name_list)
# Extract a list of winning arm names from a list of (average payoff, arm name) tuples.
_, winning_arm_name_list = map(list, zip(*winning_arm_payoff_name_list))

num_winning_arms = len(winning_arm_name_list)

winning_arm_names = self.get_winning_arm_names(arms_sampled)

num_winning_arms = len(winning_arm_names)
epsilon_allocation = self._epsilon / num_arms
arms_to_allocations = {}

Expand All @@ -89,7 +80,7 @@ def allocate_arms(self):

# With probability 1-epsilon, split allocation among winning arms.
winning_arm_allocation = (1.0 - self._epsilon) / num_winning_arms
for winning_arm_name in winning_arm_name_list:
for winning_arm_name in winning_arm_names:
arms_to_allocations[winning_arm_name] += winning_arm_allocation

return arms_to_allocations
7 changes: 6 additions & 1 deletion moe/bandit/linkers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
"""Links between the implementations of bandit algorithms."""
from collections import namedtuple

from moe.bandit.constant import EPSILON_SUBTYPE_GREEDY
from moe.bandit.constant import EPSILON_SUBTYPE_FIRST, EPSILON_SUBTYPE_GREEDY
from moe.bandit.epsilon_first import EpsilonFirst
from moe.bandit.epsilon_greedy import EpsilonGreedy

# Epsilon
Expand All @@ -16,6 +17,10 @@


EPSILON_SUBTYPES_TO_EPSILON_METHODS = {
EPSILON_SUBTYPE_FIRST: EpsilonMethod(
subtype=EPSILON_SUBTYPE_FIRST,
bandit_class=EpsilonFirst,
),
EPSILON_SUBTYPE_GREEDY: EpsilonMethod(
subtype=EPSILON_SUBTYPE_GREEDY,
bandit_class=EpsilonGreedy,
Expand Down
2 changes: 2 additions & 0 deletions moe/tests/bandit/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@
**Files in this package**
* :mod:`moe.tests.bandit.bandit_test_case`: base test case for bandit tests with a simple integration test case
* :mod:`moe.tests.bandit.epsilon_first_test`: tests for :mod:`moe.bandit.epsilon_first.EpsilonFirst`
* :mod:`moe.tests.bandit.epsilon_greedy_test`: tests for :mod:`moe.bandit.epsilon_greedy.EpsilonGreedy`
* :mod:`moe.tests.bandit.epsilon_test`: tests for :mod:`moe.bandit.epsilon.Epsilon`
* :mod:`moe.tests.bandit.epsilon_test_case`: test cases for classes under :mod:`moe.bandit.epsilon.Epsilon`
* :mod:`moe.tests.bandit.linkers_test`: tests for :mod:`moe.bandit.linkers`
Expand Down
Loading

0 comments on commit 902b21e

Please sign in to comment.