Implement swarm testing and use it for rule based stateful tests #2238

Merged: 12 commits, Nov 28, 2019
8 changes: 8 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,8 @@
RELEASE_TYPE: minor

This release significantly improves the data distribution in :ref:`rule based stateful testing <stateful_testing>`,
by using a technique called `Swarm Testing (Groce, Alex, et al. "Swarm testing."
Proceedings of the 2012 International Symposium on Software Testing and Analysis. ACM, 2012.) <https://agroce.github.io/issta12.pdf>`_
to select which rules are run in any given test case. This should allow it to find many issues that it would previously have missed.

This change is likely to be especially beneficial for stateful tests with large numbers of rules.
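
For illustration only (a hedged sketch, not part of this release note; it parallels the ``CanSwarm`` test added later in this PR, and all names below are hypothetical): a machine of the shape this change helps. With uniform rule selection the ``clear`` rule fires so often that the assertion in ``add`` essentially never fails, whereas swarm testing can disable ``clear`` for an entire run.

# Hedged sketch, not part of the diff: a hypothetical machine that benefits from swarm testing.
from hypothesis import strategies as st
from hypothesis.stateful import RuleBasedStateMachine, rule


class GrowsUnlessCleared(RuleBasedStateMachine):
    def __init__(self):
        super(GrowsUnlessCleared, self).__init__()
        self.items = set()

    @rule(x=st.integers(0, 255))
    def add(self, x):
        self.items.add(x)
        # Only fails after 21 distinct values have accumulated without a clear.
        assert len(self.items) <= 20

    @rule()
    def clear(self):
        # Under uniform rule selection this undoes progress so often that the
        # assertion above is essentially never violated; swarm testing can
        # switch this rule off for a whole run.
        self.items.clear()


TestGrowsUnlessCleared = GrowsUnlessCleared.TestCase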
6 changes: 6 additions & 0 deletions hypothesis-python/src/hypothesis/core.py
@@ -758,6 +758,12 @@ def run_engine(self):
report("".join(traceback.format_exception(type(e), e, tb)))

finally: # pragma: no cover
# Mostly useful for ``find`` and ensuring that objects that
# hold on to a reference to ``data`` know that it's now been
# finished and they shouldn't attempt to draw more data from
# it.
ran_example.freeze()

# This section is in fact entirely covered by the tests in
# test_reproduce_failure, but it seems to trigger a lovely set
# of coverage bugs: The branches show up as uncovered (despite
129 changes: 129 additions & 0 deletions hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
@@ -0,0 +1,129 @@
# coding=utf-8
#
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2019 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

from __future__ import absolute_import, division, print_function

import hypothesis.internal.conjecture.utils as cu
from hypothesis.searchstrategy.strategies import SearchStrategy

FEATURE_LABEL = cu.calc_label_from_name("feature flag")


class FeatureFlags(object):
"""Object that can be used to control a number of feature flags for a
given test run.

This enables an approach to data generation called swarm testing (
see Groce, Alex, et al. "Swarm testing." Proceedings of the 2012
International Symposium on Software Testing and Analysis. ACM, 2012), in
which generation is biased by selectively turning some features off for
each test case generated. When there are many interacting features this can
find bugs that a pure generation strategy would otherwise have missed.

FeatureFlags are designed to "shrink open", so that during shrinking they
become less restrictive. This allows us to potentially shrink to smaller
test cases that were forbidden during the generation phase because they
required disabled features.
"""

def __init__(self, data=None, enabled=(), disabled=()):
self.__data = data
self.__decisions = {}

for f in enabled:
self.__decisions[f] = 0

for f in disabled:
self.__decisions[f] = 255

# In the original swarm testing paper they turn features on or off
# uniformly at random. Instead we decide the probability with which to
# enable features up front. This can allow for scenarios where all or
# no features are enabled, which are vanishingly unlikely in the
# original model.
#
# We implement this as a single 8-bit integer and enable features which
# score >= that value. In particular when self.__baseline is 0, all
# features will be enabled. This is so that we shrink in the direction
# of more features being enabled.
if self.__data is not None:
self.__baseline = data.draw_bits(8)
else:
# If data is None we're in example mode so all that matters is the
# enabled/disabled lists above. We set this up so that the explicit
# enabled/disabled decisions above are honoured and any other feature
# is treated as enabled.
self.__baseline = 1

def is_enabled(self, name):
"""Tests whether the feature named ``name`` should be enabled on this
test run."""
if self.__data is None or self.__data.frozen:
# Feature set objects might hang around after data generation has
# finished. If this happens then we just report all new features as
# enabled, because that's our shrinking direction and they have no
# impact on data generation if they weren't used while it was
# running.
try:
return self.__is_value_enabled(self.__decisions[name])
except KeyError:
return True

data = self.__data

data.start_example(label=FEATURE_LABEL)
if name in self.__decisions:
# If we've already decided on this feature then we don't actually
# need to draw anything, but we do write the same decision to the
# input stream. This allows us to lazily decide whether a feature
# is enabled, because it means that if we happen to delete the part
# of the test case where we originally decided, the next point at
# which we make this decision just makes the decision it previously
# made.
value = self.__decisions[name]
data.draw_bits(8, forced=value)
else:
# If the baseline is 0 then everything is enabled so it doesn't
# matter what we have here and we might as well make the shrinker's
# life easier by forcing it to zero.
if self.__baseline == 0:
value = 0
data.draw_bits(8, forced=0)
else:
value = data.draw_bits(8)
self.__decisions[name] = value
data.stop_example()
return self.__is_value_enabled(value)

def __is_value_enabled(self, value):
"""Check if a given value drawn for a feature counts as enabled. Note
that low values are more likely to be enabled. This is again in aid of
shrinking open. In particular a value of 255 is always enabled."""
return (255 - value) >= self.__baseline

def __repr__(self):
enabled = []
disabled = []
for k, v in self.__decisions.items():
if self.__is_value_enabled(v):
enabled.append(k)
else:
disabled.append(k)
return "FeatureFlags(enabled=%r, disabled=%r)" % (enabled, disabled)


class FeatureStrategy(SearchStrategy):
def do_draw(self, data):
return FeatureFlags(data)
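
A hedged, standalone restatement of the arithmetic in ``__is_value_enabled`` above (illustrative only, not part of the diff): both the per-run baseline and each feature's value are 8-bit draws, and a feature counts as enabled when ``255 - value >= baseline``. A value of 0 is therefore always enabled, and a baseline of 0 enables everything, which is the "shrink open" direction.

# Illustrative only: mirrors the decision rule used by FeatureFlags above.
def value_enabled(value, baseline):
    # Both arguments are 8-bit integers drawn from the bytestream.
    return (255 - value) >= baseline


assert value_enabled(0, 255)       # a value of 0 is enabled even at the largest baseline
assert value_enabled(200, 0)       # a baseline of 0 enables every feature
assert not value_enabled(255, 1)   # a value of 255 is disabled for any nonzero baseline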
22 changes: 21 additions & 1 deletion hypothesis-python/src/hypothesis/stateful.py
@@ -47,6 +47,7 @@
from hypothesis.internal.reflection import function_digest, nicerepr, proxies, qualname
from hypothesis.internal.validation import check_type
from hypothesis.reporting import current_verbosity, report
from hypothesis.searchstrategy.featureflags import FeatureStrategy
from hypothesis.searchstrategy.strategies import OneOfStrategy, SearchStrategy
from hypothesis.vendor.pretty import CUnicodeIO, RepresentationPrinter

@@ -610,6 +611,10 @@ def __init__(self, machine):
self.machine = machine
self.rules = list(machine.rules())

self.enabled_rules_strategy = st.shared(
FeatureStrategy(), key=("enabled rules", machine),
)

# The order is a bit arbitrary. Primarily we're trying to group rules
# that write to the same location together, and to put rules with no
# target first as they have less effect on the structure. We order from
@@ -635,12 +640,27 @@ def do_draw(self, data):
if not any(self.is_valid(rule) for rule in self.rules):
msg = u"No progress can be made from state %r" % (self.machine,)
quiet_raise(InvalidDefinition(msg))
rule = data.draw(st.sampled_from(self.rules).filter(self.is_valid))

feature_flags = data.draw(self.enabled_rules_strategy)

# Note: The order of the filters here is actually quite important,
# because checking is_enabled makes choices, so increases the size of
# the choice sequence. This means that if we are in a case where many
# rules are invalid we will make a lot more choices if we ask if they
# are enabled before we ask if they are valid, so our test cases will
# be artificially large.
rule = data.draw(
st.sampled_from(self.rules)
.filter(self.is_valid)
.filter(lambda r: feature_flags.is_enabled(r.function.__name__))
)

return (rule, data.draw(rule.arguments_strategy))

def is_valid(self, rule):
if rule.precondition and not rule.precondition(self.machine):
return False

for b in rule.bundles:
bundle = self.machine.bundle(b.name)
if not bundle:
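
For context, a hedged standalone sketch (not part of this diff) of why ``st.shared`` is used for the flags: drawing a shared strategy with the same key more than once within a single test case returns the same value, so every step of a run consults one consistent ``FeatureFlags`` object. The key below is hypothetical; the PR keys on ``("enabled rules", machine)``.

from hypothesis import given, strategies as st


@given(st.data())
def test_shared_draws_agree_within_one_run(data):
    key = "enabled rules demo"  # hypothetical key for illustration
    first = data.draw(st.shared(st.integers(), key=key))
    second = data.draw(st.shared(st.integers(), key=key))
    # Both draws within this test case see the same shared value.
    assert first == second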
84 changes: 84 additions & 0 deletions hypothesis-python/tests/cover/test_feature_flags.py
@@ -0,0 +1,84 @@
# coding=utf-8
#
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Most of this work is copyright (C) 2013-2019 David R. MacIver
# (david@drmaciver.com), but it contains contributions by others. See
# CONTRIBUTING.rst for a full list of people who may hold copyright, and
# consult the git log if you need to determine who owns an individual
# contribution.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
#
# END HEADER

from __future__ import absolute_import, division, print_function

from hypothesis import given, strategies as st
from hypothesis.internal.compat import hrange
from hypothesis.searchstrategy.featureflags import FeatureFlags, FeatureStrategy
from tests.common.debug import find_any, minimal

STRAT = FeatureStrategy()


def test_can_all_be_enabled():
find_any(STRAT, lambda x: all(x.is_enabled(i) for i in hrange(100)))


def test_can_all_be_disabled():
find_any(STRAT, lambda x: all(not x.is_enabled(i) for i in hrange(100)))


def test_minimizes_open():
features = hrange(10)

flags = minimal(STRAT, lambda x: [x.is_enabled(i) for i in features])

assert all(flags.is_enabled(i) for i in features)


def test_minimizes_individual_features_to_open():
features = list(hrange(10))

flags = minimal(
STRAT, lambda x: sum([x.is_enabled(i) for i in features]) < len(features)
)

assert all(flags.is_enabled(i) for i in features[:-1])
assert not flags.is_enabled(features[-1])


def test_marks_unknown_features_as_enabled():
x = find_any(STRAT, lambda v: True)

assert x.is_enabled("fish")


def test_by_default_all_enabled():
f = FeatureFlags()

assert f.is_enabled("foo")


@given(st.data())
def test_repr_can_be_evalled(data):
flags = data.draw(STRAT)

features = data.draw(st.lists(st.text(), unique=True))

for f in features:
flags.is_enabled(f)

flags2 = eval(repr(flags))

for f in features:
assert flags2.is_enabled(f) == flags.is_enabled(f)

more_features = data.draw(st.lists(st.text().filter(lambda s: s not in features)))

for f in more_features:
assert flags2.is_enabled(f)
34 changes: 32 additions & 2 deletions hypothesis-python/tests/cover/test_stateful.py
@@ -223,12 +223,42 @@ def fail(self, x, y):
assert False


class CanSwarm(RuleBasedStateMachine):
"""This test will essentially never pass if you choose rules uniformly at
random, because every time the snake rule fires we return to the beginning,
so we will tend to undo progress well before we make enough progress for
the test to fail.

This tests our swarm testing functionality in stateful testing by ensuring
that we can sometimes generate long runs of steps which exclude a
particular rule.
"""

def __init__(self):
super(CanSwarm, self).__init__()
self.seen = set()

# The reason this rule takes a parameter is that it ensures that we do not
# achieve "swarming" by by just restricting the alphabet for single byte
# decisions, which is a thing the underlying conjecture engine will
# happily do on its own without knowledge of the rule structure.
@rule(move=integers(0, 255))
def ladder(self, move):
self.seen.add(move)
assert len(self.seen) <= 15

@rule()
def snake(self):
self.seen.clear()


bad_machines = (
BalancedTrees,
DepthMachine,
RoseTreeStateMachine,
NotTheLastMachine,
PopulateMultipleTargets,
CanSwarm,
)

for m in bad_machines:
@@ -1164,7 +1194,7 @@ def oops(self):


def test_reproduce_failure_works():
@reproduce_failure(__version__, base64.b64encode(b"\0\0\0"))
@reproduce_failure(__version__, base64.b64encode(b"\0\0\0\0\0"))
class TrivialMachine(RuleBasedStateMachine):
@rule()
def oops(self):
@@ -1175,7 +1205,7 @@ def oops(self):


def test_reproduce_failure_fails_if_no_error():
@reproduce_failure(__version__, base64.b64encode(b"\0\0\0"))
@reproduce_failure(__version__, base64.b64encode(b"\0\0\0\0\0"))
class TrivialMachine(RuleBasedStateMachine):
@rule()
def ok(self):
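
As a rough, hedged back-of-the-envelope for the ``CanSwarm`` machine above (assuming roughly uniform 50/50 selection between its two always-valid rules and 256 possible moves): a failing run needs a ``snake``-free window of at least 16 ``ladder`` steps whose moves are all distinct, and even the shortest such window is very unlikely, whereas a run in which swarm testing disables ``snake`` fails as soon as 16 distinct moves have been drawn.

# Illustrative estimate only; not part of the test suite.
p_window_avoids_snake = 0.5 ** 16  # 16 rule choices in a row all pick `ladder`
p_moves_distinct = 1.0
for i in range(16):
    p_moves_distinct *= (256 - i) / 256.0  # all 16 drawn moves are distinct
print(p_window_avoids_snake * p_moves_distinct)  # roughly 1e-5 per 16-step window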
2 changes: 1 addition & 1 deletion hypothesis-python/tests/cover/test_statistical_events.py
@@ -234,4 +234,4 @@ def do(self, item):

def test_stateful_states_are_deduped():
stats = call_for_statistics(DemoStateMachine.TestCase().runTest)
assert len(stats.events) == 1
assert len(stats.events) <= 2