diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst new file mode 100644 index 0000000000..3ef07510e5 --- /dev/null +++ b/hypothesis-python/RELEASE.rst @@ -0,0 +1,8 @@ +RELEASE_TYPE: minor + +This release significantly improves the data distribution in rule-based stateful testing, +by using a technique called `Swarm Testing (Groce, Alex, et al. "Swarm testing." +Proceedings of the 2012 International Symposium on Software Testing and Analysis. ACM, 2012.) <https://agroce.github.io/issta12.pdf>`_ +to select which rules are run in any given test case. This should allow it to find many issues that it would previously have missed. + +This change is likely to be especially beneficial for stateful tests with large numbers of rules. diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py index 489db7a835..79fea63abf 100644 --- a/hypothesis-python/src/hypothesis/core.py +++ b/hypothesis-python/src/hypothesis/core.py @@ -758,6 +758,12 @@ def run_engine(self): report("".join(traceback.format_exception(type(e), e, tb))) finally: # pragma: no cover + # Mostly useful for ``find`` and ensuring that objects that + # hold on to a reference to ``data`` know that it's now been + # finished and they shouldn't attempt to draw more data from + # it. + ran_example.freeze() + # This section is in fact entirely covered by the tests in # test_reproduce_failure, but it seems to trigger a lovely set # of coverage bugs: The branches show up as uncovered (despite diff --git a/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py b/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py new file mode 100644 index 0000000000..f8ae129a1e --- /dev/null +++ b/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py @@ -0,0 +1,129 @@ +# coding=utf-8 +# +# This file is part of Hypothesis, which may be found at +# https://github.com/HypothesisWorks/hypothesis/ +# +# Most of this work is copyright (C) 2013-2019 David R. 
MacIver +# (david@drmaciver.com), but it contains contributions by others. See +# CONTRIBUTING.rst for a full list of people who may hold copyright, and +# consult the git log if you need to determine who owns an individual +# contribution. +# +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. +# +# END HEADER + +from __future__ import absolute_import, division, print_function + +import hypothesis.internal.conjecture.utils as cu +from hypothesis.searchstrategy.strategies import SearchStrategy + +FEATURE_LABEL = cu.calc_label_from_name("feature flag") + + +class FeatureFlags(object): + """Object that can be used to control a number of feature flags for a + given test run. + + This enables an approach to data generation called swarm testing ( + see Groce, Alex, et al. "Swarm testing." Proceedings of the 2012 + International Symposium on Software Testing and Analysis. ACM, 2012), in + which generation is biased by selectively turning some features off for + each test case generated. When there are many interacting features this can + find bugs that a pure generation strategy would otherwise have missed. + + FeatureFlags are designed to "shrink open", so that during shrinking they + become less restrictive. This allows us to potentially shrink to smaller + test cases that were forbidden during the generation phase because they + required disabled features. + """ + + def __init__(self, data=None, enabled=(), disabled=()): + self.__data = data + self.__decisions = {} + + for f in enabled: + self.__decisions[f] = 0 + + for f in disabled: + self.__decisions[f] = 255 + + # In the original swarm testing paper they turn features on or off + # uniformly at random. Instead we decide the probability with which to + # enable features up front. 
This can allow for scenarios where all or + # no features are enabled, which are vanishingly unlikely in the + # original model. + # + # We implement this as a single 8-bit integer and enable features which + # score >= that value. In particular when self.__baseline is 0, all + # features will be enabled. This is so that we shrink in the direction + # of more features being enabled. + if self.__data is not None: + self.__baseline = data.draw_bits(8) + else: + # If data is None we're in example mode so all that matters is the + # enabled/disabled lists above. Baseline 1 keeps 0 on and 255 off. + self.__baseline = 1 + + def is_enabled(self, name): + """Tests whether the feature named ``name`` should be enabled on this + test run.""" + if self.__data is None or self.__data.frozen: + # Feature set objects might hang around after data generation has + # finished. If this happens then we just report all new features as + # enabled, because that's our shrinking direction and they have no + # impact on data generation if they weren't used while it was + # running. + try: + return self.__is_value_enabled(self.__decisions[name]) + except KeyError: + return True + + data = self.__data + + data.start_example(label=FEATURE_LABEL) + if name in self.__decisions: + # If we've already decided on this feature then we don't actually + # need to draw anything, but we do write the same decision to the + # input stream. This allows us to lazily decide whether a feature + # is enabled, because it means that if we happen to delete the part + # of the test case where we originally decided, the next point at + # which we make this decision just makes the decision it previously + # made. + value = self.__decisions[name] + data.draw_bits(8, forced=value) + else: + # If the baseline is 0 then everything is enabled so it doesn't + # matter what we have here and we might as well make the shrinker's + # life easier by forcing it to zero. 
+ if self.__baseline == 0: + value = 0 + data.draw_bits(8, forced=0) + else: + value = data.draw_bits(8) + self.__decisions[name] = value + data.stop_example() + return self.__is_value_enabled(value) + + def __is_value_enabled(self, value): + """Check if a given value drawn for a feature counts as enabled. Note + that low values are more likely to be enabled. This is again in aid of + shrinking open. In particular a value of 0 is always enabled.""" + return (255 - value) >= self.__baseline + + def __repr__(self): + enabled = [] + disabled = [] + for k, v in self.__decisions.items(): + if self.__is_value_enabled(v): + enabled.append(k) + else: + disabled.append(k) + return "FeatureFlags(enabled=%r, disabled=%r)" % (enabled, disabled) + + +class FeatureStrategy(SearchStrategy): + def do_draw(self, data): + return FeatureFlags(data) diff --git a/hypothesis-python/src/hypothesis/stateful.py b/hypothesis-python/src/hypothesis/stateful.py index 6122179660..c9dd1c1b61 100644 --- a/hypothesis-python/src/hypothesis/stateful.py +++ b/hypothesis-python/src/hypothesis/stateful.py @@ -47,6 +47,7 @@ from hypothesis.internal.reflection import function_digest, nicerepr, proxies, qualname from hypothesis.internal.validation import check_type from hypothesis.reporting import current_verbosity, report +from hypothesis.searchstrategy.featureflags import FeatureStrategy from hypothesis.searchstrategy.strategies import OneOfStrategy, SearchStrategy from hypothesis.vendor.pretty import CUnicodeIO, RepresentationPrinter @@ -610,6 +611,10 @@ def __init__(self, machine): self.machine = machine self.rules = list(machine.rules()) + self.enabled_rules_strategy = st.shared( + FeatureStrategy(), key=("enabled rules", machine), + ) + # The order is a bit arbitrary. Primarily we're trying to group rules # that write to the same location together, and to put rules with no # target first as they have less effect on the structure. 
We order from @@ -635,12 +640,27 @@ def do_draw(self, data): if not any(self.is_valid(rule) for rule in self.rules): msg = u"No progress can be made from state %r" % (self.machine,) quiet_raise(InvalidDefinition(msg)) - rule = data.draw(st.sampled_from(self.rules).filter(self.is_valid)) + + feature_flags = data.draw(self.enabled_rules_strategy) + + # Note: The order of the filters here is actually quite important, + # because checking is_enabled makes choices, so increases the size of + # the choice sequence. This means that if we are in a case where many + # rules are invalid we will make a lot more choices if we ask if they + # are enabled before we ask if they are valid, so our test cases will + # be artificially large. + rule = data.draw( + st.sampled_from(self.rules) + .filter(self.is_valid) + .filter(lambda r: feature_flags.is_enabled(r.function.__name__)) + ) + return (rule, data.draw(rule.arguments_strategy)) def is_valid(self, rule): if rule.precondition and not rule.precondition(self.machine): return False + for b in rule.bundles: bundle = self.machine.bundle(b.name) if not bundle: diff --git a/hypothesis-python/tests/cover/test_feature_flags.py b/hypothesis-python/tests/cover/test_feature_flags.py new file mode 100644 index 0000000000..f0efd4a40f --- /dev/null +++ b/hypothesis-python/tests/cover/test_feature_flags.py @@ -0,0 +1,84 @@ +# coding=utf-8 +# +# This file is part of Hypothesis, which may be found at +# https://github.com/HypothesisWorks/hypothesis/ +# +# Most of this work is copyright (C) 2013-2019 David R. MacIver +# (david@drmaciver.com), but it contains contributions by others. See +# CONTRIBUTING.rst for a full list of people who may hold copyright, and +# consult the git log if you need to determine who owns an individual +# contribution. +# +# This Source Code Form is subject to the terms of the Mozilla Public License, +# v. 2.0. 
If a copy of the MPL was not distributed with this file, You can +# obtain one at https://mozilla.org/MPL/2.0/. +# +# END HEADER + +from __future__ import absolute_import, division, print_function + +from hypothesis import given, strategies as st +from hypothesis.internal.compat import hrange +from hypothesis.searchstrategy.featureflags import FeatureFlags, FeatureStrategy +from tests.common.debug import find_any, minimal + +STRAT = FeatureStrategy() + + +def test_can_all_be_enabled(): + find_any(STRAT, lambda x: all(x.is_enabled(i) for i in hrange(100))) + + +def test_can_all_be_disabled(): + find_any(STRAT, lambda x: all(not x.is_enabled(i) for i in hrange(100))) + + +def test_minimizes_open(): + features = hrange(10) + + flags = minimal(STRAT, lambda x: [x.is_enabled(i) for i in features]) + + assert all(flags.is_enabled(i) for i in features) + + +def test_minimizes_individual_features_to_open(): + features = list(hrange(10)) + + flags = minimal( + STRAT, lambda x: sum([x.is_enabled(i) for i in features]) < len(features) + ) + + assert all(flags.is_enabled(i) for i in features[:-1]) + assert not flags.is_enabled(features[-1]) + + +def test_marks_unknown_features_as_enabled(): + x = find_any(STRAT, lambda v: True) + + assert x.is_enabled("fish") + + +def test_by_default_all_enabled(): + f = FeatureFlags() + + assert f.is_enabled("foo") + + +@given(st.data()) +def test_repr_can_be_evalled(data): + flags = data.draw(STRAT) + + features = data.draw(st.lists(st.text(), unique=True)) + + for f in features: + flags.is_enabled(f) + + flags2 = eval(repr(flags)) + + for f in features: + assert flags2.is_enabled(f) == flags.is_enabled(f) + + more_features = data.draw(st.lists(st.text().filter(lambda s: s not in features))) + + for f in more_features: + assert flags2.is_enabled(f) diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py index 9f0a8a4845..7a59af6ef5 100644 --- 
a/hypothesis-python/tests/cover/test_stateful.py +++ b/hypothesis-python/tests/cover/test_stateful.py @@ -223,12 +223,42 @@ def fail(self, x, y): assert False +class CanSwarm(RuleBasedStateMachine): + """This test will essentially never pass if you choose rules uniformly at + random, because every time the snake rule fires we return to the beginning, + so we will tend to undo progress well before we make enough progress for + the test to fail. + + This tests our swarm testing functionality in stateful testing by ensuring + that we can sometimes generate long runs of steps which exclude a + particular rule. + """ + + def __init__(self): + super(CanSwarm, self).__init__() + self.seen = set() + + # The reason this rule takes a parameter is that it ensures that we do not + # achieve "swarming" by just restricting the alphabet for single byte + # decisions, which is a thing the underlying conjecture engine will + # happily do on its own without knowledge of the rule structure. + @rule(move=integers(0, 255)) + def ladder(self, move): + self.seen.add(move) + assert len(self.seen) <= 15 + + @rule() + def snake(self): + self.seen.clear() + + bad_machines = ( BalancedTrees, DepthMachine, RoseTreeStateMachine, NotTheLastMachine, PopulateMultipleTargets, + CanSwarm, ) for m in bad_machines: @@ -1164,7 +1194,7 @@ def oops(self): def test_reproduce_failure_works(): - @reproduce_failure(__version__, base64.b64encode(b"\0\0\0")) + @reproduce_failure(__version__, base64.b64encode(b"\0\0\0\0\0")) class TrivialMachine(RuleBasedStateMachine): @rule() def oops(self): @@ -1175,7 +1205,7 @@ def oops(self): def test_reproduce_failure_fails_if_no_error(): - @reproduce_failure(__version__, base64.b64encode(b"\0\0\0")) + @reproduce_failure(__version__, base64.b64encode(b"\0\0\0\0\0")) class TrivialMachine(RuleBasedStateMachine): @rule() def ok(self): diff --git a/hypothesis-python/tests/cover/test_statistical_events.py b/hypothesis-python/tests/cover/test_statistical_events.py index 
fcce5d3f2f..7d1edd6ecd 100644 --- a/hypothesis-python/tests/cover/test_statistical_events.py +++ b/hypothesis-python/tests/cover/test_statistical_events.py @@ -234,4 +234,4 @@ def do(self, item): def test_stateful_states_are_deduped(): stats = call_for_statistics(DemoStateMachine.TestCase().runTest) - assert len(stats.events) == 1 + assert len(stats.events) <= 2