From a6e5944b0fa2d919ff5bf77ff289a6f31b7cb7e2 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 13:07:30 +0000
Subject: [PATCH 01/12] Make sure to freeze data after use.

---
 hypothesis-python/src/hypothesis/core.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hypothesis-python/src/hypothesis/core.py b/hypothesis-python/src/hypothesis/core.py
index 489db7a835..79fea63abf 100644
--- a/hypothesis-python/src/hypothesis/core.py
+++ b/hypothesis-python/src/hypothesis/core.py
@@ -758,6 +758,12 @@ def run_engine(self):
                     report("".join(traceback.format_exception(type(e), e, tb)))
                 finally:  # pragma: no cover
+                    # Mostly useful for ``find`` and ensuring that objects that
+                    # hold on to a reference to ``data`` know that it's now been
+                    # finished and they shouldn't attempt to draw more data from
+                    # it.
+                    ran_example.freeze()
+
                     # This section is in fact entirely covered by the tests in
                     # test_reproduce_failure, but it seems to trigger a lovely set
                     # of coverage bugs: The branches show up as uncovered (despite

From d8634e4a132fb81f87269504cde51e490a6a0e9a Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 14:13:44 +0000
Subject: [PATCH 02/12] We can have two statistical events due to aborted tests

---
 hypothesis-python/tests/cover/test_statistical_events.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hypothesis-python/tests/cover/test_statistical_events.py b/hypothesis-python/tests/cover/test_statistical_events.py
index fcce5d3f2f..7d1edd6ecd 100644
--- a/hypothesis-python/tests/cover/test_statistical_events.py
+++ b/hypothesis-python/tests/cover/test_statistical_events.py
@@ -234,4 +234,4 @@ def do(self, item):
 
 def test_stateful_states_are_deduped():
     stats = call_for_statistics(DemoStateMachine.TestCase().runTest)
-    assert len(stats.events) == 1
+    assert len(stats.events) <= 2

From c645ccd085b2a156a61085cfe70a327f4318d540 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 13:08:23 +0000
Subject: [PATCH 03/12] Add a strategy for swarm testing

---
 .../hypothesis/searchstrategy/featureflags.py | 118 ++++++++++++++++++
 .../tests/cover/test_feature_flags.py         |  57 +++++++++
 2 files changed, 175 insertions(+)
 create mode 100644 hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
 create mode 100644 hypothesis-python/tests/cover/test_feature_flags.py

diff --git a/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py b/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
new file mode 100644
index 0000000000..f7867c9bc5
--- /dev/null
+++ b/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
@@ -0,0 +1,118 @@
+# coding=utf-8
+#
+# This file is part of Hypothesis, which may be found at
+# https://github.com/HypothesisWorks/hypothesis/
+#
+# Most of this work is copyright (C) 2013-2019 David R. MacIver
+# (david@drmaciver.com), but it contains contributions by others. See
+# CONTRIBUTING.rst for a full list of people who may hold copyright, and
+# consult the git log if you need to determine who owns an individual
+# contribution.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+#
+# END HEADER
+
+from __future__ import absolute_import, division, print_function
+
+import hypothesis.internal.conjecture.utils as cu
+from hypothesis.searchstrategy.strategies import SearchStrategy
+
+FEATURE_LABEL = cu.calc_label_from_name("feature flag")
+
+
+class FeatureFlags(object):
+    """Object that can be used to control a number of feature flags for a
+    given test run.
+
+    This enables an approach to data generation called swarm testing (
+    see Groce, Alex, et al. "Swarm testing." Proceedings of the 2012
+    International Symposium on Software Testing and Analysis. ACM, 2012), in
+    which generation is biased by selectively turning some features off for
+    each test case generated. When there are many interacting features this can
+    find bugs that a pure generation strategy would otherwise have missed.
+
+    FeatureFlags are designed to "shrink open", so that during shrinking they
+    become less restrictive. This allows us to potentially shrink to smaller
+    test cases that were forbidden during the generation phase because they
+    required disabled features.
+    """
+
+    def __init__(self, data):
+        self.__data = data
+        self.__decisions = {}
+
+        # In the original swarm testing paper they turn features on or off
+        # uniformly at random. Instead we decide the probability with which to
+        # enable features up front. This can allow for scenarios where all or
+        # no features are enabled, which are vanishingly unlikely in the
+        # original model.
+        #
+        # We implement this as a single 8-bit integer and enable features which
+        # score >= that value. In particular when self.__baseline is 0, all
+        # features will be enabled. This is so that we shrink in the direction
+        # of more features being enabled.
+        self.__baseline = data.draw_bits(8)
+
+    def is_enabled(self, name):
+        """Tests whether the feature named ``name`` should be enabled on this
+        test run."""
+        if self.__data.frozen:
+            # Feature set objects might hang around after data generation has
+            # finished. If this happens then we just report all new features as
+            # enabled, because that's our shrinking direction and they have no
+            # impact on data generation if they weren't used while it was
+            # running.
+            try:
+                return self.__is_value_enabled(self.__decisions[name])
+            except KeyError:
+                return True
+
+        data = self.__data
+
+        data.start_example(label=FEATURE_LABEL)
+        if name in self.__decisions:
+            # If we've already decided on this feature then we don't actually
+            # need to draw anything, but we do write the same decision to the
+            # input stream. This allows us to lazily decide whether a feature
+            # is enabled, because it means that if we happen to delete the part
+            # of the test case where we originally decided, the next point at
+            # which we make this decision just makes the decision it previously
+            # made.
+            value = self.__decisions[name]
+            data.draw_bits(8, forced=value)
+        else:
+            # If the baseline is 0 then everything is enabled so it doesn't
+            # matter what we have here and we might as well make the shrinker's
+            # life easier by forcing it to zero.
+            if self.__baseline == 0:
+                value = 0
+                data.draw_bits(8, forced=0)
+            else:
+                value = data.draw_bits(8)
+            self.__decisions[name] = value
+        data.stop_example()
+        return self.__is_value_enabled(value)
+
+    def __is_value_enabled(self, value):
+        """Check if a given value drawn for a feature counts as enabled. Note
+        that low values are more likely to be enabled. This is again in aid of
+        shrinking open. In particular a value of 0 is always enabled."""
+        return (255 - value) >= self.__baseline
+
+    def __repr__(self):
+        enabled = []
+        disabled = []
+        for k, v in self.__decisions.items():
+            if self.__is_value_enabled(v):
+                enabled.append(k)
+            else:
+                disabled.append(k)
+        return "FeatureFlags(enabled=%r, disabled=%r)" % (enabled, disabled)
+
+
+class FeatureStrategy(SearchStrategy):
+    def do_draw(self, data):
+        return FeatureFlags(data)
diff --git a/hypothesis-python/tests/cover/test_feature_flags.py b/hypothesis-python/tests/cover/test_feature_flags.py
new file mode 100644
index 0000000000..09d38c3fd3
--- /dev/null
+++ b/hypothesis-python/tests/cover/test_feature_flags.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+#
+# This file is part of Hypothesis, which may be found at
+# https://github.com/HypothesisWorks/hypothesis/
+#
+# Most of this work is copyright (C) 2013-2019 David R. MacIver
+# (david@drmaciver.com), but it contains contributions by others. See
+# CONTRIBUTING.rst for a full list of people who may hold copyright, and
+# consult the git log if you need to determine who owns an individual
+# contribution.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public License,
+# v. 2.0. If a copy of the MPL was not distributed with this file, You can
+# obtain one at https://mozilla.org/MPL/2.0/.
+#
+# END HEADER
+
+from __future__ import absolute_import, division, print_function
+
+from hypothesis.internal.compat import hrange
+from hypothesis.searchstrategy.featureflags import FeatureStrategy
+from tests.common.debug import find_any, minimal
+
+STRAT = FeatureStrategy()
+
+
+def test_can_all_be_enabled():
+    find_any(STRAT, lambda x: all(x.is_enabled(i) for i in hrange(100)))
+
+
+def test_can_all_be_disabled():
+    find_any(STRAT, lambda x: all(not x.is_enabled(i) for i in hrange(100)))
+
+
+def test_minimizes_open():
+    features = hrange(10)
+
+    flags = minimal(STRAT, lambda x: [x.is_enabled(i) for i in features])
+
+    assert all(flags.is_enabled(i) for i in features)
+
+
+def test_minimizes_individual_features_to_open():
+    features = hrange(10)
+
+    flags = minimal(
+        STRAT, lambda x: sum([x.is_enabled(i) for i in features]) < len(features)
+    )
+
+    assert all(flags.is_enabled(i) for i in features[:-1])
+    assert not flags.is_enabled(features[-1])
+
+
+def test_marks_unknown_features_as_enabled():
+    x = find_any(STRAT, lambda v: True)
+
+    assert x.is_enabled("fish")

From 5b0090844ec5b3fe554a11085611e51cb9a09e2a Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 13:13:10 +0000
Subject: [PATCH 04/12] Add test to demonstrate the problem

---
 .../tests/cover/test_stateful.py | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py
index 9f0a8a4845..19b5bd196c 100644
--- a/hypothesis-python/tests/cover/test_stateful.py
+++ b/hypothesis-python/tests/cover/test_stateful.py
@@ -223,12 +223,42 @@ def fail(self, x, y):
         assert False
 
 
+class CanSwarm(RuleBasedStateMachine):
+    """This test will essentially never pass if you choose rules uniformly at
+    random, because every time the snake rule fires we return to the beginning,
+    so we will tend to undo progress well before we make neough progress for
+    the test to fail.
+
+    This tests our swarm testing functionality in stateful testing by ensuring
+    that we can sometimes generate long runs of steps which exclude a
+    particular rule.
+    """
+
+    def __init__(self):
+        super(CanSwarm, self).__init__()
+        self.seen = set()
+
+    # The reason this rule takes a parameter is that it ensures that we do not
+    # achieve "swarming" by just restricting the alphabet for single byte
+    # decisions, which is a thing the underlying conjecture engine will
+    # happily do on its own without knowledge of the rule structure.
+    @rule(move=integers(0, 255))
+    def ladder(self, move):
+        self.seen.add(move)
+        assert len(self.seen) <= 15
+
+    @rule()
+    def snake(self):
+        self.seen.clear()
+
+
 bad_machines = (
     BalancedTrees,
     DepthMachine,
     RoseTreeStateMachine,
     NotTheLastMachine,
     PopulateMultipleTargets,
+    CanSwarm,
 )
 
 for m in bad_machines:

From 80321207efd4d6d7a50048169d99dfb24bc6d260 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 13:13:47 +0000
Subject: [PATCH 05/12] Use swarm testing to enable/disable rules in stateful testing

---
 hypothesis-python/src/hypothesis/stateful.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/hypothesis-python/src/hypothesis/stateful.py b/hypothesis-python/src/hypothesis/stateful.py
index 6122179660..fb3e5d09bd 100644
--- a/hypothesis-python/src/hypothesis/stateful.py
+++ b/hypothesis-python/src/hypothesis/stateful.py
@@ -47,6 +47,7 @@
 from hypothesis.internal.reflection import function_digest, nicerepr, proxies, qualname
 from hypothesis.internal.validation import check_type
 from hypothesis.reporting import current_verbosity, report
+from hypothesis.searchstrategy.featureflags import FeatureStrategy
 from hypothesis.searchstrategy.strategies import OneOfStrategy, SearchStrategy
 from hypothesis.vendor.pretty import CUnicodeIO, RepresentationPrinter
 
@@ -610,6 +611,10 @@ def __init__(self, machine):
         self.machine = machine
         self.rules = list(machine.rules())
 
+        self.enabled_rules_strategy = st.shared(
+            FeatureStrategy(), key=("enabled rules", machine),
+        )
+
         # The order is a bit arbitrary. Primarily we're trying to group rules
         # that write to the same location together, and to put rules with no
         # target first as they have less effect on the structure. We order from
@@ -635,12 +640,21 @@ def do_draw(self, data):
         if not any(self.is_valid(rule) for rule in self.rules):
             msg = u"No progress can be made from state %r" % (self.machine,)
             quiet_raise(InvalidDefinition(msg))
-        rule = data.draw(st.sampled_from(self.rules).filter(self.is_valid))
+
+        feature_flags = data.draw(self.enabled_rules_strategy)
+
+        rule = data.draw(
+            st.sampled_from(self.rules)
+            .filter(self.is_valid)
+            .filter(lambda r: feature_flags.is_enabled(r.function.__name__))
+        )
+
         return (rule, data.draw(rule.arguments_strategy))
 
     def is_valid(self, rule):
         if rule.precondition and not rule.precondition(self.machine):
             return False
+
         for b in rule.bundles:
             bundle = self.machine.bundle(b.name)
             if not bundle:

From 493bb257b4dea7c83d244294d163425f6cb4c064 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 13:17:18 +0000
Subject: [PATCH 06/12] Add release file

---
 hypothesis-python/RELEASE.rst | 8 ++++++++
 1 file changed, 8 insertions(+)
 create mode 100644 hypothesis-python/RELEASE.rst

diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
new file mode 100644
index 0000000000..f67b13f3b8
--- /dev/null
+++ b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,8 @@
+RELEASE_TYPE: patch
+
+This release significantly improves the data distribution in rule based stateful testing,
+by using a technique called Swarm Testing (Groce, Alex, et al. "Swarm testing."
+Proceedings of the 2012 International Symposium on Software Testing and Analysis. ACM, 2012.)
+to select which rules are run in any given test case. This should allow it to find many issues that it would previously have missed.
+
+This change is likely to be especially beneficial for stateful tests with large numbers of rules.

From 7750be447a9138ccf8ea90bb0016a20e08facae0 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 13:43:13 +0000
Subject: [PATCH 07/12] Fix format for reproduce failure tests

---
 hypothesis-python/tests/cover/test_stateful.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py
index 19b5bd196c..484cd427ab 100644
--- a/hypothesis-python/tests/cover/test_stateful.py
+++ b/hypothesis-python/tests/cover/test_stateful.py
@@ -1194,7 +1194,7 @@ def oops(self):
 
 
 def test_reproduce_failure_works():
-    @reproduce_failure(__version__, base64.b64encode(b"\0\0\0"))
+    @reproduce_failure(__version__, base64.b64encode(b"\0\0\0\0\0"))
     class TrivialMachine(RuleBasedStateMachine):
         @rule()
         def oops(self):
@@ -1205,7 +1205,7 @@ def oops(self):
 
 
 def test_reproduce_failure_fails_if_no_error():
-    @reproduce_failure(__version__, base64.b64encode(b"\0\0\0"))
+    @reproduce_failure(__version__, base64.b64encode(b"\0\0\0\0\0"))
     class TrivialMachine(RuleBasedStateMachine):
         @rule()
         def ok(self):

From bc9cd73f6881135568f17bed9ac850b66e21f07a Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 15:32:50 +0000
Subject: [PATCH 08/12] That needs to be a list on Python 2

---
 hypothesis-python/tests/cover/test_feature_flags.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hypothesis-python/tests/cover/test_feature_flags.py b/hypothesis-python/tests/cover/test_feature_flags.py
index 09d38c3fd3..13e7cce1e4 100644
--- a/hypothesis-python/tests/cover/test_feature_flags.py
+++ b/hypothesis-python/tests/cover/test_feature_flags.py
@@ -41,7 +41,7 @@ def test_minimizes_open():
 
 
 def test_minimizes_individual_features_to_open():
-    features = hrange(10)
+    features = list(hrange(10))
 
     flags = minimal(
         STRAT, lambda x: sum([x.is_enabled(i) for i in features]) < len(features)

From 5d69404df7dbcd3aeea08a739e8b64d8de07694e Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 17:20:22 +0000
Subject: [PATCH 09/12] Fix typo

---
 hypothesis-python/tests/cover/test_stateful.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hypothesis-python/tests/cover/test_stateful.py b/hypothesis-python/tests/cover/test_stateful.py
index 484cd427ab..7a59af6ef5 100644
--- a/hypothesis-python/tests/cover/test_stateful.py
+++ b/hypothesis-python/tests/cover/test_stateful.py
@@ -226,7 +226,7 @@ def fail(self, x, y):
 class CanSwarm(RuleBasedStateMachine):
     """This test will essentially never pass if you choose rules uniformly at
     random, because every time the snake rule fires we return to the beginning,
-    so we will tend to undo progress well before we make neough progress for
+    so we will tend to undo progress well before we make enough progress for
     the test to fail.
 
     This tests our swarm testing functionality in stateful testing by ensuring

From bce9d2a3f24a14e00d9e63e264337e28ac20e46b Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Wed, 27 Nov 2019 17:59:31 +0000
Subject: [PATCH 10/12] Add warning comment about order of checks

---
 hypothesis-python/src/hypothesis/stateful.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hypothesis-python/src/hypothesis/stateful.py b/hypothesis-python/src/hypothesis/stateful.py
index fb3e5d09bd..c9dd1c1b61 100644
--- a/hypothesis-python/src/hypothesis/stateful.py
+++ b/hypothesis-python/src/hypothesis/stateful.py
@@ -643,6 +643,12 @@ def do_draw(self, data):
 
         feature_flags = data.draw(self.enabled_rules_strategy)
 
+        # Note: The order of the filters here is actually quite important,
+        # because checking is_enabled makes choices, so increases the size of
+        # the choice sequence. This means that if we are in a case where many
+        # rules are invalid we will make a lot more choices if we ask if they
+        # are enabled before we ask if they are valid, so our test cases will
+        # be artificially large.
         rule = data.draw(
             st.sampled_from(self.rules)
             .filter(self.is_valid)

From 02e4a30133563747b2509fe2859da025eb0fce08 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Thu, 28 Nov 2019 10:01:20 +0000
Subject: [PATCH 11/12] Improve RELEASE.rst

---
 hypothesis-python/RELEASE.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
index f67b13f3b8..3ef07510e5 100644
--- a/hypothesis-python/RELEASE.rst
+++ b/hypothesis-python/RELEASE.rst
@@ -1,8 +1,8 @@
-RELEASE_TYPE: patch
+RELEASE_TYPE: minor
 
 This release significantly improves the data distribution in rule based stateful testing,
-by using a technique called Swarm Testing (Groce, Alex, et al. "Swarm testing."
-Proceedings of the 2012 International Symposium on Software Testing and Analysis. ACM, 2012.)
+by using a technique called `Swarm Testing (Groce, Alex, et al. "Swarm testing."
+Proceedings of the 2012 International Symposium on Software Testing and Analysis. ACM, 2012.) `_
 to select which rules are run in any given test case. This should allow it to find many issues that it would previously have missed.
 
 This change is likely to be especially beneficial for stateful tests with large numbers of rules.

From 42ee3bebe9d40c97460ce4a4592f6e0e0eb15c49 Mon Sep 17 00:00:00 2001
From: "David R. MacIver"
Date: Thu, 28 Nov 2019 10:01:32 +0000
Subject: [PATCH 12/12] Allow explicit construction of FeatureFlags

---
 .../hypothesis/searchstrategy/featureflags.py | 17 +++++++++--
 .../tests/cover/test_feature_flags.py         | 29 ++++++++++++++++++-
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py b/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
index f7867c9bc5..f8ae129a1e 100644
--- a/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
+++ b/hypothesis-python/src/hypothesis/searchstrategy/featureflags.py
@@ -40,10 +40,16 @@ class FeatureFlags(object):
     required disabled features.
     """
 
-    def __init__(self, data):
+    def __init__(self, data=None, enabled=(), disabled=()):
         self.__data = data
         self.__decisions = {}
 
+        for f in enabled:
+            self.__decisions[f] = 0
+
+        for f in disabled:
+            self.__decisions[f] = 255
+
         # In the original swarm testing paper they turn features on or off
         # uniformly at random. Instead we decide the probability with which to
         # enable features up front. This can allow for scenarios where all or
         # no features are enabled, which are vanishingly unlikely in the
         # original model.
         #
         # We implement this as a single 8-bit integer and enable features which
         # score >= that value. In particular when self.__baseline is 0, all
         # features will be enabled. This is so that we shrink in the direction
         # of more features being enabled.
-        self.__baseline = data.draw_bits(8)
+        if self.__data is not None:
+            self.__baseline = data.draw_bits(8)
+        else:
+            # If data is None we're in example mode so all that matters is the
+            # enabled/disabled lists above. We set this up so that
+            self.__baseline = 1
 
     def is_enabled(self, name):
         """Tests whether the feature named ``name`` should be enabled on this
         test run."""
-        if self.__data.frozen:
+        if self.__data is None or self.__data.frozen:
             # Feature set objects might hang around after data generation has
             # finished. If this happens then we just report all new features as
             # enabled, because that's our shrinking direction and they have no
diff --git a/hypothesis-python/tests/cover/test_feature_flags.py b/hypothesis-python/tests/cover/test_feature_flags.py
index 13e7cce1e4..f0efd4a40f 100644
--- a/hypothesis-python/tests/cover/test_feature_flags.py
+++ b/hypothesis-python/tests/cover/test_feature_flags.py
@@ -17,8 +17,9 @@
 
 from __future__ import absolute_import, division, print_function
 
+from hypothesis import given, strategies as st
 from hypothesis.internal.compat import hrange
-from hypothesis.searchstrategy.featureflags import FeatureStrategy
+from hypothesis.searchstrategy.featureflags import FeatureFlags, FeatureStrategy
 from tests.common.debug import find_any, minimal
 
 STRAT = FeatureStrategy()
@@ -55,3 +56,29 @@ def test_marks_unknown_features_as_enabled():
     x = find_any(STRAT, lambda v: True)
 
     assert x.is_enabled("fish")
+
+
+def test_by_default_all_enabled():
+    f = FeatureFlags()
+
+    assert f.is_enabled("foo")
+
+
+@given(st.data())
+def test_repr_can_be_evalled(data):
+    flags = data.draw(STRAT)
+
+    features = data.draw(st.lists(st.text(), unique=True))
+
+    for f in features:
+        flags.is_enabled(f)
+
+    flags2 = eval(repr(flags))
+
+    for f in features:
+        assert flags2.is_enabled(f) == flags.is_enabled(f)
+
+    more_features = data.draw(st.lists(st.text().filter(lambda s: s not in features)))
+
+    for f in more_features:
+        assert flags2.is_enabled(f)
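
As a quick illustration of the threshold rule used by ``FeatureFlags`` in the patches above (a drawn byte ``value`` counts as enabled when ``255 - value >= baseline``, so a baseline of 0 enables everything and a drawn value of 0 is always enabled), here is a small standalone sketch; ``is_value_enabled`` is a hypothetical name used only for this example and is not part of the patch series or of Hypothesis's API::

    # Standalone sketch of the FeatureFlags enable/disable rule described above.
    def is_value_enabled(value, baseline):
        # Low drawn values are "more enabled"; a baseline of 0 turns everything on.
        return (255 - value) >= baseline

    assert all(is_value_enabled(v, baseline=0) for v in range(256))  # baseline 0: all enabled
    assert all(is_value_enabled(0, baseline=b) for b in range(256))  # value 0: always enabled
    assert not is_value_enabled(255, baseline=1)  # high values are the first to be disabled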