Skip to content

Commit

Permalink
Increase variety in characters() generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Zac-HD committed Oct 25, 2018
1 parent cde4f35 commit 38284e6
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 54 deletions.
14 changes: 14 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
RELEASE_TYPE: minor

This release substantially increases the variety of examples from the
:func:`~hypothesis.strategies.characters` strategy.

Unicode characters used to be selected by codepoint alone, which made
generation of some rare character types highly unlikely (:issue:`1401`).
Character generation now selects a Unicode category - preferring letters
to numbers to whitespace, and so on - then a codepoint within that category.

If :func:`~hypothesis.strategies.text` is passed an alphabet which is not
already a strategy, it will follow the same generation and shrinking order
if possible. ``alphabet=None`` is now deprecated, and the default strategy
is also the default value for the argument.
28 changes: 20 additions & 8 deletions hypothesis-python/docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,28 @@ returns a new strategy for it. So for example:
.. doctest::

>>> from string import printable; from pprint import pprint
>>> json = recursive(none() | booleans() | floats() | text(printable),
... lambda children: lists(children, 1) | dictionaries(text(printable), children, min_size=1))
>>> nice_text = text(printable, max_size=3)
>>> json = recursive(none() | booleans() | floats() | nice_text,
... lambda children: lists(children, 1) | dictionaries(nice_text, children, min_size=1))
>>> pprint(json.example())
[[1.175494351e-38, ']', 1.9, True, False, '.M}Xl', ''], True]
{'^-$': {'': -4.4078854352852856e+16,
'3': None,
'_': False,
'`': '$',
'{': None}}
>>> pprint(json.example())
{'de(l': None,
'nK': {'(Rt)': None,
'+hoZh1YU]gy8': True,
'8z]EIFA06^l`i^': 'LFE{Q',
'9,': 'l{cA=/'}}
{'': {'': -2.00001},
' )\n': '',
'-': ['', True, None, True, nan, ''],
'-\x0c': {'': None, '`': None, 'h': True, '}O]': '^_['},
'9': {'': '',
' \\': -1.192092896e-07,
'+$!': True,
'-': False,
'<}': -1.1,
'_': None}}



That is, we start with our leaf data and then we augment it by allowing lists and dictionaries of anything we can generate as JSON data.

Expand Down
45 changes: 24 additions & 21 deletions hypothesis-python/src/hypothesis/internal/charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
if False:
from typing import Dict, Tuple
intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals],
Tuple[intervals]]


def charmap_file():
Expand Down Expand Up @@ -314,12 +315,9 @@ def _query_for_key(key):
except KeyError:
pass
assert key
if set(key) == set(categories()):
result = ((0, sys.maxunicode),)
else:
result = _union_intervals(
_query_for_key(key[:-1]), charmap()[key[-1]]
)
result = _union_intervals(
_query_for_key(key[:-1]), charmap()[key[-1]]
)
category_index_cache[key] = result
return result

Expand Down Expand Up @@ -354,25 +352,30 @@ def query(
if max_codepoint is None:
max_codepoint = sys.maxunicode
catkey = _category_key(exclude_categories, include_categories)
character_intervals = _intervals(include_characters or '')
exclude_intervals = _intervals(exclude_characters or '')
qkey = (
catkey, min_codepoint, max_codepoint,
character_intervals, exclude_intervals
_intervals(include_characters or ''), exclude_intervals
)
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
base = _query_for_key(catkey)
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((
max(u, min_codepoint), min(v, max_codepoint)
))
result = tuple(result)
result = _union_intervals(result, character_intervals)
result = _subtract_intervals(result, exclude_intervals)
limited_category_index_cache[qkey] = result
return result
out = []
for cat in categories():
if cat in catkey:
base = _query_for_key((cat,))
else:
base = ()
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
include_chars = _intervals([c for c in (include_characters or '')
if unicodedata.category(c) == cat])
result = _union_intervals(tuple(result), include_chars)
result = _subtract_intervals(result, exclude_intervals)
if result:
out.append(result)
limited_category_index_cache[qkey] = tuple(out)
return tuple(out)
45 changes: 36 additions & 9 deletions hypothesis-python/src/hypothesis/searchstrategy/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@

from hypothesis.errors import InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.compat import hunichr, binary_type
from hypothesis.internal.compat import hunichr, binary_type, int_to_bytes
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.internal.conjecture.utils import integer_range
from hypothesis.internal.conjecture.utils import choice, integer_range, \
calc_label_from_name
from hypothesis.searchstrategy.strategies import SearchStrategy, \
MappedSearchStrategy

ONE_UNICODE_CHAR_LABEL = calc_label_from_name('one unicode character')


class OneCharStringStrategy(SearchStrategy):
"""A strategy which generates single character strings of text type."""
Expand Down Expand Up @@ -60,15 +63,39 @@ def __init__(self,
'combination of arguments: ' + ', '.join(
'%s=%r' % arg for arg in arguments if arg[1] is not None)
)
self.intervals = IntervalSet(intervals)
self.zero_point = self.intervals.index_above(ord('0'))
self.category_intervals = [IntervalSet(i) for i in intervals]

def do_draw(self, data):
i = integer_range(
data, 0, len(self.intervals) - 1,
center=self.zero_point,
all_intervals = sorted(sum(intervals, ()), reverse=True)
self.all_intervals = IntervalSet(
charmap._union_intervals(all_intervals[:1], all_intervals[1:])
)
return hunichr(self.intervals[i])

def get_char_idx(self, data, category):
    # Draw an index into *category* (an IntervalSet of codepoints) and
    # return the codepoint at that index. The draw is centred on the
    # index just above ord('0'), so shrinking biases characters toward
    # the digit zero rather than toward codepoint 0.
    return category[integer_range(
        data, 0, len(category) - 1,
        center=category.index_above(ord('0'))
    )]

def do_draw(self, data):
    """Draw one character and return it as a length-one text string.

    NOTE(review): relies on ``self.category_intervals`` (one IntervalSet
    per Unicode category) and ``self.all_intervals`` (the union of all of
    them), both set up in ``__init__`` - confirm against that method.
    """
    # Drawing a unicode character uses the "shrink open" trick - we start
    # by drawing a boolean (from eight bits), a category, and a character
    # within that category. Note that this is grouped in an example, so
    # that it can be easily zeroed by the shrinker later.
    data.start_example(ONE_UNICODE_CHAR_LABEL)
    use_category = data.draw_bits(8) != 0
    category = choice(data, self.category_intervals)
    char_idx = self.get_char_idx(data, category)
    data.stop_example()
    # After generating by index-within-category, we want to shrink by
    # codepoint instead. We therefore write the bytes that would generate
    # this char_idx by codepoint. When the shrinker zeros out the first
    # example the value of char_idx will be determined by the written
    # bytes, and after that shrinking depends only on the codepoint.
    if use_category:
        data.write(int_to_bytes(self.all_intervals.index(char_idx), 4))
    else:
        char_idx = self.get_char_idx(data, self.all_intervals)
    return hunichr(char_idx)


class StringStrategy(MappedSearchStrategy):
Expand Down
31 changes: 17 additions & 14 deletions hypothesis-python/tests/cover/test_charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import sys
import tempfile
import unicodedata
from itertools import chain

import hypothesis.strategies as st
import hypothesis.internal.charmap as cm
Expand Down Expand Up @@ -58,14 +59,15 @@ def assert_valid_range_list(ls):
st.sets(st.sampled_from(cm.categories())) | st.none(),
)
def test_query_matches_categories(exclude, include):
values = cm.query(exclude, include)
assert_valid_range_list(values)
for u, v in values:
for i in (u, v, (u + v) // 2):
cat = unicodedata.category(hunichr(i))
if include is not None:
assert cat in include
assert cat not in exclude
cats = cm.query(exclude, include)
for values in cats:
assert_valid_range_list(values)
for u, v in values:
for i in (u, v, (u + v) // 2):
cat = unicodedata.category(hunichr(i))
if include is not None:
assert cat in include
assert cat not in exclude


@given(
Expand All @@ -75,19 +77,20 @@ def test_query_matches_categories(exclude, include):
)
def test_query_matches_categories_codepoints(exclude, include, m1, m2):
m1, m2 = sorted((m1, m2))
values = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
assert_valid_range_list(values)
for u, v in values:
assert m1 <= u
assert v <= m2
cats = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
for values in cats:
assert_valid_range_list(values)
for u, v in values:
assert m1 <= u
assert v <= m2


@given(st.sampled_from(cm.categories()), st.integers(0, sys.maxunicode))
def test_exclude_only_excludes_from_that_category(cat, i):
c = hunichr(i)
assume(unicodedata.category(c) != cat)
intervals = cm.query(exclude_categories=(cat,))
assert any(a <= i <= b for a, b in intervals)
assert any(a <= i <= b for a, b in chain.from_iterable(intervals))


def test_reload_charmap():
Expand Down
15 changes: 14 additions & 1 deletion hypothesis-python/tests/cover/test_simple_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@

from __future__ import division, print_function, absolute_import

import unicodedata
from random import Random

import pytest

from hypothesis import given
from tests.common.debug import minimal
from tests.common.utils import checks_deprecated_behaviour
from tests.common.utils import fails, checks_deprecated_behaviour
from hypothesis.strategies import text, binary, tuples, characters


Expand Down Expand Up @@ -141,3 +142,15 @@ def test_explicit_alphabet_None_is_deprecated():
@checks_deprecated_behaviour
def test_alphabet_non_string():
text([1, 2, 3]).example()


@fails
@given(text(min_size=2))
def test_can_find_non_NFC_normalised_strings_issue_341(s):
    # The @fails decorator inverts the check: this test passes only if
    # Hypothesis can generate a string that is NOT NFC-normalised, i.e.
    # a counterexample to the assertion below (regression for issue 341).
    assert s == unicodedata.normalize('NFC', s)


@fails
@given(text(min_size=1))
def test_can_find_non_NFD_normalised_strings_issue_341(s):
    # As above, but for NFD: @fails requires that Hypothesis finds some
    # string which is not NFD-normalised, falsifying this assertion.
    assert s == unicodedata.normalize('NFD', s)
3 changes: 2 additions & 1 deletion whole-repo-tests/test_doctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def test_doctests():
env['PYTHONPATH'] = 'src'

pip_tool(
'sphinx-build', '-W', '-b', 'doctest', '-d', 'docs/_build/doctrees',
'sphinx-build', '-n', '-W', '--keep-going', '-T',
'-b', 'doctest', '-d', 'docs/_build/doctrees',
'docs', 'docs/_build/html', env=env, cwd=BASE_DIR,
)

0 comments on commit 38284e6

Please sign in to comment.