Increase variety in characters() generation

HypothesisWorks · Oct 23, 2018 · 0ed071f · 0ed071f
1 parent 74839e6
commit 0ed071f
Show file tree

Hide file tree

Showing 6 changed files with 124 additions and 59 deletions.
diff --git a/hypothesis-python/RELEASE.rst b/hypothesis-python/RELEASE.rst
@@ -0,0 +1,14 @@
+RELEASE_TYPE: minor
+
+This release substantially increases the variety of examples from the
+:func:`~hypothesis.strategies.characters` strategy.
+
+Unicode characters used to be selected by codepoint alone, which made
+generation of some rare character types highly unlikely (:issue:`1401`).
+Character generation now selects a Unicode category - preferring letters
+to numbers to whitespace, and so on - then a code point within that category.
+
+If :func:`~hypothesis.strategies.text` is passed an alphabet which is not
+already a strategy, it will follow the same generation and shrinking order
+if possible. ``alphabet=None`` is now deprecated, and the default strategy
+is also the default value for the argument.
diff --git a/hypothesis-python/docs/data.rst b/hypothesis-python/docs/data.rst
@@ -177,16 +177,24 @@ returns a new strategy for it. So for example:
 .. doctest::
 
  >>> from string import printable; from pprint import pprint
- >>> json = recursive(none() | booleans() | floats() | text(printable),
- ... lambda children: lists(children, 1) | dictionaries(text(printable), children, min_size=1))
+ >>> nice_text = text(printable, max_size=3)
+ >>> json = recursive(none() | booleans() | floats() | nice_text,
+ ... lambda children: lists(children, 1) | dictionaries(nice_text, children, min_size=1))
  >>> pprint(json.example())
- [[1.175494351e-38, ']', 1.9, True, False, '.M}Xl', ''], True]
+ {'': None,
+ '##': None,
+ 'V': [1.175494351e-38, 'R', '|jk', True, ''],
+ 'l': None,
+ 'p#': '5\x0b'}
  >>> pprint(json.example())
- {'de(l': None,
- 'nK': {'(Rt)': None,
- '+hoZh1YU]gy8': True,
- '8z]EIFA06^l`i^': 'LFE{Q',
- '9,': 'l{cA=/'}}
+ {'+': [None,
+ -1.6075146256587385e+223,
+ False,
+ ...,
+ 1.872980906096507e+16,
+ '#vX',
+ 0.5]}
+
 
 That is, we start with our leaf data and then we augment it by allowing lists and dictionaries of anything we can generate as JSON data.
 

diff --git a/hypothesis-python/src/hypothesis/internal/charmap.py b/hypothesis-python/src/hypothesis/internal/charmap.py
@@ -31,7 +31,8 @@
 if False:
  from typing import Dict, Tuple
  intervals = Tuple[Tuple[int, int], ...]
- cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
+ cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals],
+ Tuple[intervals]]
 
 
 def charmap_file():
@@ -110,14 +111,17 @@ def categories():
  """
  global _categories
  if _categories is None:
- cm = charmap()
- _categories = sorted(
- cm.keys(), key=lambda c: len(cm[c])
- )
- _categories.remove('Cc') # Other, Control
- _categories.remove('Cs') # Other, Surrogate
- _categories.append('Cc')
- _categories.append('Cs')
+ # Written out manually to define our shrinking order, smallest first.
+ _categories = (
+ 'Ll Lu Lt Lm Lo ' # Letter: lower/upper/titlecase, modifier, other
+ 'Nd Nl No ' # Number: decimal digit, letter, other
+ 'Zs Zl Zp ' # Separator: space, line, paragraph
+ 'Pc Pd Ps Pe Pi Pf Po ' # Various punctuation
+ 'Sc Sm Sk So ' # Symbols: currency, math, modifier, other
+ 'Mn Mc Me ' # Marks: nonspacing, spacing combining, enclosing
+ 'Cf Co Cn Cc Cs' # Other: format, private, --, control, surrogate
+ ).split()
+ assert set(charmap().keys()) == set(_categories)
  return tuple(_categories)
 
 
@@ -314,12 +318,9 @@ def _query_for_key(key):
  except KeyError:
  pass
  assert key
- if set(key) == set(categories()):
- result = ((0, sys.maxunicode),)
- else:
- result = _union_intervals(
- _query_for_key(key[:-1]), charmap()[key[-1]]
- )
+ result = _union_intervals(
+ _query_for_key(key[:-1]), charmap()[key[-1]]
+ )
  category_index_cache[key] = result
  return result
 
@@ -354,25 +355,30 @@ def query(
  if max_codepoint is None:
  max_codepoint = sys.maxunicode
  catkey = _category_key(exclude_categories, include_categories)
- character_intervals = _intervals(include_characters or '')
  exclude_intervals = _intervals(exclude_characters or '')
  qkey = (
  catkey, min_codepoint, max_codepoint,
- character_intervals, exclude_intervals
+ _intervals(include_characters or ''), exclude_intervals
  )
  try:
  return limited_category_index_cache[qkey]
  except KeyError:
  pass
- base = _query_for_key(catkey)
- result = []
- for u, v in base:
- if v >= min_codepoint and u <= max_codepoint:
- result.append((
- max(u, min_codepoint), min(v, max_codepoint)
- ))
- result = tuple(result)
- result = _union_intervals(result, character_intervals)
- result = _subtract_intervals(result, exclude_intervals)
- limited_category_index_cache[qkey] = result
- return result
+ out = []
+ for cat in categories():
+ if cat in catkey:
+ base = _query_for_key((cat,))
+ else:
+ base = ()
+ result = []
+ for u, v in base:
+ if v >= min_codepoint and u <= max_codepoint:
+ result.append((max(u, min_codepoint), min(v, max_codepoint)))
+ include_chars = _intervals([c for c in (include_characters or '')
+ if unicodedata.category(c) == cat])
+ result = _union_intervals(tuple(result), include_chars)
+ result = _subtract_intervals(result, exclude_intervals)
+ if result:
+ out.append(result)
+ limited_category_index_cache[qkey] = tuple(out)
+ return tuple(out)
diff --git a/hypothesis-python/src/hypothesis/searchstrategy/strings.py b/hypothesis-python/src/hypothesis/searchstrategy/strings.py
@@ -19,9 +19,10 @@
 
 from hypothesis.errors import InvalidArgument
 from hypothesis.internal import charmap
-from hypothesis.internal.compat import hunichr, binary_type
+from hypothesis.internal.compat import hunichr, binary_type, int_to_bytes
 from hypothesis.internal.intervalsets import IntervalSet
-from hypothesis.internal.conjecture.utils import integer_range
+from hypothesis.internal.conjecture.utils import choice, biased_coin, \
+ integer_range
 from hypothesis.searchstrategy.strategies import SearchStrategy, \
  MappedSearchStrategy
 
@@ -60,15 +61,35 @@ def __init__(self,
  'combination of arguments: ' + ', '.join(
  '%s=%r' % arg for arg in arguments if arg[1] is not None)
  )
- self.intervals = IntervalSet(intervals)
- self.zero_point = self.intervals.index_above(ord('0'))
+ self.category_intervals = [IntervalSet(i) for i in intervals]
+
+ all_intervals = sorted(sum(intervals, ()), reverse=True)
+ self.all_intervals = IntervalSet(
+ charmap._union_intervals(all_intervals[:1], all_intervals[1:])
+ )
+ self.zero_points = {iset: iset.index_above(ord('0'))
+ for iset in self.category_intervals}
+ self.zero_points[None] = self.all_intervals.index_above(ord('0'))
 
  def do_draw(self, data):
+ category = choice(data, self.category_intervals)
+ # Almost always - when generating characters - we use the category
+ # chosen above and draw a codepoint from that category. Then, we
+ # write the integer that would draw this from *all* categories.
+ if biased_coin(data, p=0.999):
+ char_idx = category[integer_range(
+ data, 0, len(category) - 1, center=self.zero_points[category]
+ )]
+ data.write(int_to_bytes(self.all_intervals.index(char_idx), 4))
+ return hunichr(char_idx)
+ # We 'shrink open', allowing any category to be generated when we are
+ # shrinking characters. This distinction means we can generate
+ # by categories, but shrink by codepoints.
  i = integer_range(
- data, 0, len(self.intervals) - 1,
- center=self.zero_point,
+ data, 0, len(self.all_intervals) - 1,
+ center=self.zero_points[None],
  )
- return hunichr(self.intervals[i])
+ return hunichr(self.all_intervals[i])
 
 
 class StringStrategy(MappedSearchStrategy):

diff --git a/hypothesis-python/tests/cover/test_charmap.py b/hypothesis-python/tests/cover/test_charmap.py
@@ -21,6 +21,7 @@
 import sys
 import tempfile
 import unicodedata
+from itertools import chain
 
 import hypothesis.strategies as st
 import hypothesis.internal.charmap as cm
@@ -58,14 +59,15 @@ def assert_valid_range_list(ls):
  st.sets(st.sampled_from(cm.categories())) | st.none(),
 )
 def test_query_matches_categories(exclude, include):
- values = cm.query(exclude, include)
- assert_valid_range_list(values)
- for u, v in values:
- for i in (u, v, (u + v) // 2):
- cat = unicodedata.category(hunichr(i))
- if include is not None:
- assert cat in include
- assert cat not in exclude
+ cats = cm.query(exclude, include)
+ for values in cats:
+ assert_valid_range_list(values)
+ for u, v in values:
+ for i in (u, v, (u + v) // 2):
+ cat = unicodedata.category(hunichr(i))
+ if include is not None:
+ assert cat in include
+ assert cat not in exclude
 
 
 @given(
@@ -75,19 +77,20 @@ def test_query_matches_categories(exclude, include):
 )
 def test_query_matches_categories_codepoints(exclude, include, m1, m2):
  m1, m2 = sorted((m1, m2))
- values = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
- assert_valid_range_list(values)
- for u, v in values:
- assert m1 <= u
- assert v <= m2
+ cats = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
+ for values in cats:
+ assert_valid_range_list(values)
+ for u, v in values:
+ assert m1 <= u
+ assert v <= m2
 
 
 @given(st.sampled_from(cm.categories()), st.integers(0, sys.maxunicode))
 def test_exclude_only_excludes_from_that_category(cat, i):
  c = hunichr(i)
  assume(unicodedata.category(c) != cat)
  intervals = cm.query(exclude_categories=(cat,))
- assert any(a <= i <= b for a, b in intervals)
+ assert any(a <= i <= b for a, b in chain.from_iterable(intervals))
 
 
 def test_reload_charmap():

diff --git a/hypothesis-python/tests/cover/test_simple_strings.py b/hypothesis-python/tests/cover/test_simple_strings.py
@@ -17,13 +17,14 @@
 
 from __future__ import division, print_function, absolute_import
 
+import unicodedata
 from random import Random
 
 import pytest
 
 from hypothesis import given
 from tests.common.debug import minimal
-from tests.common.utils import checks_deprecated_behaviour
+from tests.common.utils import fails, checks_deprecated_behaviour
 from hypothesis.strategies import text, binary, tuples, characters
 
 
@@ -141,3 +142,15 @@ def test_explicit_alphabet_None_is_deprecated():
 @checks_deprecated_behaviour
 def test_alphabet_non_string():
  text([1, 2, 3]).example()
+
+
+@fails
+@given(text(min_size=2))
+def test_can_find_non_NFC_normalised_strings_issue_341(s):
+ assert s == unicodedata.normalize('NFC', s)
+
+
+@fails
+@given(text(min_size=1))
+def test_can_find_non_NFD_normalised_strings_issue_341(s):
+ assert s == unicodedata.normalize('NFD', s)