Skip to content

Commit

Permalink
Increase variety in characters() generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Zac-HD committed Oct 23, 2018
1 parent 74839e6 commit 0ed071f
Show file tree
Hide file tree
Showing 6 changed files with 124 additions and 59 deletions.
14 changes: 14 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
RELEASE_TYPE: minor

This release substantially increases the variety of examples from the
:func:`~hypothesis.strategies.characters` strategy.

Unicode characters used to be selected by code point alone, which made
generation of some rare character types highly unlikely (:issue:`1401`).
Character generation now first selects a Unicode category - preferring letters
to numbers to whitespace, and so on - and then a code point within that category.

If :func:`~hypothesis.strategies.text` is passed an alphabet which is not
already a strategy, it will follow the same generation and shrinking order
if possible. Passing ``alphabet=None`` is now deprecated; the default value
of the argument is now the default strategy itself.
24 changes: 16 additions & 8 deletions hypothesis-python/docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,24 @@ returns a new strategy for it. So for example:
.. doctest::

>>> from string import printable; from pprint import pprint
>>> json = recursive(none() | booleans() | floats() | text(printable),
... lambda children: lists(children, 1) | dictionaries(text(printable), children, min_size=1))
>>> nice_text = text(printable, max_size=3)
>>> json = recursive(none() | booleans() | floats() | nice_text,
... lambda children: lists(children, 1) | dictionaries(nice_text, children, min_size=1))
>>> pprint(json.example())
[[1.175494351e-38, ']', 1.9, True, False, '.M}Xl', ''], True]
{'': None,
'##': None,
'V': [1.175494351e-38, 'R', '|jk', True, ''],
'l': None,
'p#': '5\x0b'}
>>> pprint(json.example())
{'de(l': None,
'nK': {'(Rt)': None,
'+hoZh1YU]gy8': True,
'8z]EIFA06^l`i^': 'LFE{Q',
'9,': 'l{cA=/'}}
{'+': [None,
-1.6075146256587385e+223,
False,
...,
1.872980906096507e+16,
'#vX',
0.5]}


That is, we start with our leaf data and then we augment it by allowing lists and dictionaries of anything we can generate as JSON data.

Expand Down
64 changes: 35 additions & 29 deletions hypothesis-python/src/hypothesis/internal/charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
if False:
from typing import Dict, Tuple
intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals],
Tuple[intervals]]


def charmap_file():
Expand Down Expand Up @@ -110,14 +111,17 @@ def categories():
"""
global _categories
if _categories is None:
cm = charmap()
_categories = sorted(
cm.keys(), key=lambda c: len(cm[c])
)
_categories.remove('Cc') # Other, Control
_categories.remove('Cs') # Other, Surrogate
_categories.append('Cc')
_categories.append('Cs')
# Written out manually to define our shrinking order, smallest first.
_categories = (
'Ll Lu Lt Lm Lo ' # Letter: lower/upper/titlecase, modifier, other
'Nd Nl No ' # Number: decimal digit, letter, other
'Zs Zl Zp ' # Separator: space, line, paragraph
'Pc Pd Ps Pe Pi Pf Po ' # Various punctuation
'Sc Sm Sk So ' # Symbols: currency, math, modifier, other
'Mn Mc Me ' # Marks: nonspacing, spacing combining, enclosing
'Cf Co Cn Cc Cs' # Other: format, private, --, control, surrogate
).split()
assert set(charmap().keys()) == set(_categories)
return tuple(_categories)


Expand Down Expand Up @@ -314,12 +318,9 @@ def _query_for_key(key):
except KeyError:
pass
assert key
if set(key) == set(categories()):
result = ((0, sys.maxunicode),)
else:
result = _union_intervals(
_query_for_key(key[:-1]), charmap()[key[-1]]
)
result = _union_intervals(
_query_for_key(key[:-1]), charmap()[key[-1]]
)
category_index_cache[key] = result
return result

Expand Down Expand Up @@ -354,25 +355,30 @@ def query(
if max_codepoint is None:
max_codepoint = sys.maxunicode
catkey = _category_key(exclude_categories, include_categories)
character_intervals = _intervals(include_characters or '')
exclude_intervals = _intervals(exclude_characters or '')
qkey = (
catkey, min_codepoint, max_codepoint,
character_intervals, exclude_intervals
_intervals(include_characters or ''), exclude_intervals
)
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
base = _query_for_key(catkey)
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((
max(u, min_codepoint), min(v, max_codepoint)
))
result = tuple(result)
result = _union_intervals(result, character_intervals)
result = _subtract_intervals(result, exclude_intervals)
limited_category_index_cache[qkey] = result
return result
out = []
for cat in categories():
if cat in catkey:
base = _query_for_key((cat,))
else:
base = ()
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
include_chars = _intervals([c for c in (include_characters or '')
if unicodedata.category(c) == cat])
result = _union_intervals(tuple(result), include_chars)
result = _subtract_intervals(result, exclude_intervals)
if result:
out.append(result)
limited_category_index_cache[qkey] = tuple(out)
return tuple(out)
35 changes: 28 additions & 7 deletions hypothesis-python/src/hypothesis/searchstrategy/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,10 @@

from hypothesis.errors import InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.compat import hunichr, binary_type
from hypothesis.internal.compat import hunichr, binary_type, int_to_bytes
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.internal.conjecture.utils import integer_range
from hypothesis.internal.conjecture.utils import choice, biased_coin, \
integer_range
from hypothesis.searchstrategy.strategies import SearchStrategy, \
MappedSearchStrategy

Expand Down Expand Up @@ -60,15 +61,35 @@ def __init__(self,
'combination of arguments: ' + ', '.join(
'%s=%r' % arg for arg in arguments if arg[1] is not None)
)
self.intervals = IntervalSet(intervals)
self.zero_point = self.intervals.index_above(ord('0'))
self.category_intervals = [IntervalSet(i) for i in intervals]

all_intervals = sorted(sum(intervals, ()), reverse=True)
self.all_intervals = IntervalSet(
charmap._union_intervals(all_intervals[:1], all_intervals[1:])
)
self.zero_points = {iset: iset.index_above(ord('0'))
for iset in self.category_intervals}
self.zero_points[None] = self.all_intervals.index_above(ord('0'))

def do_draw(self, data):
    """Draw a single character from this strategy.

    Generation and shrinking intentionally use different schemes: new
    examples are drawn category-first so that rare Unicode categories
    are represented, while shrinking works on a code point index into
    the union of all permitted intervals.

    :param data: the conjecture data object that choices are drawn from.
    :return: a one-character string.
    """
    category = choice(data, self.category_intervals)
    # Almost always - when generating characters - we use the category
    # chosen above and draw a codepoint from that category.  Then, we
    # write the integer that would draw this from *all* categories.
    # The probability therefore has to be close to one (assuming
    # biased_coin(data, p) is true with probability p); a value close to
    # zero would make the category-based draw the rare case and defeat
    # the purpose of choosing a category first.
    if biased_coin(data, p=0.999):
        codepoint = category[integer_range(
            data, 0, len(category) - 1,
            center=self.zero_points[category],
        )]
        data.write(int_to_bytes(self.all_intervals.index(codepoint), 4))
        return hunichr(codepoint)
    # We 'shrink open', allowing any category to be generated when we are
    # shrinking characters.  This distinction means we can generate
    # by categories, but shrink by codepoints.
    i = integer_range(
        data, 0, len(self.all_intervals) - 1,
        center=self.zero_points[None],
    )
    return hunichr(self.all_intervals[i])


class StringStrategy(MappedSearchStrategy):
Expand Down
31 changes: 17 additions & 14 deletions hypothesis-python/tests/cover/test_charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import sys
import tempfile
import unicodedata
from itertools import chain

import hypothesis.strategies as st
import hypothesis.internal.charmap as cm
Expand Down Expand Up @@ -58,14 +59,15 @@ def assert_valid_range_list(ls):
st.sets(st.sampled_from(cm.categories())) | st.none(),
)
def test_query_matches_categories(exclude, include):
values = cm.query(exclude, include)
assert_valid_range_list(values)
for u, v in values:
for i in (u, v, (u + v) // 2):
cat = unicodedata.category(hunichr(i))
if include is not None:
assert cat in include
assert cat not in exclude
cats = cm.query(exclude, include)
for values in cats:
assert_valid_range_list(values)
for u, v in values:
for i in (u, v, (u + v) // 2):
cat = unicodedata.category(hunichr(i))
if include is not None:
assert cat in include
assert cat not in exclude


@given(
Expand All @@ -75,19 +77,20 @@ def test_query_matches_categories(exclude, include):
)
def test_query_matches_categories_codepoints(exclude, include, m1, m2):
m1, m2 = sorted((m1, m2))
values = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
assert_valid_range_list(values)
for u, v in values:
assert m1 <= u
assert v <= m2
cats = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
for values in cats:
assert_valid_range_list(values)
for u, v in values:
assert m1 <= u
assert v <= m2


@given(st.sampled_from(cm.categories()), st.integers(0, sys.maxunicode))
def test_exclude_only_excludes_from_that_category(cat, i):
    """Excluding one category must not drop characters of any other."""
    c = hunichr(i)
    # Only code points outside the excluded category are interesting here.
    assume(unicodedata.category(c) != cat)
    intervals = cm.query(exclude_categories=(cat,))
    # query() returns one tuple of (start, end) intervals per category, so
    # the per-category groups are flattened before searching for an
    # interval that contains the code point.
    assert any(a <= i <= b for a, b in chain.from_iterable(intervals))


def test_reload_charmap():
Expand Down
15 changes: 14 additions & 1 deletion hypothesis-python/tests/cover/test_simple_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@

from __future__ import division, print_function, absolute_import

import unicodedata
from random import Random

import pytest

from hypothesis import given
from tests.common.debug import minimal
from tests.common.utils import checks_deprecated_behaviour
from tests.common.utils import fails, checks_deprecated_behaviour
from hypothesis.strategies import text, binary, tuples, characters


Expand Down Expand Up @@ -141,3 +142,15 @@ def test_explicit_alphabet_None_is_deprecated():
@checks_deprecated_behaviour
def test_alphabet_non_string():
text([1, 2, 3]).example()


@fails
@given(text(min_size=2))
def test_can_find_non_NFC_normalised_strings_issue_341(s):
    """Assert that every generated string is already NFC-normalised.

    The test is wrapped in ``@fails``, so presumably it only succeeds when
    a string that differs from its NFC form is found - confirm against
    ``tests.common.utils.fails``.
    """
    nfc_form = unicodedata.normalize('NFC', s)
    assert s == nfc_form


@fails
@given(text(min_size=1))
def test_can_find_non_NFD_normalised_strings_issue_341(s):
    """Assert that every generated string is already NFD-normalised.

    The test is wrapped in ``@fails``, so presumably it only succeeds when
    a string that differs from its NFD form is found - confirm against
    ``tests.common.utils.fails``.
    """
    nfd_form = unicodedata.normalize('NFD', s)
    assert s == nfd_form

0 comments on commit 0ed071f

Please sign in to comment.