Skip to content

Commit

Permalink
Increase variety in characters() generation
Browse files Browse the repository at this point in the history
  • Loading branch information
Zac-HD committed Oct 25, 2018
1 parent cde4f35 commit 38284e6
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 54 deletions.
14 changes: 14 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
RELEASE_TYPE: minor

This release substantially increases the variety of examples from the
:func:`~hypothesis.strategies.characters` strategy.

Unicode characters used to be selected by codepoint alone, which made
generation of some rare character types highly unlikely (:issue:`1401`).
Character generation now selects a Unicode category - preferring letters
to numbers to whitespace, and so on - then a codepoint within that category.

If :func:`~hypothesis.strategies.text` is passed an alphabet which is not
already a strategy, it will follow the same generation and shrinking order
if possible. ``alphabet=None`` is now deprecated, and the default strategy
is also the default value for the argument.
28 changes: 20 additions & 8 deletions hypothesis-python/docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -177,16 +177,28 @@ returns a new strategy for it. So for example:
.. doctest::

>>> from string import printable; from pprint import pprint
>>> json = recursive(none() | booleans() | floats() | text(printable),
... lambda children: lists(children, 1) | dictionaries(text(printable), children, min_size=1))
>>> nice_text = text(printable, max_size=3)
>>> json = recursive(none() | booleans() | floats() | nice_text,
... lambda children: lists(children, 1) | dictionaries(nice_text, children, min_size=1))
>>> pprint(json.example())
[[1.175494351e-38, ']', 1.9, True, False, '.M}Xl', ''], True]
{'^-$': {'': -4.4078854352852856e+16,
'3': None,
'_': False,
'`': '$',
'{': None}}
>>> pprint(json.example())
{'de(l': None,
'nK': {'(Rt)': None,
'+hoZh1YU]gy8': True,
'8z]EIFA06^l`i^': 'LFE{Q',
'9,': 'l{cA=/'}}
{'': {'': -2.00001},
' )\n': '',
'-': ['', True, None, True, nan, ''],
'-\x0c': {'': None, '`': None, 'h': True, '}O]': '^_['},
'9': {'': '',
' \\': -1.192092896e-07,
'+$!': True,
'-': False,
'<}': -1.1,
'_': None}}



That is, we start with our leaf data and then we augment it by allowing lists and dictionaries of anything we can generate as JSON data.

Expand Down
45 changes: 24 additions & 21 deletions hypothesis-python/src/hypothesis/internal/charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@
if False:
from typing import Dict, Tuple
intervals = Tuple[Tuple[int, int], ...]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals], intervals]
cache_type = Dict[Tuple[Tuple[str, ...], int, int, intervals],
Tuple[intervals]]


def charmap_file():
Expand Down Expand Up @@ -314,12 +315,9 @@ def _query_for_key(key):
except KeyError:
pass
assert key
if set(key) == set(categories()):
result = ((0, sys.maxunicode),)
else:
result = _union_intervals(
_query_for_key(key[:-1]), charmap()[key[-1]]
)
result = _union_intervals(
_query_for_key(key[:-1]), charmap()[key[-1]]
)
category_index_cache[key] = result
return result

Expand Down Expand Up @@ -354,25 +352,30 @@ def query(
if max_codepoint is None:
max_codepoint = sys.maxunicode
catkey = _category_key(exclude_categories, include_categories)
character_intervals = _intervals(include_characters or '')
exclude_intervals = _intervals(exclude_characters or '')
qkey = (
catkey, min_codepoint, max_codepoint,
character_intervals, exclude_intervals
_intervals(include_characters or ''), exclude_intervals
)
try:
return limited_category_index_cache[qkey]
except KeyError:
pass
base = _query_for_key(catkey)
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((
max(u, min_codepoint), min(v, max_codepoint)
))
result = tuple(result)
result = _union_intervals(result, character_intervals)
result = _subtract_intervals(result, exclude_intervals)
limited_category_index_cache[qkey] = result
return result
out = []
for cat in categories():
if cat in catkey:
base = _query_for_key((cat,))
else:
base = ()
result = []
for u, v in base:
if v >= min_codepoint and u <= max_codepoint:
result.append((max(u, min_codepoint), min(v, max_codepoint)))
include_chars = _intervals([c for c in (include_characters or '')
if unicodedata.category(c) == cat])
result = _union_intervals(tuple(result), include_chars)
result = _subtract_intervals(result, exclude_intervals)
if result:
out.append(result)
limited_category_index_cache[qkey] = tuple(out)
return tuple(out)
45 changes: 36 additions & 9 deletions hypothesis-python/src/hypothesis/searchstrategy/strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@

from hypothesis.errors import InvalidArgument
from hypothesis.internal import charmap
from hypothesis.internal.compat import hunichr, binary_type
from hypothesis.internal.compat import hunichr, binary_type, int_to_bytes
from hypothesis.internal.intervalsets import IntervalSet
from hypothesis.internal.conjecture.utils import integer_range
from hypothesis.internal.conjecture.utils import choice, integer_range, \
calc_label_from_name
from hypothesis.searchstrategy.strategies import SearchStrategy, \
MappedSearchStrategy

ONE_UNICODE_CHAR_LABEL = calc_label_from_name('one unicode character')


class OneCharStringStrategy(SearchStrategy):
"""A strategy which generates single character strings of text type."""
Expand Down Expand Up @@ -60,15 +63,39 @@ def __init__(self,
'combination of arguments: ' + ', '.join(
'%s=%r' % arg for arg in arguments if arg[1] is not None)
)
self.intervals = IntervalSet(intervals)
self.zero_point = self.intervals.index_above(ord('0'))
self.category_intervals = [IntervalSet(i) for i in intervals]

def do_draw(self, data):
i = integer_range(
data, 0, len(self.intervals) - 1,
center=self.zero_point,
all_intervals = sorted(sum(intervals, ()), reverse=True)
self.all_intervals = IntervalSet(
charmap._union_intervals(all_intervals[:1], all_intervals[1:])
)
return hunichr(self.intervals[i])

def get_char_idx(self, data, category):
    # Draw an index into *category* (an IntervalSet of codepoints) and
    # return the codepoint at that index. The draw is centred on the
    # index just above ord('0'), so shrinking biases characters toward
    # the digit zero rather than toward codepoint 0.
    return category[integer_range(
        data, 0, len(category) - 1,
        center=category.index_above(ord('0'))
    )]

def do_draw(self, data):
    """Draw one character and return it as a length-one text string.

    NOTE(review): relies on ``self.category_intervals`` (one IntervalSet
    per Unicode category) and ``self.all_intervals`` (the union of all of
    them), both set up in ``__init__`` - confirm against that method.
    """
    # Drawing a unicode character uses the "shrink open" trick - we start
    # by drawing a boolean (from eight bits), a category, and a character
    # within that category. Note that this is grouped in an example, so
    # that it can be easily zeroed by the shrinker later.
    data.start_example(ONE_UNICODE_CHAR_LABEL)
    use_category = data.draw_bits(8) != 0
    category = choice(data, self.category_intervals)
    char_idx = self.get_char_idx(data, category)
    data.stop_example()
    # After generating by index-within-category, we want to shrink by
    # codepoint instead. We therefore write the bytes that would generate
    # this char_idx by codepoint. When the shrinker zeros out the first
    # example the value of char_idx will be determined by the written
    # bytes, and after that shrinking depends only on the codepoint.
    if use_category:
        data.write(int_to_bytes(self.all_intervals.index(char_idx), 4))
    else:
        char_idx = self.get_char_idx(data, self.all_intervals)
    return hunichr(char_idx)


class StringStrategy(MappedSearchStrategy):
Expand Down
31 changes: 17 additions & 14 deletions hypothesis-python/tests/cover/test_charmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import sys
import tempfile
import unicodedata
from itertools import chain

import hypothesis.strategies as st
import hypothesis.internal.charmap as cm
Expand Down Expand Up @@ -58,14 +59,15 @@ def assert_valid_range_list(ls):
st.sets(st.sampled_from(cm.categories())) | st.none(),
)
def test_query_matches_categories(exclude, include):
values = cm.query(exclude, include)
assert_valid_range_list(values)
for u, v in values:
for i in (u, v, (u + v) // 2):
cat = unicodedata.category(hunichr(i))
if include is not None:
assert cat in include
assert cat not in exclude
cats = cm.query(exclude, include)
for values in cats:
assert_valid_range_list(values)
for u, v in values:
for i in (u, v, (u + v) // 2):
cat = unicodedata.category(hunichr(i))
if include is not None:
assert cat in include
assert cat not in exclude


@given(
Expand All @@ -75,19 +77,20 @@ def test_query_matches_categories(exclude, include):
)
def test_query_matches_categories_codepoints(exclude, include, m1, m2):
m1, m2 = sorted((m1, m2))
values = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
assert_valid_range_list(values)
for u, v in values:
assert m1 <= u
assert v <= m2
cats = cm.query(exclude, include, min_codepoint=m1, max_codepoint=m2)
for values in cats:
assert_valid_range_list(values)
for u, v in values:
assert m1 <= u
assert v <= m2


@given(st.sampled_from(cm.categories()), st.integers(0, sys.maxunicode))
def test_exclude_only_excludes_from_that_category(cat, i):
c = hunichr(i)
assume(unicodedata.category(c) != cat)
intervals = cm.query(exclude_categories=(cat,))
assert any(a <= i <= b for a, b in intervals)
assert any(a <= i <= b for a, b in chain.from_iterable(intervals))


def test_reload_charmap():
Expand Down
15 changes: 14 additions & 1 deletion hypothesis-python/tests/cover/test_simple_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@

from __future__ import division, print_function, absolute_import

import unicodedata
from random import Random

import pytest

from hypothesis import given
from tests.common.debug import minimal
from tests.common.utils import checks_deprecated_behaviour
from tests.common.utils import fails, checks_deprecated_behaviour
from hypothesis.strategies import text, binary, tuples, characters


Expand Down Expand Up @@ -141,3 +142,15 @@ def test_explicit_alphabet_None_is_deprecated():
@checks_deprecated_behaviour
def test_alphabet_non_string():
text([1, 2, 3]).example()


@fails
@given(text(min_size=2))
def test_can_find_non_NFC_normalised_strings_issue_341(s):
    # The @fails decorator inverts the check: this test passes only if
    # Hypothesis can generate a string that is NOT NFC-normalised, i.e.
    # a counterexample to the assertion below (regression for issue 341).
    assert s == unicodedata.normalize('NFC', s)


@fails
@given(text(min_size=1))
def test_can_find_non_NFD_normalised_strings_issue_341(s):
    # As above, but for NFD: @fails requires that Hypothesis finds some
    # string which is not NFD-normalised, falsifying this assertion.
    assert s == unicodedata.normalize('NFD', s)
3 changes: 2 additions & 1 deletion whole-repo-tests/test_doctests.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def test_doctests():
env['PYTHONPATH'] = 'src'

pip_tool(
'sphinx-build', '-W', '-b', 'doctest', '-d', 'docs/_build/doctrees',
'sphinx-build', '-n', '-W', '--keep-going', '-T',
'-b', 'doctest', '-d', 'docs/_build/doctrees',
'docs', 'docs/_build/html', env=env, cwd=BASE_DIR,
)

0 comments on commit 38284e6

Please sign in to comment.