Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bpo-39689: _struct: Avoid undefined behavior when loading native _Bool #18925

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 49 additions & 47 deletions Lib/test/test_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2754,54 +2754,56 @@ def test_memoryview_cast_1D_ND(self):
# be 1D, at least one format must be 'c', 'b' or 'B'.
for _tshape in gencastshapes():
for char in fmtdict['@']:
tfmt = ('', '@')[randrange(2)] + char
tsize = struct.calcsize(tfmt)
n = prod(_tshape) * tsize
obj = 'memoryview' if is_byte_format(tfmt) else 'bytefmt'
for fmt, items, _ in iter_format(n, obj):
size = struct.calcsize(fmt)
shape = [n] if n > 0 else []
tshape = _tshape + [size]

ex = ndarray(items, shape=shape, format=fmt)
m = memoryview(ex)

titems, tshape = cast_items(ex, tfmt, tsize, shape=tshape)

if titems is None:
self.assertRaises(TypeError, m.cast, tfmt, tshape)
continue
if titems == 'nan':
continue # NaNs in lists are a recipe for trouble.

# 1D -> ND
nd = ndarray(titems, shape=tshape, format=tfmt)

m2 = m.cast(tfmt, shape=tshape)
ndim = len(tshape)
strides = nd.strides
lst = nd.tolist()
self.verify(m2, obj=ex,
itemsize=tsize, fmt=tfmt, readonly=True,
ndim=ndim, shape=tshape, strides=strides,
lst=lst, cast=True)

# ND -> 1D
m3 = m2.cast(fmt)
m4 = m2.cast(fmt, shape=shape)
ndim = len(shape)
strides = ex.strides
lst = ex.tolist()

self.verify(m3, obj=ex,
itemsize=size, fmt=fmt, readonly=True,
ndim=ndim, shape=shape, strides=strides,
lst=lst, cast=True)
with self.subTest(_tshape=_tshape, char=char):
tfmt = ('', '@')[randrange(2)] + char
tsize = struct.calcsize(tfmt)
n = prod(_tshape) * tsize
obj = 'memoryview' if is_byte_format(tfmt) else 'bytefmt'
for fmt, items, _ in iter_format(n, obj):
size = struct.calcsize(fmt)
shape = [n] if n > 0 else []
tshape = _tshape + [size]

self.verify(m4, obj=ex,
itemsize=size, fmt=fmt, readonly=True,
ndim=ndim, shape=shape, strides=strides,
lst=lst, cast=True)
ex = ndarray(items, shape=shape, format=fmt)
m = memoryview(ex)

titems, tshape = cast_items(ex, tfmt, tsize,
shape=tshape)

if titems is None:
self.assertRaises(TypeError, m.cast, tfmt, tshape)
continue
if titems == 'nan':
continue # NaNs in lists are a recipe for trouble.

# 1D -> ND
nd = ndarray(titems, shape=tshape, format=tfmt)

m2 = m.cast(tfmt, shape=tshape)
ndim = len(tshape)
strides = nd.strides
lst = nd.tolist()
self.verify(m2, obj=ex,
itemsize=tsize, fmt=tfmt, readonly=True,
ndim=ndim, shape=tshape, strides=strides,
lst=lst, cast=True)

# ND -> 1D
m3 = m2.cast(fmt)
m4 = m2.cast(fmt, shape=shape)
ndim = len(shape)
strides = ex.strides
lst = ex.tolist()

self.verify(m3, obj=ex,
itemsize=size, fmt=fmt, readonly=True,
ndim=ndim, shape=shape, strides=strides,
lst=lst, cast=True)

self.verify(m4, obj=ex,
itemsize=size, fmt=fmt, readonly=True,
ndim=ndim, shape=shape, strides=strides,
lst=lst, cast=True)

if ctypes:
# format: "T{>l:x:>d:y:}"
Expand Down
14 changes: 12 additions & 2 deletions Lib/test/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,8 +516,18 @@ def __bool__(self):
self.fail("Expected OSError: struct.pack(%r, "
"ExplodingBool())" % (prefix + '?'))

for c in [b'\x01', b'\x7f', b'\xff', b'\x0f', b'\xf0']:
self.assertTrue(struct.unpack('>?', c)[0])
# To avoid undefined behavior in the C code, we assume that in
# every mode, the size is 1, '\x00' is false, and `\x01` is true.
# If there is a platform where this is not the case, the unpack
# code and the tests below will need adjusting. See bpo-39689.

self.assertEqual(packedFalse, b'\0' * len(false))
self.assertEqual(packedTrue, b'\x01' * len(true))

self.assertFalse(struct.unpack(prefix + '?', b'\0')[0])

for c in [b'\x01', b'\x7f', b'\xff', b'\x0f', b'\xf0']:
self.assertTrue(struct.unpack(prefix + '?', c)[0])

def test_count_overflow(self):
hugecount = '{}b'.format(sys.maxsize+1)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The struct module and memoryview now load native booleans as ``char`` rather
than ``_Bool`` to avoid triggering undefined behavior. For valid native _Bool,
the behavior is the same on all supported (tested) platforms.
15 changes: 12 additions & 3 deletions Modules/_struct.c
Original file line number Diff line number Diff line change
Expand Up @@ -482,9 +482,18 @@ nu_ulonglong(const char *p, const formatdef *f)
static PyObject *
nu_bool(const char *p, const formatdef *f)
{
_Bool x;
memcpy((char *)&x, p, sizeof x);
return PyBool_FromLong(x != 0);
/* The usual thing to do here is to memcpy *p to a _Bool variable.
* However, that would expose C undefined behavior to Python code:
* any bit-patterns except 0 and 1 would trigger UB.
* So we instead cast *p from char to _Bool, as in bu_bool.
*
* We assume, and assert, that sizeof(_Bool) is 1.
* We also assume the bit-pattern for (_Bool)0 is the same as for (char)0;
* this is covered by tests.
* See bpo-39689.
*/
assert(sizeof(_Bool) == sizeof(char));
return PyBool_FromLong((_Bool)*p != 0);
}


Expand Down
4 changes: 3 additions & 1 deletion Objects/memoryobject.c
Original file line number Diff line number Diff line change
Expand Up @@ -1699,7 +1699,9 @@ unpack_single(const char *ptr, const char *fmt)
case 'l': UNPACK_SINGLE(ld, ptr, long); goto convert_ld;

/* boolean */
case '?': UNPACK_SINGLE(ld, ptr, _Bool); goto convert_bool;
// memcpy-ing values other than 0 or 1 to a _Bool variable triggers
// undefined behavior, so cast from char instead. See bpo-39689.
case '?': ld = (_Bool)*ptr; goto convert_bool;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory this can trigger an unaligned access if sizeof(_Bool) > 1. Unaligned accesses that result in bus errors are the main reason for the memcpy().

Also I wonder if UB checkers can't get smart enough to figure out that e.g. (_Bool)*ptr == 42 and then complain.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory this can trigger an unaligned access if sizeof(_Bool) > 1. Unaligned accesses that result in bus errors are the main reason for the memcpy().

Ah! You are right, thank you. This PR won't work as is. I won't have time to work on this soon, so I'll close it in the mean time.

Also I wonder if UB checkers can't get smart enough to figure out that e.g. (_Bool)*ptr == 42 and then complain.

I don't have access to the C standard, but from what I've read, casting to _Bool will convert non-zero values to 1. So this should be fine.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this should be fine.

Yes, I misread the construct as *(_Bool *)ptr), I now see that you are deliberately dereferencing the char pointer.

Hmm, the rank of _Bool is the smallest, but sizeof(char) can still be larger than sizeof(_Bool). So I wouldn't want this construct either.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sizeof(char) is guaranteed to be 1. It can't be larger than sizeof(_Bool).
Alignment may be a different matter (or may not; I don't know).


/* unsigned integers */
case 'H': UNPACK_SINGLE(lu, ptr, unsigned short); goto convert_lu;
Expand Down