Skip to content

Commit f5f047a

Browse files
authored
gh-96735: Fix undefined behaviour in struct unpacking functions (#96739)
This PR fixes undefined behaviour in the struct module unpacking support functions `bu_longlong`, `lu_longlong`, `bu_int` and `lu_int`; thanks to @kumaraditya303 for finding these. The fix is to accumulate the bytes in an unsigned integer type instead of a signed integer type, then to convert to the appropriate signed type. In cases where the width matches, that conversion will typically be compiled away to a no-op. (Evidence from Godbolt: https://godbolt.org/z/5zvxodj64 .) To make the conversions efficient, I've specialised the relevant functions for their output size: for `bu_longlong` and `lu_longlong`, this only entails checking that the output size is indeed `8`. But `bu_int` and `lu_int` were used for format sizes `2` and `4` - I've split those into two separate functions each. No tests, because all of the affected cases are already exercised by the test suite.
1 parent 817fa28 commit f5f047a

File tree

2 files changed

+71
-26
lines changed

2 files changed

+71
-26
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Fix undefined behaviour in :func:`struct.unpack`.

Modules/_struct.c

+70-26
Original file line numberDiff line numberDiff line change
@@ -805,19 +805,38 @@ static const formatdef native_table[] = {
805805

806806
/* Big-endian routines. *****************************************************/
807807

808+
static PyObject *
809+
bu_short(_structmodulestate *state, const char *p, const formatdef *f)
810+
{
811+
unsigned long x = 0;
812+
813+
/* This function is only ever used in the case f->size == 2. */
814+
assert(f->size == 2);
815+
Py_ssize_t i = 2;
816+
const unsigned char *bytes = (const unsigned char *)p;
817+
do {
818+
x = (x<<8) | *bytes++;
819+
} while (--i > 0);
820+
/* Extend sign, avoiding implementation-defined or undefined behaviour. */
821+
x = (x ^ 0x8000U) - 0x8000U;
822+
return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x);
823+
}
824+
808825
static PyObject *
809826
bu_int(_structmodulestate *state, const char *p, const formatdef *f)
810827
{
811-
long x = 0;
812-
Py_ssize_t i = f->size;
828+
unsigned long x = 0;
829+
830+
/* This function is only ever used in the case f->size == 4. */
831+
assert(f->size == 4);
832+
Py_ssize_t i = 4;
813833
const unsigned char *bytes = (const unsigned char *)p;
814834
do {
815835
x = (x<<8) | *bytes++;
816836
} while (--i > 0);
817-
/* Extend the sign bit. */
818-
if (SIZEOF_LONG > f->size)
819-
x |= -(x & (1L << ((8 * f->size) - 1)));
820-
return PyLong_FromLong(x);
837+
/* Extend sign, avoiding implementation-defined or undefined behaviour. */
838+
x = (x ^ 0x80000000U) - 0x80000000U;
839+
return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x);
821840
}
822841

823842
static PyObject *
@@ -835,16 +854,19 @@ bu_uint(_structmodulestate *state, const char *p, const formatdef *f)
835854
static PyObject *
836855
bu_longlong(_structmodulestate *state, const char *p, const formatdef *f)
837856
{
838-
long long x = 0;
839-
Py_ssize_t i = f->size;
857+
unsigned long long x = 0;
858+
859+
/* This function is only ever used in the case f->size == 8. */
860+
assert(f->size == 8);
861+
Py_ssize_t i = 8;
840862
const unsigned char *bytes = (const unsigned char *)p;
841863
do {
842864
x = (x<<8) | *bytes++;
843865
} while (--i > 0);
844-
/* Extend the sign bit. */
845-
if (SIZEOF_LONG_LONG > f->size)
846-
x |= -(x & ((long long)1 << ((8 * f->size) - 1)));
847-
return PyLong_FromLongLong(x);
866+
/* Extend sign, avoiding implementation-defined or undefined behaviour. */
867+
x = (x ^ 0x8000000000000000U) - 0x8000000000000000U;
868+
return PyLong_FromLongLong(
869+
x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x);
848870
}
849871

850872
static PyObject *
@@ -1009,7 +1031,7 @@ static formatdef bigendian_table[] = {
10091031
{'c', 1, 0, nu_char, np_char},
10101032
{'s', 1, 0, NULL},
10111033
{'p', 1, 0, NULL},
1012-
{'h', 2, 0, bu_int, bp_int},
1034+
{'h', 2, 0, bu_short, bp_int},
10131035
{'H', 2, 0, bu_uint, bp_uint},
10141036
{'i', 4, 0, bu_int, bp_int},
10151037
{'I', 4, 0, bu_uint, bp_uint},
@@ -1026,19 +1048,38 @@ static formatdef bigendian_table[] = {
10261048

10271049
/* Little-endian routines. *****************************************************/
10281050

1051+
static PyObject *
1052+
lu_short(_structmodulestate *state, const char *p, const formatdef *f)
1053+
{
1054+
unsigned long x = 0;
1055+
1056+
/* This function is only ever used in the case f->size == 2. */
1057+
assert(f->size == 2);
1058+
Py_ssize_t i = 2;
1059+
const unsigned char *bytes = (const unsigned char *)p;
1060+
do {
1061+
x = (x<<8) | bytes[--i];
1062+
} while (i > 0);
1063+
/* Extend sign, avoiding implementation-defined or undefined behaviour. */
1064+
x = (x ^ 0x8000U) - 0x8000U;
1065+
return PyLong_FromLong(x & 0x8000U ? -1 - (long)(~x) : (long)x);
1066+
}
1067+
10291068
static PyObject *
10301069
lu_int(_structmodulestate *state, const char *p, const formatdef *f)
10311070
{
1032-
long x = 0;
1033-
Py_ssize_t i = f->size;
1071+
unsigned long x = 0;
1072+
1073+
/* This function is only ever used in the case f->size == 4. */
1074+
assert(f->size == 4);
1075+
Py_ssize_t i = 4;
10341076
const unsigned char *bytes = (const unsigned char *)p;
10351077
do {
10361078
x = (x<<8) | bytes[--i];
10371079
} while (i > 0);
1038-
/* Extend the sign bit. */
1039-
if (SIZEOF_LONG > f->size)
1040-
x |= -(x & (1L << ((8 * f->size) - 1)));
1041-
return PyLong_FromLong(x);
1080+
/* Extend sign, avoiding implementation-defined or undefined behaviour. */
1081+
x = (x ^ 0x80000000U) - 0x80000000U;
1082+
return PyLong_FromLong(x & 0x80000000U ? -1 - (long)(~x) : (long)x);
10421083
}
10431084

10441085
static PyObject *
@@ -1056,16 +1097,19 @@ lu_uint(_structmodulestate *state, const char *p, const formatdef *f)
10561097
static PyObject *
10571098
lu_longlong(_structmodulestate *state, const char *p, const formatdef *f)
10581099
{
1059-
long long x = 0;
1060-
Py_ssize_t i = f->size;
1100+
unsigned long long x = 0;
1101+
1102+
/* This function is only ever used in the case f->size == 8. */
1103+
assert(f->size == 8);
1104+
Py_ssize_t i = 8;
10611105
const unsigned char *bytes = (const unsigned char *)p;
10621106
do {
10631107
x = (x<<8) | bytes[--i];
10641108
} while (i > 0);
1065-
/* Extend the sign bit. */
1066-
if (SIZEOF_LONG_LONG > f->size)
1067-
x |= -(x & ((long long)1 << ((8 * f->size) - 1)));
1068-
return PyLong_FromLongLong(x);
1109+
/* Extend sign, avoiding implementation-defined or undefined behaviour. */
1110+
x = (x ^ 0x8000000000000000U) - 0x8000000000000000U;
1111+
return PyLong_FromLongLong(
1112+
x & 0x8000000000000000U ? -1 - (long long)(~x) : (long long)x);
10691113
}
10701114

10711115
static PyObject *
@@ -1213,7 +1257,7 @@ static formatdef lilendian_table[] = {
12131257
{'c', 1, 0, nu_char, np_char},
12141258
{'s', 1, 0, NULL},
12151259
{'p', 1, 0, NULL},
1216-
{'h', 2, 0, lu_int, lp_int},
1260+
{'h', 2, 0, lu_short, lp_int},
12171261
{'H', 2, 0, lu_uint, lp_uint},
12181262
{'i', 4, 0, lu_int, lp_int},
12191263
{'I', 4, 0, lu_uint, lp_uint},

0 commit comments

Comments
 (0)