Skip to content

Commit a98d9ea

Browse files
authored
gh-94155: Reduce hash collisions for code objects (#100183)
* Uses a better hashing algorithm to get better dispersion and remove commutativity.
* Incorporates `co_firstlineno`, `Py_SIZE(co)`, and bytecode instructions.
* This is now the entire set of criteria used in `code_richcompare`, except for `_PyCode_ConstantKey` (which would incorporate the types of `co_consts` rather than just their values).
1 parent 36d3583 commit a98d9ea

File tree

3 files changed: +60 -20 lines changed

Lib/test/test_code.py

+26
Original file line number | Diff line number | Diff line change
@@ -465,6 +465,32 @@ def f():
465465
self.assertNotEqual(code_b, code_d)
466466
self.assertNotEqual(code_c, code_d)
467467

468+
def test_code_hash_uses_firstlineno(self):
469+
c1 = (lambda: 1).__code__
470+
c2 = (lambda: 1).__code__
471+
self.assertNotEqual(c1, c2)
472+
self.assertNotEqual(hash(c1), hash(c2))
473+
c3 = c1.replace(co_firstlineno=17)
474+
self.assertNotEqual(c1, c3)
475+
self.assertNotEqual(hash(c1), hash(c3))
476+
477+
def test_code_hash_uses_order(self):
478+
# Swapping posonlyargcount and kwonlyargcount should change the hash.
479+
c = (lambda x, y, *, z=1, w=1: 1).__code__
480+
self.assertEqual(c.co_argcount, 2)
481+
self.assertEqual(c.co_posonlyargcount, 0)
482+
self.assertEqual(c.co_kwonlyargcount, 2)
483+
swapped = c.replace(co_posonlyargcount=2, co_kwonlyargcount=0)
484+
self.assertNotEqual(c, swapped)
485+
self.assertNotEqual(hash(c), hash(swapped))
486+
487+
def test_code_hash_uses_bytecode(self):
488+
c = (lambda x, y: x + y).__code__
489+
d = (lambda x, y: x * y).__code__
490+
c1 = c.replace(co_code=d.co_code)
491+
self.assertNotEqual(c, c1)
492+
self.assertNotEqual(hash(c), hash(c1))
493+
468494

469495
def isinterned(s):
470496
return s is sys.intern(('_' + s + '_')[1:-1])
Original file line number | Diff line number | Diff line change
@@ -0,0 +1 @@
1+
Improved the hashing algorithm for code objects, mitigating some hash collisions.

Objects/codeobject.c

+33-20
Original file line number | Diff line number | Diff line change
@@ -1842,28 +1842,41 @@ code_richcompare(PyObject *self, PyObject *other, int op)
18421842
static Py_hash_t
18431843
code_hash(PyCodeObject *co)
18441844
{
1845-
Py_hash_t h, h0, h1, h2, h3;
1846-
h0 = PyObject_Hash(co->co_name);
1847-
if (h0 == -1) return -1;
1848-
h1 = PyObject_Hash(co->co_consts);
1849-
if (h1 == -1) return -1;
1850-
h2 = PyObject_Hash(co->co_names);
1851-
if (h2 == -1) return -1;
1852-
h3 = PyObject_Hash(co->co_localsplusnames);
1853-
if (h3 == -1) return -1;
1854-
Py_hash_t h4 = PyObject_Hash(co->co_linetable);
1855-
if (h4 == -1) {
1856-
return -1;
1845+
Py_uhash_t uhash = 20221211;
1846+
#define SCRAMBLE_IN(H) do { \
1847+
uhash ^= (Py_uhash_t)(H); \
1848+
uhash *= _PyHASH_MULTIPLIER; \
1849+
} while (0)
1850+
#define SCRAMBLE_IN_HASH(EXPR) do { \
1851+
Py_hash_t h = PyObject_Hash(EXPR); \
1852+
if (h == -1) { \
1853+
return -1; \
1854+
} \
1855+
SCRAMBLE_IN(h); \
1856+
} while (0)
1857+
1858+
SCRAMBLE_IN_HASH(co->co_name);
1859+
SCRAMBLE_IN_HASH(co->co_consts);
1860+
SCRAMBLE_IN_HASH(co->co_names);
1861+
SCRAMBLE_IN_HASH(co->co_localsplusnames);
1862+
SCRAMBLE_IN_HASH(co->co_linetable);
1863+
SCRAMBLE_IN_HASH(co->co_exceptiontable);
1864+
SCRAMBLE_IN(co->co_argcount);
1865+
SCRAMBLE_IN(co->co_posonlyargcount);
1866+
SCRAMBLE_IN(co->co_kwonlyargcount);
1867+
SCRAMBLE_IN(co->co_flags);
1868+
SCRAMBLE_IN(co->co_firstlineno);
1869+
SCRAMBLE_IN(Py_SIZE(co));
1870+
for (int i = 0; i < Py_SIZE(co); i++) {
1871+
int deop = _PyOpcode_Deopt[_Py_OPCODE(_PyCode_CODE(co)[i])];
1872+
SCRAMBLE_IN(deop);
1873+
SCRAMBLE_IN(_Py_OPARG(_PyCode_CODE(co)[i]));
1874+
i += _PyOpcode_Caches[deop];
18571875
}
1858-
Py_hash_t h5 = PyObject_Hash(co->co_exceptiontable);
1859-
if (h5 == -1) {
1860-
return -1;
1876+
if ((Py_hash_t)uhash == -1) {
1877+
return -2;
18611878
}
1862-
h = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^
1863-
co->co_argcount ^ co->co_posonlyargcount ^ co->co_kwonlyargcount ^
1864-
co->co_flags;
1865-
if (h == -1) h = -2;
1866-
return h;
1879+
return (Py_hash_t)uhash;
18671880
}
18681881

18691882

0 commit comments

Comments
 (0)