Convert the CRC32 constant generation code to Python

ebiggers · Nov 28, 2024 · bd57e93 · bd57e93 · cieplutki · Dec 15, 2024
1 parent 1a6f436
commit bd57e93
Show file tree

Hide file tree

Showing 6 changed files with 162 additions and 310 deletions.
diff --git a/lib/crc32_multipliers.h b/lib/crc32_multipliers.h
@@ -1,7 +1,7 @@
 /*
  * crc32_multipliers.h - constants for CRC-32 folding
  *
- * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 #define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */

diff --git a/lib/crc32_tables.h b/lib/crc32_tables.h
@@ -1,7 +1,7 @@
 /*
  * crc32_tables.h - data tables for CRC-32 computation
  *
- * THIS FILE WAS GENERATED BY gen_crc32_tables.c.  DO NOT EDIT.
+ * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.
  */
 
 static const u32 crc32_slice1_table[] MAYBE_UNUSED = {

diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h
@@ -51,7 +51,7 @@
  * instructions.  Note that the x86 crc32 instruction cannot be used, as it is
  * for a different polynomial, not the gzip one.  For an explanation of CRC
  * folding with carryless multiplication instructions, see
- * scripts/gen_crc32_multipliers.c and the following blog posts and papers:
+ * scripts/gen-crc32-consts.py and the following blog posts and papers:
  *
  *	"An alternative exposition of crc32_4k_pclmulqdq"
  *	https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq
@@ -189,7 +189,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len)
 	 * folding across 128 bits.  mults_128b differs from mults_1v when
 	 * VL != 16.  All multipliers are 64-bit, to match what pclmulqdq needs,
 	 * but since this is for CRC-32 only their low 32 bits are nonzero.
-	 * For more details, see scripts/gen_crc32_multipliers.c.
+	 * For more details, see scripts/gen-crc32-consts.py.
 	 */
 	const vec_t mults_8v = MULTS_8V;
 	const vec_t mults_4v = MULTS_4V;

diff --git a/scripts/gen-crc32-consts.py b/scripts/gen-crc32-consts.py
@@ -0,0 +1,158 @@
+#!/usr/bin/env python3
+#
+# This script generates constants for efficient computation of the gzip CRC-32.
+
+import sys
+
+# This is the generator polynomial G(x) of the gzip CRC-32, represented as an
+# int using the natural mapping between bits and polynomial coefficients.
+G = 0x104c11db7
+
+# XOR (add) an iterable of polynomials.
+def xor(iterable):
+    res = 0
+    for val in iterable:
+        res ^= val
+    return res
+
+# Multiply two polynomials.
+def clmul(a, b):
+    return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0)
+
+# Polynomial division floor(a / b).
+def div(a, b):
+    q = 0
+    while a.bit_length() >= b.bit_length():
+        q ^= 1 << (a.bit_length() - b.bit_length())
+        a ^= b << (a.bit_length() - b.bit_length())
+    return q
+
+# Reduce the polynomial 'a' modulo the polynomial 'b'.
+def reduce(a, b):
+    return a ^ clmul(div(a, b), b)
+
+# Reverse the bits of a polynomial.
+def bitreverse(poly, num_bits):
+    return xor(1 << (num_bits - 1 - i) for i in range(num_bits)
+               if (poly & (1 << i)) != 0)
+
+# Compute x^d mod G.
+def x_to_the_d(d):
+    if d < G.bit_length() - 1:
+        return 1 << d
+    t = x_to_the_d(d//2)
+    t = clmul(t, t)
+    if d % 2 != 0:
+        t <<= 1
+    return reduce(t, G)
+
+def gen_tables():
+    print('/*')
+    print(' * crc32_tables.h - data tables for CRC-32 computation')
+    print(' *')
+    print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.')
+    print(' */')
+    for n in [1, 8]:
+        print('')
+        print(f'static const u32 crc32_slice{n}_table[] MAYBE_UNUSED = {{')
+        # The i'th table entry is the CRC-32 of the message consisting of byte
+        # i % 256 followed by i // 256 zero bytes.
+        polys = [bitreverse(i % 256, 8) << (32 + 8*(i//256)) for i in range(256 * n)]
+        polys = [bitreverse(reduce(poly, G), 32) for poly in polys]
+        for i in range(0, len(polys), 4):
+            print(f'\t0x{polys[i+0]:08x}, 0x{polys[i+1]:08x}, 0x{polys[i+2]:08x}, 0x{polys[i+3]:08x},')
+        print('};')
+
+# Compute the constant multipliers needed for "folding" over various distances
+# with the gzip CRC-32.  Each such multiplier is x^d mod G(x) for some distance
+# d, in bits, over which the folding is occurring.
+#
+# Folding works as follows: let A(x) be a polynomial (possibly reduced partially
+# or fully mod G(x)) for part of the message, and let B(x) be a polynomial
+# (possibly reduced partially or fully mod G(x)) for a later part of the
+# message.  The unreduced combined polynomial is A(x)*x^d + B(x), where d is the
+# number of bits separating the two parts of the message plus len(B(x)).  Since
+# mod G(x) can be applied at any point, x^d mod G(x) can be precomputed and used
+# instead of x^d unreduced.  That allows the combined polynomial to be computed
+# relatively easily in a partially-reduced form A(x)*(x^d mod G(x)) + B(x), with
+# length max(len(A(x)) + 31, len(B(x))).  This does require doing a polynomial
+# multiplication (carryless multiplication).
+#
+# "Folding" in this way can be used for the entire CRC computation except the
+# final reduction to 32 bits; this works well when CPU support for carryless
+# multiplication is available.  It can also be used to combine CRCs of different
+# parts of the message that were computed using a different method.
+#
+# Note that the gzip CRC-32 uses bit-reversed polynomials.  I.e., the low order
+# bits are really the high order polynomial coefficients.
+def gen_multipliers():
+    print('/*')
+    print(' * crc32_multipliers.h - constants for CRC-32 folding')
+    print(' *')
+    print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py.  DO NOT EDIT.')
+    print(' */')
+    print('')
+
+    # Compute the multipliers needed for CRC-32 folding with carryless
+    # multiplication instructions that operate on the 64-bit halves of 128-bit
+    # segments.  Using the terminology from earlier, for each 64-bit fold
+    # len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial multiplied by
+    # a 32-bit one produces a 95-bit one.  When A(x) is the low order polynomial
+    # half of a 128-bit segments (high order physical half), the separation
+    # between the message parts is the total length of the 128-bit segments
+    # separating the values.  When A(x) is the high order polynomial half, the
+    # separation is 64 bits greater.
+    for i in range(1, 33):
+        sep_lo = 128 * (i - 1)
+        sep_hi = sep_lo + 64
+        len_B = 95
+        for d in [sep_hi + len_B, # A(x) = high 64 polynomial bits (low 64 physical bits)
+                  sep_lo + len_B # A(x) = low 64 polynomial bits (high 64 physical bits)
+                  ]:
+            poly = bitreverse(x_to_the_d(d), 32)
+            print(f'#define CRC32_X{d}_MODG 0x{poly:08x} /* x^{d} mod G(x) */')
+        print('')
+
+    # Compute constants for the final 128 => 32 bit reduction.
+    poly = bitreverse(div(1 << 95, G), 64)
+    print(f'#define CRC32_BARRETT_CONSTANT_1 0x{poly:016x}ULL /* floor(x^95 / G(x)) */')
+    poly = bitreverse(G, 33)
+    print(f'#define CRC32_BARRETT_CONSTANT_2 0x{poly:016x}ULL /* G(x) */')
+
+    # Compute multipliers for combining the CRCs of separate chunks.
+    print('')
+    num_chunks = 4
+    table_len = 129
+    min_chunk_len = 128
+    print(f'#define CRC32_NUM_CHUNKS {num_chunks}')
+    print(f'#define CRC32_MIN_VARIABLE_CHUNK_LEN {min_chunk_len}UL')
+    print(f'#define CRC32_MAX_VARIABLE_CHUNK_LEN {(table_len-1) * min_chunk_len}UL')
+    print('')
+    print('/* Multipliers for implementations that use a variable chunk length */')
+    print('static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {')
+    print('\t{ 0 /* unused row */ },')
+    for i in range(1, table_len):
+        chunk_len = i * min_chunk_len
+        print(f'\t/* chunk_len={chunk_len} */')
+        print('\t{ ', end='')
+        for j in range(num_chunks - 1, 0, -1):
+            d = (j * 8 * chunk_len) - 33
+            poly = bitreverse(x_to_the_d(d), 32)
+            print(f'0x{poly:08x} /* x^{d} mod G(x) */, ', end='')
+        print('},')
+    print('};')
+    fixed_chunk_len = 32768
+    print('')
+    print('/* Multipliers for implementations that use a large fixed chunk length */')
+    print(f'#define CRC32_FIXED_CHUNK_LEN {fixed_chunk_len}UL')
+    for j in range(1, num_chunks):
+        d = (j * 8 * fixed_chunk_len) - 33
+        poly = bitreverse(x_to_the_d(d), 32)
+        print(f'#define CRC32_FIXED_CHUNK_MULT_{j} 0x{poly:08x} /* x^{d} mod G(x) */')
+
+with open('lib/crc32_tables.h', 'w') as f:
+    sys.stdout = f
+    gen_tables()
+with open('lib/crc32_multipliers.h', 'w') as f:
+    sys.stdout = f
+    gen_multipliers()
diff --git a/scripts/gen_crc32_multipliers.c b/scripts/gen_crc32_multipliers.c