From bd57e9368283d42b65b7bef4296199128a75a690 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Thu, 28 Nov 2024 14:13:57 -0800 Subject: [PATCH] Convert the CRC32 constant generation code to Python --- lib/crc32_multipliers.h | 2 +- lib/crc32_tables.h | 2 +- lib/x86/crc32_pclmul_template.h | 4 +- scripts/gen-crc32-consts.py | 158 +++++++++++++++++++++++++ scripts/gen_crc32_multipliers.c | 201 -------------------------------- scripts/gen_crc32_tables.c | 105 ----------------- 6 files changed, 162 insertions(+), 310 deletions(-) create mode 100755 scripts/gen-crc32-consts.py delete mode 100644 scripts/gen_crc32_multipliers.c delete mode 100644 scripts/gen_crc32_tables.c diff --git a/lib/crc32_multipliers.h b/lib/crc32_multipliers.h index 4fdb7bf3..9a08745e 100644 --- a/lib/crc32_multipliers.h +++ b/lib/crc32_multipliers.h @@ -1,7 +1,7 @@ /* * crc32_multipliers.h - constants for CRC-32 folding * - * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT. + * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT. */ #define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */ diff --git a/lib/crc32_tables.h b/lib/crc32_tables.h index 86228c72..d6aff733 100644 --- a/lib/crc32_tables.h +++ b/lib/crc32_tables.h @@ -1,7 +1,7 @@ /* * crc32_tables.h - data tables for CRC-32 computation * - * THIS FILE WAS GENERATED BY gen_crc32_tables.c. DO NOT EDIT. + * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT. */ static const u32 crc32_slice1_table[] MAYBE_UNUSED = { diff --git a/lib/x86/crc32_pclmul_template.h b/lib/x86/crc32_pclmul_template.h index df804a29..33d7ef33 100644 --- a/lib/x86/crc32_pclmul_template.h +++ b/lib/x86/crc32_pclmul_template.h @@ -51,7 +51,7 @@ * instructions. Note that the x86 crc32 instruction cannot be used, as it is * for a different polynomial, not the gzip one. For an explanation of CRC * folding with carryless multiplication instructions, see - * scripts/gen_crc32_multipliers.c and the following blog posts and papers: + * scripts/gen-crc32-consts.py and the following blog posts and papers: * * "An alternative exposition of crc32_4k_pclmulqdq" * https://www.corsix.org/content/alternative-exposition-crc32_4k_pclmulqdq @@ -189,7 +189,7 @@ ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) * folding across 128 bits. mults_128b differs from mults_1v when * VL != 16. All multipliers are 64-bit, to match what pclmulqdq needs, * but since this is for CRC-32 only their low 32 bits are nonzero. - * For more details, see scripts/gen_crc32_multipliers.c. + * For more details, see scripts/gen-crc32-consts.py. */ const vec_t mults_8v = MULTS_8V; const vec_t mults_4v = MULTS_4V; diff --git a/scripts/gen-crc32-consts.py b/scripts/gen-crc32-consts.py new file mode 100755 index 00000000..bd984cb0 --- /dev/null +++ b/scripts/gen-crc32-consts.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python3 +# +# This script generates constants for efficient computation of the gzip CRC-32. + +import sys + +# This is the generator polynomial G(x) of the gzip CRC-32, represented as an +# int using the natural mapping between bits and polynomial coefficients. +G = 0x104c11db7 + +# XOR (add) an iterable of polynomials. +def xor(iterable): + res = 0 + for val in iterable: + res ^= val + return res + +# Multiply two polynomials. +def clmul(a, b): + return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0) + +# Polynomial division floor(a / b). +def div(a, b): + q = 0 + while a.bit_length() >= b.bit_length(): + q ^= 1 << (a.bit_length() - b.bit_length()) + a ^= b << (a.bit_length() - b.bit_length()) + return q + +# Reduce the polynomial 'a' modulo the polynomial 'b'. +def reduce(a, b): + return a ^ clmul(div(a, b), b) + +# Reverse the bits of a polynomial. +def bitreverse(poly, num_bits): + return xor(1 << (num_bits - 1 - i) for i in range(num_bits) + if (poly & (1 << i)) != 0) + +# Compute x^d mod G. +def x_to_the_d(d): + if d < G.bit_length() - 1: + return 1 << d + t = x_to_the_d(d//2) + t = clmul(t, t) + if d % 2 != 0: + t <<= 1 + return reduce(t, G) + +def gen_tables(): + print('/*') + print(' * crc32_tables.h - data tables for CRC-32 computation') + print(' *') + print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.') + print(' */') + for n in [1, 8]: + print('') + print(f'static const u32 crc32_slice{n}_table[] MAYBE_UNUSED = {{') + # The i'th table entry is the CRC-32 of the message consisting of byte + # i % 256 followed by i // 256 zero bytes. + polys = [bitreverse(i % 256, 8) << (32 + 8*(i//256)) for i in range(256 * n)] + polys = [bitreverse(reduce(poly, G), 32) for poly in polys] + for i in range(0, len(polys), 4): + print(f'\t0x{polys[i+0]:08x}, 0x{polys[i+1]:08x}, 0x{polys[i+2]:08x}, 0x{polys[i+3]:08x},') + print('};') + +# Compute the constant multipliers needed for "folding" over various distances +# with the gzip CRC-32. Each such multiplier is x^d mod G(x) for some distance +# d, in bits, over which the folding is occurring. +# +# Folding works as follows: let A(x) be a polynomial (possibly reduced partially +# or fully mod G(x)) for part of the message, and let B(x) be a polynomial +# (possibly reduced partially or fully mod G(x)) for a later part of the +# message. The unreduced combined polynomial is A(x)*x^d + B(x), where d is the +# number of bits separating the two parts of the message plus len(B(x)). Since +# mod G(x) can be applied at any point, x^d mod G(x) can be precomputed and used +# instead of x^d unreduced. That allows the combined polynomial to be computed +# relatively easily in a partially-reduced form A(x)*(x^d mod G(x)) + B(x), with +# length max(len(A(x)) + 31, len(B(x))). This does require doing a polynomial +# multiplication (carryless multiplication). +# +# "Folding" in this way can be used for the entire CRC computation except the +# final reduction to 32 bits; this works well when CPU support for carryless +# multiplication is available. It can also be used to combine CRCs of different +# parts of the message that were computed using a different method. +# +# Note that the gzip CRC-32 uses bit-reversed polynomials. I.e., the low order +# bits are really the high order polynomial coefficients. +def gen_multipliers(): + print('/*') + print(' * crc32_multipliers.h - constants for CRC-32 folding') + print(' *') + print(' * THIS FILE WAS GENERATED BY gen-crc32-consts.py. DO NOT EDIT.') + print(' */') + print('') + + # Compute the multipliers needed for CRC-32 folding with carryless + # multiplication instructions that operate on the 64-bit halves of 128-bit + # segments. Using the terminology from earlier, for each 64-bit fold + # len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial multiplied by + # a 32-bit one produces a 95-bit one. When A(x) is the low order polynomial + # half of a 128-bit segments (high order physical half), the separation + # between the message parts is the total length of the 128-bit segments + # separating the values. When A(x) is the high order polynomial half, the + # separation is 64 bits greater. + for i in range(1, 33): + sep_lo = 128 * (i - 1) + sep_hi = sep_lo + 64 + len_B = 95 + for d in [sep_hi + len_B, # A(x) = high 64 polynomial bits (low 64 physical bits) + sep_lo + len_B # A(x) = low 64 polynomial bits (high 64 physical bits) + ]: + poly = bitreverse(x_to_the_d(d), 32) + print(f'#define CRC32_X{d}_MODG 0x{poly:08x} /* x^{d} mod G(x) */') + print('') + + # Compute constants for the final 128 => 32 bit reduction. + poly = bitreverse(div(1 << 95, G), 64) + print(f'#define CRC32_BARRETT_CONSTANT_1 0x{poly:016x}ULL /* floor(x^95 / G(x)) */') + poly = bitreverse(G, 33) + print(f'#define CRC32_BARRETT_CONSTANT_2 0x{poly:016x}ULL /* G(x) */') + + # Compute multipliers for combining the CRCs of separate chunks. + print('') + num_chunks = 4 + table_len = 129 + min_chunk_len = 128 + print(f'#define CRC32_NUM_CHUNKS {num_chunks}') + print(f'#define CRC32_MIN_VARIABLE_CHUNK_LEN {min_chunk_len}UL') + print(f'#define CRC32_MAX_VARIABLE_CHUNK_LEN {(table_len-1) * min_chunk_len}UL') + print('') + print('/* Multipliers for implementations that use a variable chunk length */') + print('static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {') + print('\t{ 0 /* unused row */ },') + for i in range(1, table_len): + chunk_len = i * min_chunk_len + print(f'\t/* chunk_len={chunk_len} */') + print('\t{ ', end='') + for j in range(num_chunks - 1, 0, -1): + d = (j * 8 * chunk_len) - 33 + poly = bitreverse(x_to_the_d(d), 32) + print(f'0x{poly:08x} /* x^{d} mod G(x) */, ', end='') + print('},') + print('};') + fixed_chunk_len = 32768 + print('') + print('/* Multipliers for implementations that use a large fixed chunk length */') + print(f'#define CRC32_FIXED_CHUNK_LEN {fixed_chunk_len}UL') + for j in range(1, num_chunks): + d = (j * 8 * fixed_chunk_len) - 33 + poly = bitreverse(x_to_the_d(d), 32) + print(f'#define CRC32_FIXED_CHUNK_MULT_{j} 0x{poly:08x} /* x^{d} mod G(x) */') + +with open('lib/crc32_tables.h', 'w') as f: + sys.stdout = f + gen_tables() +with open('lib/crc32_multipliers.h', 'w') as f: + sys.stdout = f + gen_multipliers() diff --git a/scripts/gen_crc32_multipliers.c b/scripts/gen_crc32_multipliers.c deleted file mode 100644 index 4ec7bcfb..00000000 --- a/scripts/gen_crc32_multipliers.c +++ /dev/null @@ -1,201 +0,0 @@ -/* - * gen_crc32_multipliers.c - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This program computes the constant multipliers needed for "folding" over - * various distances with the gzip CRC-32. Each such multiplier is x^D mod G(x) - * for some distance D, in bits, over which the folding is occurring. - * - * Folding works as follows: let A(x) be a polynomial (possibly reduced - * partially or fully mod G(x)) for part of the message, and let B(x) be a - * polynomial (possibly reduced partially or fully mod G(x)) for a later part of - * the message. The unreduced combined polynomial is A(x)*x^D + B(x), where D - * is the number of bits separating the two parts of the message plus len(B(x)). - * Since mod G(x) can be applied at any point, x^D mod G(x) can be precomputed - * and used instead of x^D unreduced. That allows the combined polynomial to be - * computed relatively easily in a partially-reduced form A(x)*(x^D mod G(x)) + - * B(x), with length max(len(A(x)) + 31, len(B(x))). This does require doing a - * polynomial multiplication (carryless multiplication). - * - * "Folding" in this way can be used for the entire CRC computation except the - * final reduction to 32 bits; this works well when CPU support for carryless - * multiplication is available. It can also be used to combine CRCs of - * different parts of the message that were computed using a different method. - * - * Note that the gzip CRC-32 uses bit-reversed polynomials. I.e., the low order - * bits are really the high order polynomial coefficients. - */ - -#include -#include - -#include "../common_defs.h" - -/* The generator polynomial G(x) for the gzip CRC-32 */ -#define CRCPOLY 0xEDB88320 /* G(x) without x^32 term */ -#define CRCPOLY_FULL (((u64)CRCPOLY << 1) | 1) /* G(x) */ - -/* Compute x^D mod G(x) */ -static u32 -compute_xD_modG(size_t D) -{ - /* Start with x^0 mod G(x) */ - u32 remainder = 0x80000000; - - /* Each iteration, 'remainder' becomes x^i mod G(x) */ - for (size_t i = 1; i <= D; i++) - remainder = (remainder >> 1) ^ ((remainder & 1) ? CRCPOLY : 0); - - /* Now 'remainder' is x^D mod G(x) */ - return remainder; -} - -/* Compute floor(x^95 / G(x)) */ -static u64 -compute_x95_div_G(void) -{ - /* The quotient, max order 95 - 32 = 63. */ - u64 quotient = 0; - - /* - * The x^32 through x^95 terms of the remainder. This starts at x^95 - * and is updated through long division. At the end only the - * x^0 through x^31 terms will be nonzero, but those are unneeded. - */ - u64 remainder = 0x1; - - for (int i = 0; i < 64; i++) { - /* - * If the x^(95-i) term of remainder is nonzero, add - * x^(63-i) * G(x) to cancel it out. (G(x) has order 32.) - */ - if (remainder & (1ULL << i)) { - quotient |= 1ULL << i; - remainder ^= (u64)CRCPOLY_FULL << i; - } - } - - return quotient; -} - -static void -gen_vec_folding_constants(void) -{ - /* - * Compute the multipliers needed for CRC-32 folding with carryless - * multiplication instructions that operate on the 64-bit halves of - * 128-bit segments. Using the terminology from earlier, for each 64-bit - * fold len(A(x)) = 64, and len(B(x)) = 95 since a 64-bit polynomial - * multiplied by a 32-bit one produces a 95-bit one. When A(x) is the - * low order polynomial half of a 128-bit segments (high order physical - * half), the separation between the message parts is the total length - * of the 128-bit segments separating the values. When A(x) is the high - * order polynomial half, the separation is 64 bits greater. - */ - for (int i = 1; i <= 32; i++) { - const int sep_lo = 128 * (i - 1); - const int sep_hi = sep_lo + 64; - const int len_B = 95; - int D; - - /* A(x) = high 64 polynomial bits (low 64 physical bits) */ - D = sep_hi + len_B; - printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n", - D, compute_xD_modG(D), D); - - /* A(x) = low 64 polynomial bits (high 64 physical bits) */ - D = sep_lo + len_B; - printf("#define CRC32_X%d_MODG 0x%08"PRIx32" /* x^%d mod G(x) */\n", - D, compute_xD_modG(D), D); - printf("\n"); - } - - /* Constants for the final 128 => 32 bit reduction */ - printf("#define CRC32_BARRETT_CONSTANT_1 0x%016"PRIx64"ULL /* floor(x^95 / G(x)) */\n", - compute_x95_div_G()); - printf("#define CRC32_BARRETT_CONSTANT_2 0x%016"PRIx64"ULL /* G(x) */\n", - CRCPOLY_FULL); -} - -/* Multipliers for combining the CRCs of separate chunks */ -static void -gen_chunk_constants(void) -{ - const size_t num_chunks = 4; - const size_t table_len = 129; - const size_t min_chunk_len = 128; - - printf("#define CRC32_NUM_CHUNKS %zu\n", num_chunks); - printf("#define CRC32_MIN_VARIABLE_CHUNK_LEN %zuUL\n", min_chunk_len); - printf("#define CRC32_MAX_VARIABLE_CHUNK_LEN %zuUL\n", - (table_len - 1) * min_chunk_len); - printf("\n"); - printf("/* Multipliers for implementations that use a variable chunk length */\n"); - printf("static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = {\n"); - printf("\t{ 0 /* unused row */ },\n"); - for (size_t i = 1; i < table_len; i++) { - const size_t chunk_len = i*min_chunk_len; - - printf("\t/* chunk_len=%zu */\n", chunk_len); - printf("\t{ "); - for (size_t j = num_chunks - 1; j >= 1; j--) { - const size_t D = (j * 8 * chunk_len) - 33; - - printf("0x%08"PRIx32" /* x^%zu mod G(x) */, ", - compute_xD_modG(D), D); - } - printf("},\n"); - } - printf("};\n"); - printf("\n"); - - printf("/* Multipliers for implementations that use a large fixed chunk length */\n"); - const size_t fixed_chunk_len = 32768; - printf("#define CRC32_FIXED_CHUNK_LEN %zuUL\n", fixed_chunk_len); - for (int j = 1; j < num_chunks; j++) { - const size_t D = (j * 8 * fixed_chunk_len) - 33; - - printf("#define CRC32_FIXED_CHUNK_MULT_%d 0x%08"PRIx32" /* x^%zu mod G(x) */\n", - j, compute_xD_modG(D), D); - } -} - -int -main(void) -{ - printf("/*\n" - " * crc32_multipliers.h - constants for CRC-32 folding\n" - " *\n" - " * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT.\n" - " */\n" - "\n"); - - gen_vec_folding_constants(); - printf("\n"); - gen_chunk_constants(); - return 0; -} diff --git a/scripts/gen_crc32_tables.c b/scripts/gen_crc32_tables.c deleted file mode 100644 index b13fc5c4..00000000 --- a/scripts/gen_crc32_tables.c +++ /dev/null @@ -1,105 +0,0 @@ -/* - * gen_crc32_tables.c - a program for CRC-32 table generation - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include - -#include "../common_defs.h" - -#define CRCPOLY 0xEDB88320 /* G(x) without x^32 term */ - -static u32 -crc32_update_bit(u32 remainder, u8 next_bit) -{ - return (remainder >> 1) ^ (((remainder ^ next_bit) & 1) ? CRCPOLY : 0); -} - -static u32 -crc32_update_byte(u32 remainder, u8 next_byte) -{ - for (int j = 0; j < 8; j++, next_byte >>= 1) - remainder = crc32_update_bit(remainder, next_byte & 1); - return remainder; -} - -static void -print_256_entries(const u32 *entries) -{ - for (size_t i = 0; i < 256 / 4; i++) { - printf("\t"); - for (size_t j = 0; j < 4; j++) { - printf("0x%08x,", entries[i * 4 + j]); - if (j != 3) - printf(" "); - } - printf("\n"); - } -} - -int -main(void) -{ - u32 crc32_table[0x800]; - - /* crc32_table[i] for 0 <= i < 0x100 is the CRC-32 of byte i. */ - for (int i = 0; i < 0x100; i++) - crc32_table[i] = crc32_update_byte(0, i); - - /* - * crc32_table[i] for 0x100 <= i < 0x800 is the CRC-32 of byte i % 0x100 - * followed by i / 0x100 zero bytes. - */ - for (int i = 0x100; i < 0x800; i++) - crc32_table[i] = crc32_update_byte(crc32_table[i - 0x100], 0); - - printf("/*\n"); - printf(" * crc32_tables.h - data tables for CRC-32 computation\n"); - printf(" *\n"); - printf(" * THIS FILE WAS GENERATED BY gen_crc32_tables.c. DO NOT EDIT.\n"); - printf(" */\n"); - printf("\n"); - /* - * Although crc32_slice1_table is the same as the first 256 entries of - * crc32_slice8_table, we output these tables separately because any - * combo of (slice1, slice8, slice1 && slice8, nothing) might be needed, - * and it's simplest to let the compiler optimize out any unused tables. - */ - printf("static const u32 crc32_slice1_table[] MAYBE_UNUSED = {\n"); - print_256_entries(&crc32_table[0x000]); - printf("};\n"); - printf("\n"); - printf("static const u32 crc32_slice8_table[] MAYBE_UNUSED = {\n"); - print_256_entries(&crc32_table[0x000]); - print_256_entries(&crc32_table[0x100]); - print_256_entries(&crc32_table[0x200]); - print_256_entries(&crc32_table[0x300]); - print_256_entries(&crc32_table[0x400]); - print_256_entries(&crc32_table[0x500]); - print_256_entries(&crc32_table[0x600]); - print_256_entries(&crc32_table[0x700]); - printf("};\n"); - return 0; -}