diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 27b20a4003c..c1365be6a0a 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -194,3 +194,5 @@ add_subdirectory(magic_enum) add_subdirectory(aws-cmake) add_subdirectory(simdjson) + +add_subdirectory(fastpforlib) diff --git a/contrib/fastpforlib/CMakeLists.txt b/contrib/fastpforlib/CMakeLists.txt new file mode 100644 index 00000000000..b3b9dba64cc --- /dev/null +++ b/contrib/fastpforlib/CMakeLists.txt @@ -0,0 +1,13 @@ +if(POLICY CMP0063) + cmake_policy(SET CMP0063 NEW) +endif() + +add_library(fastpforlib STATIC bitpacking.cpp) + +target_include_directories(fastpforlib PUBLIC $) +set_target_properties(fastpforlib PROPERTIES EXPORT_NAME fastpforlib) + +if (TIFLASH_ENABLE_ARCH_HASWELL_SUPPORT) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TIFLASH_COMPILER_ARCH_HASWELL_FLAG}") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${TIFLASH_COMPILER_ARCH_HASWELL_FLAG}") +endif () diff --git a/contrib/fastpforlib/LICENSE b/contrib/fastpforlib/LICENSE new file mode 100644 index 00000000000..8405e89a0b1 --- /dev/null +++ b/contrib/fastpforlib/LICENSE @@ -0,0 +1,191 @@ +Apache License +Version 2.0, January 2004 +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and +distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright +owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities +that control, are controlled by, or are under common control with that entity. 
+For the purposes of this definition, "control" means (i) the power, direct or +indirect, to cause the direction or management of such entity, whether by +contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the +outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising +permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including +but not limited to software source code, documentation source, and configuration +files. + +"Object" form shall mean any form resulting from mechanical transformation or +translation of a Source form, including but not limited to compiled object code, +generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made +available under the License, as indicated by a copyright notice that is included +in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that +is based on (or derived from) the Work and for which the editorial revisions, +annotations, elaborations, or other modifications represent, as a whole, an +original work of authorship. For the purposes of this License, Derivative Works +shall not include works that remain separable from, or merely link (or bind by +name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version +of the Work and any modifications or additions to that Work or Derivative Works +thereof, that is intentionally submitted to Licensor for inclusion in the Work +by the copyright owner or by an individual or Legal Entity authorized to submit +on behalf of the copyright owner. 
For the purposes of this definition, +"submitted" means any form of electronic, verbal, or written communication sent +to the Licensor or its representatives, including but not limited to +communication on electronic mailing lists, source code control systems, and +issue tracking systems that are managed by, or on behalf of, the Licensor for +the purpose of discussing and improving the Work, but excluding communication +that is conspicuously marked or otherwise designated in writing by the copyright +owner as "Not a Contribution." + +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf +of whom a Contribution has been received by Licensor and subsequently +incorporated within the Work. + +2. Grant of Copyright License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable copyright license to reproduce, prepare Derivative Works of, +publicly display, publicly perform, sublicense, and distribute the Work and such +Derivative Works in Source or Object form. + +3. Grant of Patent License. + +Subject to the terms and conditions of this License, each Contributor hereby +grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, +irrevocable (except as stated in this section) patent license to make, have +made, use, offer to sell, sell, import, and otherwise transfer the Work, where +such license applies only to those patent claims licensable by such Contributor +that are necessarily infringed by their Contribution(s) alone or by combination +of their Contribution(s) with the Work to which such Contribution(s) was +submitted. 
If You institute patent litigation against any entity (including a +cross-claim or counterclaim in a lawsuit) alleging that the Work or a +Contribution incorporated within the Work constitutes direct or contributory +patent infringement, then any patent licenses granted to You under this License +for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. + +You may reproduce and distribute copies of the Work or Derivative Works thereof +in any medium, with or without modifications, and in Source or Object form, +provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of +this License; and +You must cause any modified files to carry prominent notices stating that You +changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, +all copyright, patent, trademark, and attribution notices from the Source form +of the Work, excluding those notices that do not pertain to any part of the +Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any +Derivative Works that You distribute must include a readable copy of the +attribution notices contained within such NOTICE file, excluding those notices +that do not pertain to any part of the Derivative Works, in at least one of the +following places: within a NOTICE text file distributed as part of the +Derivative Works; within the Source form or documentation, if provided along +with the Derivative Works; or, within a display generated by the Derivative +Works, if and wherever such third-party notices normally appear. The contents of +the NOTICE file are for informational purposes only and do not modify the +License. 
You may add Your own attribution notices within Derivative Works that +You distribute, alongside or as an addendum to the NOTICE text from the Work, +provided that such additional attribution notices cannot be construed as +modifying the License. +You may add Your own copyright statement to Your modifications and may provide +additional or different license terms and conditions for use, reproduction, or +distribution of Your modifications, or for any such Derivative Works as a whole, +provided Your use, reproduction, and distribution of the Work otherwise complies +with the conditions stated in this License. + +5. Submission of Contributions. + +Unless You explicitly state otherwise, any Contribution intentionally submitted +for inclusion in the Work by You to the Licensor shall be under the terms and +conditions of this License, without any additional terms or conditions. +Notwithstanding the above, nothing herein shall supersede or modify the terms of +any separate license agreement you may have executed with Licensor regarding +such Contributions. + +6. Trademarks. + +This License does not grant permission to use the trade names, trademarks, +service marks, or product names of the Licensor, except as required for +reasonable and customary use in describing the origin of the Work and +reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. + +Unless required by applicable law or agreed to in writing, Licensor provides the +Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, +including, without limitation, any warranties or conditions of TITLE, +NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are +solely responsible for determining the appropriateness of using or +redistributing the Work and assume any risks associated with Your exercise of +permissions under this License. + +8. Limitation of Liability. 
+ +In no event and under no legal theory, whether in tort (including negligence), +contract, or otherwise, unless required by applicable law (such as deliberate +and grossly negligent acts) or agreed to in writing, shall any Contributor be +liable to You for damages, including any direct, indirect, special, incidental, +or consequential damages of any character arising as a result of this License or +out of the use or inability to use the Work (including but not limited to +damages for loss of goodwill, work stoppage, computer failure or malfunction, or +any and all other commercial damages or losses), even if such Contributor has +been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. + +While redistributing the Work or Derivative Works thereof, You may choose to +offer, and charge a fee for, acceptance of support, warranty, indemnity, or +other liability obligations and/or rights consistent with this License. However, +in accepting such obligations, You may act only on Your own behalf and on Your +sole responsibility, not on behalf of any other Contributor, and only if You +agree to indemnify, defend, and hold each Contributor harmless for any liability +incurred by, or claims asserted against, such Contributor by reason of your +accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work + +To apply the Apache License to your work, attach the following boilerplate +notice, with the fields enclosed by brackets "[]" replaced with your own +identifying information. (Don't include the brackets!) The text should be +enclosed in the appropriate comment syntax for the file format. We also +recommend that a file or class name and description of purpose be included on +the same "printed page" as the copyright notice for easier identification within +third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/contrib/fastpforlib/bitpacking.cpp b/contrib/fastpforlib/bitpacking.cpp new file mode 100644 index 00000000000..f741954dc0f --- /dev/null +++ b/contrib/fastpforlib/bitpacking.cpp @@ -0,0 +1,1284 @@ +#include "bitpacking.h" + +#include +#include + +namespace fastpforlib { +namespace internal { + +// Used for uint8_t, uint16_t and uint32_t +template +typename std::enable_if<(DELTA + SHR) < TYPE_SIZE>::type unpack_single_out(const TYPE *__restrict in, + TYPE *__restrict out) { + *out = ((*in) >> SHR) % (1 << DELTA); +} + +// Used for uint8_t, uint16_t and uint32_t +template +typename std::enable_if<(DELTA + SHR) >= TYPE_SIZE>::type unpack_single_out(const TYPE *__restrict &in, + TYPE *__restrict out) { + *out = (*in) >> SHR; + ++in; + + static const TYPE NEXT_SHR = SHR + DELTA - TYPE_SIZE; + *out |= ((*in) % (1U << NEXT_SHR)) << (TYPE_SIZE - SHR); +} + +template +typename std::enable_if<(DELTA + SHR) < 32>::type unpack_single_out(const uint32_t *__restrict in, + uint64_t *__restrict out) { + *out = ((static_cast(*in)) >> SHR) % (1ULL << DELTA); +} + +template +typename std::enable_if<(DELTA + SHR) >= 32 && (DELTA + SHR) < 64>::type +unpack_single_out(const uint32_t *__restrict &in, uint64_t *__restrict out) { + *out = static_cast(*in) >> SHR; + ++in; + if (DELTA + SHR > 32) { + static const uint8_t NEXT_SHR = SHR + DELTA - 32; + *out |= 
static_cast((*in) % (1U << NEXT_SHR)) << (32 - SHR); + } +} + +template +typename std::enable_if<(DELTA + SHR) >= 64>::type unpack_single_out(const uint32_t *__restrict &in, + uint64_t *__restrict out) { + *out = static_cast(*in) >> SHR; + ++in; + + *out |= static_cast(*in) << (32 - SHR); + ++in; + + if (DELTA + SHR > 64) { + static const uint8_t NEXT_SHR = DELTA + SHR - 64; + *out |= static_cast((*in) % (1U << NEXT_SHR)) << (64 - SHR); + } +} + +// Used for uint8_t, uint16_t and uint32_t +template + typename std::enable_if < DELTA + SHL::type pack_single_in(const TYPE in, TYPE *__restrict out) { + if (SHL == 0) { + *out = in & MASK; + } else { + *out |= (in & MASK) << SHL; + } +} + +// Used for uint8_t, uint16_t and uint32_t +template +typename std::enable_if= TYPE_SIZE>::type pack_single_in(const TYPE in, TYPE *__restrict &out) { + *out |= in << SHL; + ++out; + + if (DELTA + SHL > TYPE_SIZE) { + *out = (in & MASK) >> (TYPE_SIZE - SHL); + } +} + +template + typename std::enable_if < DELTA + SHL<32>::type pack_single_in64(const uint64_t in, uint32_t *__restrict out) { + if (SHL == 0) { + *out = static_cast(in & MASK); + } else { + *out |= (in & MASK) << SHL; + } +} +template + typename std::enable_if < DELTA + SHL >= 32 && + DELTA + SHL<64>::type pack_single_in64(const uint64_t in, uint32_t *__restrict &out) { + if (SHL == 0) { + *out = static_cast(in & MASK); + } else { + *out |= (in & MASK) << SHL; + } + + ++out; + + if (DELTA + SHL > 32) { + *out = static_cast((in & MASK) >> (32 - SHL)); + } +} +template +typename std::enable_if= 64>::type pack_single_in64(const uint64_t in, uint32_t *__restrict &out) { + *out |= in << SHL; + ++out; + + *out = static_cast((in & MASK) >> (32 - SHL)); + ++out; + + if (DELTA + SHL > 64) { + *out = (in & MASK) >> (64 - SHL); + } +} +template +struct Unroller8 { + static void Unpack(const uint8_t *__restrict &in, uint8_t *__restrict out) { + unpack_single_out(in, out + OINDEX); + + Unroller8::Unpack(in, out); + } + + static void 
Pack(const uint8_t *__restrict in, uint8_t *__restrict out) { + pack_single_in(in[OINDEX], out); + + Unroller8::Pack(in, out); + } + +};\ +template +struct Unroller8 { + enum { SHIFT = (DELTA * 7) % 8 }; + + static void Unpack(const uint8_t *__restrict in, uint8_t *__restrict out) { + out[7] = (*in) >> SHIFT; + } + + static void Pack(const uint8_t *__restrict in, uint8_t *__restrict out) { + *out |= (in[7] << SHIFT); + } +}; + +template +struct Unroller16 { + static void Unpack(const uint16_t *__restrict &in, uint16_t *__restrict out) { + unpack_single_out(in, out + OINDEX); + + Unroller16::Unpack(in, out); + } + + static void Pack(const uint16_t *__restrict in, uint16_t *__restrict out) { + pack_single_in(in[OINDEX], out); + + Unroller16::Pack(in, out); + } + +}; + +template +struct Unroller16 { + enum { SHIFT = (DELTA * 15) % 16 }; + + static void Unpack(const uint16_t *__restrict in, uint16_t *__restrict out) { + out[15] = (*in) >> SHIFT; + } + + static void Pack(const uint16_t *__restrict in, uint16_t *__restrict out) { + *out |= (in[15] << SHIFT); + } +}; + +template +struct Unroller { + static void Unpack(const uint32_t *__restrict &in, uint32_t *__restrict out) { + unpack_single_out(in, out + OINDEX); + + Unroller::Unpack(in, out); + } + + static void Unpack(const uint32_t *__restrict &in, uint64_t *__restrict out) { + unpack_single_out(in, out + OINDEX); + + Unroller::Unpack(in, out); + } + + static void Pack(const uint32_t *__restrict in, uint32_t *__restrict out) { + pack_single_in(in[OINDEX], out); + + Unroller::Pack(in, out); + } + + static void Pack(const uint64_t *__restrict in, uint32_t *__restrict out) { + pack_single_in64(in[OINDEX], out); + + Unroller::Pack(in, out); + } +}; + +template +struct Unroller { + enum { SHIFT = (DELTA * 31) % 32 }; + + static void Unpack(const uint32_t *__restrict in, uint32_t *__restrict out) { + out[31] = (*in) >> SHIFT; + } + + static void Unpack(const uint32_t *__restrict in, uint64_t *__restrict out) { + out[31] = 
(*in) >> SHIFT; + if (DELTA > 32) { + ++in; + out[31] |= static_cast(*in) << (32 - SHIFT); + } + } + + static void Pack(const uint32_t *__restrict in, uint32_t *__restrict out) { + *out |= (in[31] << SHIFT); + } + + static void Pack(const uint64_t *__restrict in, uint32_t *__restrict out) { + *out |= (in[31] << SHIFT); + if (DELTA > 32) { + ++out; + *out = static_cast(in[31] >> (32 - SHIFT)); + } + } +}; + +// Special cases +void __fastunpack0(const uint8_t *__restrict, uint8_t *__restrict out) { + for (uint8_t i = 0; i < 8; ++i) + *(out++) = 0; +} + +void __fastunpack0(const uint16_t *__restrict, uint16_t *__restrict out) { + for (uint16_t i = 0; i < 16; ++i) + *(out++) = 0; +} + +void __fastunpack0(const uint32_t *__restrict, uint32_t *__restrict out) { + for (uint32_t i = 0; i < 32; ++i) + *(out++) = 0; +} + +void __fastunpack0(const uint32_t *__restrict, uint64_t *__restrict out) { + for (uint32_t i = 0; i < 32; ++i) + *(out++) = 0; +} + +void __fastpack0(const uint8_t *__restrict, uint8_t *__restrict) { +} +void __fastpack0(const uint16_t *__restrict, uint16_t *__restrict) { +} +void __fastpack0(const uint32_t *__restrict, uint32_t *__restrict) { +} +void __fastpack0(const uint64_t *__restrict, uint32_t *__restrict) { +} + +// fastunpack for 8 bits +void __fastunpack1(const uint8_t *__restrict in, uint8_t *__restrict out) { + Unroller8<1>::Unpack(in, out); +} + +void __fastunpack2(const uint8_t *__restrict in, uint8_t *__restrict out) { + Unroller8<2>::Unpack(in, out); +} + +void __fastunpack3(const uint8_t *__restrict in, uint8_t *__restrict out) { + Unroller8<3>::Unpack(in, out); +} + +void __fastunpack4(const uint8_t *__restrict in, uint8_t *__restrict out) { + for (uint8_t outer = 0; outer < 4; ++outer) { + for (uint8_t inwordpointer = 0; inwordpointer < 8; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint8_t *__restrict in, uint8_t *__restrict out) { + Unroller8<5>::Unpack(in, out); +} + 
+void __fastunpack6(const uint8_t *__restrict in, uint8_t *__restrict out) { + Unroller8<6>::Unpack(in, out); +} + +void __fastunpack7(const uint8_t *__restrict in, uint8_t *__restrict out) { + Unroller8<7>::Unpack(in, out); +} + +void __fastunpack8(const uint8_t *__restrict in, uint8_t *__restrict out) { + for (int k = 0; k < 8; ++k) + out[k] = in[k]; +} + + +// fastunpack for 16 bits +void __fastunpack1(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<1>::Unpack(in, out); +} + +void __fastunpack2(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<2>::Unpack(in, out); +} + +void __fastunpack3(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<3>::Unpack(in, out); +} + +void __fastunpack4(const uint16_t *__restrict in, uint16_t *__restrict out) { + for (uint16_t outer = 0; outer < 4; ++outer) { + for (uint16_t inwordpointer = 0; inwordpointer < 16; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<5>::Unpack(in, out); +} + +void __fastunpack6(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<6>::Unpack(in, out); +} + +void __fastunpack7(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<7>::Unpack(in, out); +} + +void __fastunpack8(const uint16_t *__restrict in, uint16_t *__restrict out) { + for (uint16_t outer = 0; outer < 8; ++outer) { + for (uint16_t inwordpointer = 0; inwordpointer < 16; inwordpointer += 8) + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + ++in; + } +} + +void __fastunpack9(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<9>::Unpack(in, out); +} + +void __fastunpack10(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<10>::Unpack(in, out); +} + +void __fastunpack11(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<11>::Unpack(in, out); +} + 
+void __fastunpack12(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<12>::Unpack(in, out); +} + +void __fastunpack13(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<13>::Unpack(in, out); +} + +void __fastunpack14(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<14>::Unpack(in, out); +} + +void __fastunpack15(const uint16_t *__restrict in, uint16_t *__restrict out) { + Unroller16<15>::Unpack(in, out); +} + +void __fastunpack16(const uint16_t *__restrict in, uint16_t *__restrict out) { + for (int k = 0; k < 16; ++k) + out[k] = in[k]; +} + +// fastunpack for 32 bits +void __fastunpack1(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<1>::Unpack(in, out); +} + +void __fastunpack2(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<2>::Unpack(in, out); +} + +void __fastunpack3(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<3>::Unpack(in, out); +} + +void __fastunpack4(const uint32_t *__restrict in, uint32_t *__restrict out) { + for (uint32_t outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<5>::Unpack(in, out); +} + +void __fastunpack6(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<6>::Unpack(in, out); +} + +void __fastunpack7(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<7>::Unpack(in, out); +} + +void __fastunpack8(const uint32_t *__restrict in, uint32_t *__restrict out) { + for (uint32_t outer = 0; outer < 8; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + ++in; + } +} + +void __fastunpack9(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<9>::Unpack(in, out); +} + 
+void __fastunpack10(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<10>::Unpack(in, out); +} + +void __fastunpack11(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<11>::Unpack(in, out); +} + +void __fastunpack12(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<12>::Unpack(in, out); +} + +void __fastunpack13(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<13>::Unpack(in, out); +} + +void __fastunpack14(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<14>::Unpack(in, out); +} + +void __fastunpack15(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<15>::Unpack(in, out); +} + +void __fastunpack16(const uint32_t *__restrict in, uint32_t *__restrict out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); + ++in; + } +} + +void __fastunpack17(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<17>::Unpack(in, out); +} + +void __fastunpack18(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<18>::Unpack(in, out); +} + +void __fastunpack19(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<19>::Unpack(in, out); +} + +void __fastunpack20(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<20>::Unpack(in, out); +} + +void __fastunpack21(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<21>::Unpack(in, out); +} + +void __fastunpack22(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<22>::Unpack(in, out); +} + +void __fastunpack23(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<23>::Unpack(in, out); +} + +void __fastunpack24(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<24>::Unpack(in, out); +} + +void __fastunpack25(const uint32_t *__restrict in, uint32_t *__restrict 
out) { + Unroller<25>::Unpack(in, out); +} + +void __fastunpack26(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<26>::Unpack(in, out); +} + +void __fastunpack27(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<27>::Unpack(in, out); +} + +void __fastunpack28(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<28>::Unpack(in, out); +} + +void __fastunpack29(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<29>::Unpack(in, out); +} + +void __fastunpack30(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<30>::Unpack(in, out); +} + +void __fastunpack31(const uint32_t *__restrict in, uint32_t *__restrict out) { + Unroller<31>::Unpack(in, out); +} + +void __fastunpack32(const uint32_t *__restrict in, uint32_t *__restrict out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} + +// fastupack for 64 bits +void __fastunpack1(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<1>::Unpack(in, out); +} + +void __fastunpack2(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<2>::Unpack(in, out); +} + +void __fastunpack3(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<3>::Unpack(in, out); +} + +void __fastunpack4(const uint32_t *__restrict in, uint64_t *__restrict out) { + for (uint32_t outer = 0; outer < 4; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 4) + *(out++) = ((*in) >> inwordpointer) % (1U << 4); + ++in; + } +} + +void __fastunpack5(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<5>::Unpack(in, out); +} + +void __fastunpack6(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<6>::Unpack(in, out); +} + +void __fastunpack7(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<7>::Unpack(in, out); +} + +void __fastunpack8(const uint32_t *__restrict in, uint64_t *__restrict out) { + for (uint32_t outer = 0; outer < 8; 
++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 8) { + *(out++) = ((*in) >> inwordpointer) % (1U << 8); + } + ++in; + } +} + +void __fastunpack9(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<9>::Unpack(in, out); +} + +void __fastunpack10(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<10>::Unpack(in, out); +} + +void __fastunpack11(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<11>::Unpack(in, out); +} + +void __fastunpack12(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<12>::Unpack(in, out); +} + +void __fastunpack13(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<13>::Unpack(in, out); +} + +void __fastunpack14(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<14>::Unpack(in, out); +} + +void __fastunpack15(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<15>::Unpack(in, out); +} + +void __fastunpack16(const uint32_t *__restrict in, uint64_t *__restrict out) { + for (uint32_t outer = 0; outer < 16; ++outer) { + for (uint32_t inwordpointer = 0; inwordpointer < 32; inwordpointer += 16) + *(out++) = ((*in) >> inwordpointer) % (1U << 16); + ++in; + } +} + +void __fastunpack17(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<17>::Unpack(in, out); +} + +void __fastunpack18(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<18>::Unpack(in, out); +} + +void __fastunpack19(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<19>::Unpack(in, out); +} + +void __fastunpack20(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<20>::Unpack(in, out); +} + +void __fastunpack21(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<21>::Unpack(in, out); +} + +void __fastunpack22(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<22>::Unpack(in, out); +} + +void __fastunpack23(const 
uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<23>::Unpack(in, out); +} + +void __fastunpack24(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<24>::Unpack(in, out); +} + +void __fastunpack25(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<25>::Unpack(in, out); +} + +void __fastunpack26(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<26>::Unpack(in, out); +} + +void __fastunpack27(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<27>::Unpack(in, out); +} + +void __fastunpack28(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<28>::Unpack(in, out); +} + +void __fastunpack29(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<29>::Unpack(in, out); +} + +void __fastunpack30(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<30>::Unpack(in, out); +} + +void __fastunpack31(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<31>::Unpack(in, out); +} + +void __fastunpack32(const uint32_t *__restrict in, uint64_t *__restrict out) { + for (int k = 0; k < 32; ++k) + out[k] = in[k]; +} + +void __fastunpack33(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<33>::Unpack(in, out); +} + +void __fastunpack34(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<34>::Unpack(in, out); +} + +void __fastunpack35(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<35>::Unpack(in, out); +} + +void __fastunpack36(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<36>::Unpack(in, out); +} + +void __fastunpack37(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<37>::Unpack(in, out); +} + +void __fastunpack38(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<38>::Unpack(in, out); +} + +void __fastunpack39(const uint32_t *__restrict in, uint64_t *__restrict out) { + Unroller<39>::Unpack(in, out); +} + 
+void __fastunpack40(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<40>::Unpack(in, out);
+}
+
+void __fastunpack41(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<41>::Unpack(in, out);
+}
+
+void __fastunpack42(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<42>::Unpack(in, out);
+}
+
+void __fastunpack43(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<43>::Unpack(in, out);
+}
+
+void __fastunpack44(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<44>::Unpack(in, out);
+}
+
+void __fastunpack45(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<45>::Unpack(in, out);
+}
+
+void __fastunpack46(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<46>::Unpack(in, out);
+}
+
+void __fastunpack47(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<47>::Unpack(in, out);
+}
+
+void __fastunpack48(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<48>::Unpack(in, out);
+}
+
+void __fastunpack49(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<49>::Unpack(in, out);
+}
+
+void __fastunpack50(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<50>::Unpack(in, out);
+}
+
+void __fastunpack51(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<51>::Unpack(in, out);
+}
+
+void __fastunpack52(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<52>::Unpack(in, out);
+}
+
+void __fastunpack53(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<53>::Unpack(in, out);
+}
+
+void __fastunpack54(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<54>::Unpack(in, out);
+}
+
+void __fastunpack55(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<55>::Unpack(in, out);
+}
+
+void __fastunpack56(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<56>::Unpack(in, out);
+}
+
+void __fastunpack57(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<57>::Unpack(in, out);
+}
+
+void __fastunpack58(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<58>::Unpack(in, out);
+}
+
+void __fastunpack59(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<59>::Unpack(in, out);
+}
+
+void __fastunpack60(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<60>::Unpack(in, out);
+}
+
+void __fastunpack61(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<61>::Unpack(in, out);
+}
+
+void __fastunpack62(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<62>::Unpack(in, out);
+}
+
+void __fastunpack63(const uint32_t *__restrict in, uint64_t *__restrict out) {
+  Unroller<63>::Unpack(in, out);
+}
+
+void __fastunpack64(const uint32_t *__restrict in, uint64_t *__restrict out) { // full width: no Unroller, each output is rebuilt from two consecutive 32-bit words
+  for (int k = 0; k < 32; ++k) {
+    out[k] = in[k * 2]; // even word supplies the low 32 bits
+    out[k] |= static_cast<uint64_t>(in[k * 2 + 1]) << 32; // widen first: shifting a uint32_t by 32 is undefined
+  }
+}
+
+// fastpack for 8 bits: __fastpackN handles 8 uint8_t values at bit width N (N == 8 is a plain copy)
+
+void __fastpack1(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<1>::Pack(in, out);
+}
+
+void __fastpack2(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<2>::Pack(in, out);
+}
+
+void __fastpack3(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<3>::Pack(in, out);
+}
+
+void __fastpack4(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<4>::Pack(in, out);
+}
+
+void __fastpack5(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<5>::Pack(in, out);
+}
+
+void __fastpack6(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<6>::Pack(in, out);
+}
+
+void __fastpack7(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  Unroller8<7>::Pack(in, out);
+}
+
+void __fastpack8(const uint8_t *__restrict in, uint8_t *__restrict out) {
+  for (int k = 0; k < 8; ++k) // full width: copy the 8 values unchanged
+    out[k] = in[k];
+}
+
+// fastpack for 16 bits: __fastpackN handles 16 uint16_t values at bit width N (N == 16 is a plain copy)
+
+void __fastpack1(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<1>::Pack(in, out);
+}
+
+void __fastpack2(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<2>::Pack(in, out);
+}
+
+void __fastpack3(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<3>::Pack(in, out);
+}
+
+void __fastpack4(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<4>::Pack(in, out);
+}
+
+void __fastpack5(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<5>::Pack(in, out);
+}
+
+void __fastpack6(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<6>::Pack(in, out);
+}
+
+void __fastpack7(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<7>::Pack(in, out);
+}
+
+void __fastpack8(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<8>::Pack(in, out);
+}
+
+void __fastpack9(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<9>::Pack(in, out);
+}
+
+void __fastpack10(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<10>::Pack(in, out);
+}
+
+void __fastpack11(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<11>::Pack(in, out);
+}
+
+void __fastpack12(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<12>::Pack(in, out);
+}
+
+void __fastpack13(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<13>::Pack(in, out);
+}
+
+void __fastpack14(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<14>::Pack(in, out);
+}
+
+void __fastpack15(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  Unroller16<15>::Pack(in, out);
+}
+
+void __fastpack16(const uint16_t *__restrict in, uint16_t *__restrict out) {
+  for (int k = 0; k < 16; ++k) // full width: copy the 16 values unchanged
+    out[k] = in[k];
+}
+
+
+// fastpack for 32 bits: __fastpackN handles 32 uint32_t values at bit width N (N == 32 is a plain copy)
+
+void __fastpack1(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<1>::Pack(in, out);
+}
+
+void __fastpack2(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<2>::Pack(in, out);
+}
+
+void __fastpack3(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<3>::Pack(in, out);
+}
+
+void __fastpack4(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<4>::Pack(in, out);
+}
+
+void __fastpack5(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<5>::Pack(in, out);
+}
+
+void __fastpack6(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<6>::Pack(in, out);
+}
+
+void __fastpack7(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<7>::Pack(in, out);
+}
+
+void __fastpack8(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<8>::Pack(in, out);
+}
+
+void __fastpack9(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<9>::Pack(in, out);
+}
+
+void __fastpack10(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<10>::Pack(in, out);
+}
+
+void __fastpack11(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<11>::Pack(in, out);
+}
+
+void __fastpack12(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<12>::Pack(in, out);
+}
+
+void __fastpack13(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<13>::Pack(in, out);
+}
+
+void __fastpack14(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<14>::Pack(in, out);
+}
+
+void __fastpack15(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<15>::Pack(in, out);
+}
+
+void __fastpack16(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<16>::Pack(in, out);
+}
+
+void __fastpack17(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<17>::Pack(in, out);
+}
+
+void __fastpack18(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<18>::Pack(in, out);
+}
+
+void __fastpack19(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<19>::Pack(in, out);
+}
+
+void __fastpack20(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<20>::Pack(in, out);
+}
+
+void __fastpack21(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<21>::Pack(in, out);
+}
+
+void __fastpack22(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<22>::Pack(in, out);
+}
+
+void __fastpack23(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<23>::Pack(in, out);
+}
+
+void __fastpack24(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<24>::Pack(in, out);
+}
+
+void __fastpack25(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<25>::Pack(in, out);
+}
+
+void __fastpack26(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<26>::Pack(in, out);
+}
+
+void __fastpack27(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<27>::Pack(in, out);
+}
+
+void __fastpack28(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<28>::Pack(in, out);
+}
+
+void __fastpack29(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<29>::Pack(in, out);
+}
+
+void __fastpack30(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<30>::Pack(in, out);
+}
+
+void __fastpack31(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<31>::Pack(in, out);
+}
+
+void __fastpack32(const uint32_t *__restrict in, uint32_t *__restrict out) {
+  for (int k = 0; k < 32; ++k) // full width: copy the 32 values unchanged
+    out[k] = in[k];
+}
+
+// fastpack for 64 bits: __fastpackN handles 32 uint64_t values at bit width N, written out as uint32_t words
+
+void __fastpack1(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<1>::Pack(in, out);
+}
+
+void __fastpack2(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<2>::Pack(in, out);
+}
+
+void __fastpack3(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<3>::Pack(in, out);
+}
+
+void __fastpack4(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<4>::Pack(in, out);
+}
+
+void __fastpack5(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<5>::Pack(in, out);
+}
+
+void __fastpack6(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<6>::Pack(in, out);
+}
+
+void __fastpack7(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<7>::Pack(in, out);
+}
+
+void __fastpack8(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<8>::Pack(in, out);
+}
+
+void __fastpack9(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<9>::Pack(in, out);
+}
+
+void __fastpack10(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<10>::Pack(in, out);
+}
+
+void __fastpack11(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<11>::Pack(in, out);
+}
+
+void __fastpack12(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<12>::Pack(in, out);
+}
+
+void __fastpack13(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<13>::Pack(in, out);
+}
+
+void __fastpack14(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<14>::Pack(in, out);
+}
+
+void __fastpack15(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<15>::Pack(in, out);
+}
+
+void __fastpack16(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<16>::Pack(in, out);
+}
+
+void __fastpack17(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<17>::Pack(in, out);
+}
+
+void __fastpack18(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<18>::Pack(in, out);
+}
+
+void __fastpack19(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<19>::Pack(in, out);
+}
+
+void __fastpack20(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<20>::Pack(in, out);
+}
+
+void __fastpack21(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<21>::Pack(in, out);
+}
+
+void __fastpack22(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<22>::Pack(in, out);
+}
+
+void __fastpack23(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<23>::Pack(in, out);
+}
+
+void __fastpack24(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<24>::Pack(in, out);
+}
+
+void __fastpack25(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<25>::Pack(in, out);
+}
+
+void __fastpack26(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<26>::Pack(in, out);
+}
+
+void __fastpack27(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<27>::Pack(in, out);
+}
+
+void __fastpack28(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<28>::Pack(in, out);
+}
+
+void __fastpack29(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<29>::Pack(in, out);
+}
+
+void __fastpack30(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<30>::Pack(in, out);
+}
+
+void __fastpack31(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<31>::Pack(in, out);
+}
+
+void __fastpack32(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  for (int k = 0; k < 32; ++k) {
+    out[k] = static_cast<uint32_t>(in[k]); // keep only the low 32 bits of each input
+  }
+}
+
+void __fastpack33(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<33>::Pack(in, out);
+}
+
+void __fastpack34(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<34>::Pack(in, out);
+}
+
+void __fastpack35(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<35>::Pack(in, out);
+}
+
+void __fastpack36(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<36>::Pack(in, out);
+}
+
+void __fastpack37(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<37>::Pack(in, out);
+}
+
+void __fastpack38(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<38>::Pack(in, out);
+}
+
+void __fastpack39(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<39>::Pack(in, out);
+}
+
+void __fastpack40(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<40>::Pack(in, out);
+}
+
+void __fastpack41(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<41>::Pack(in, out);
+}
+
+void __fastpack42(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<42>::Pack(in, out);
+}
+
+void __fastpack43(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<43>::Pack(in, out);
+}
+
+void __fastpack44(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<44>::Pack(in, out);
+}
+
+void __fastpack45(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<45>::Pack(in, out);
+}
+
+void __fastpack46(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<46>::Pack(in, out);
+}
+
+void __fastpack47(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<47>::Pack(in, out);
+}
+
+void __fastpack48(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<48>::Pack(in, out);
+}
+
+void __fastpack49(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<49>::Pack(in, out);
+}
+
+void __fastpack50(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<50>::Pack(in, out);
+}
+
+void __fastpack51(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<51>::Pack(in, out);
+}
+
+void __fastpack52(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<52>::Pack(in, out);
+}
+
+void __fastpack53(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<53>::Pack(in, out);
+}
+
+void __fastpack54(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<54>::Pack(in, out);
+}
+
+void __fastpack55(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<55>::Pack(in, out);
+}
+
+void __fastpack56(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<56>::Pack(in, out);
+}
+
+void __fastpack57(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<57>::Pack(in, out);
+}
+
+void __fastpack58(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<58>::Pack(in, out);
+}
+
+void __fastpack59(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<59>::Pack(in, out);
+}
+
+void __fastpack60(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<60>::Pack(in, out);
+}
+
+void __fastpack61(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<61>::Pack(in, out);
+}
+
+void __fastpack62(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<62>::Pack(in, out);
+}
+
+void __fastpack63(const uint64_t *__restrict in, uint32_t *__restrict out) {
+  Unroller<63>::Pack(in, out);
+}
+
+void __fastpack64(const uint64_t *__restrict in, uint32_t *__restrict out) { // full width: each input is split into two consecutive 32-bit words
+  for (int i = 0; i < 32; ++i) {
+    out[2 * i] = static_cast<uint32_t>(in[i]); // low 32 bits go to the even word
+    out[2 * i + 1] = in[i] >> 32; // high 32 bits go to the odd word
+  }
+}
+} // namespace internal
+} // namespace fastpforlib
diff --git a/contrib/fastpforlib/bitpacking.h b/contrib/fastpforlib/bitpacking.h
new file mode 100644
index 00000000000..c994120d8e0
--- /dev/null
+++ b/contrib/fastpforlib/bitpacking.h
@@ -0,0 +1,278 @@
+/**
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ * (c) Daniel Lemire, http://fastpforlib.me/en/
+ */
+#pragma once
+#include <cstdint> // NOTE(review): header name was stripped in extraction; <cstdint> restored because every prototype uses uintN_t
+#include <string> // NOTE(review): header name was stripped in extraction; <string> is presumed from upstream - confirm
+
+namespace fastpforlib {
+namespace internal {
+
+// Unpacks 8 uint8_t values (the numeric suffix is the bit width of each packed value)
+void __fastunpack0(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack1(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack2(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack3(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack4(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack5(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack6(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack7(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastunpack8(const uint8_t *__restrict in, uint8_t *__restrict out);
+
+// Unpacks 16 uint16_t values
+void __fastunpack0(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack1(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack2(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack3(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack4(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack5(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack6(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack7(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack8(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack9(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack10(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack11(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack12(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack13(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack14(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack15(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastunpack16(const uint16_t *__restrict in, uint16_t *__restrict out);
+
+// Unpacks 32 uint32_t values
+void __fastunpack0(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack1(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack2(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack3(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack4(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack5(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack6(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack7(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack8(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack9(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack10(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack11(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack12(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack13(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack14(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack15(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack16(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack17(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack18(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack19(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack20(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack21(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack22(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack23(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack24(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack25(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack26(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack27(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack28(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack29(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack30(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack31(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastunpack32(const uint32_t *__restrict in, uint32_t *__restrict out);
+
+// Unpacks 32 uint64_t values (packed input is read as uint32_t words)
+void __fastunpack0(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack1(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack2(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack3(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack4(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack5(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack6(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack7(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack8(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack9(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack10(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack11(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack12(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack13(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack14(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack15(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack16(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack17(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack18(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack19(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack20(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack21(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack22(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack23(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack24(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack25(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack26(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack27(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack28(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack29(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack30(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack31(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack32(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack33(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack34(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack35(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack36(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack37(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack38(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack39(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack40(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack41(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack42(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack43(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack44(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack45(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack46(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack47(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack48(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack49(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack50(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack51(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack52(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack53(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack54(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack55(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack56(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack57(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack58(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack59(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack60(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack61(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack62(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack63(const uint32_t *__restrict in, uint64_t *__restrict out);
+void __fastunpack64(const uint32_t *__restrict in, uint64_t *__restrict out);
+
+// Packs 8 uint8_t values
+void __fastpack0(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack1(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack2(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack3(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack4(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack5(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack6(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack7(const uint8_t *__restrict in, uint8_t *__restrict out);
+void __fastpack8(const uint8_t *__restrict in, uint8_t *__restrict out);
+
+// Packs 16 uint16_t values
+void __fastpack0(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack1(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack2(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack3(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack4(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack5(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack6(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack7(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack8(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack9(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack10(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack11(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack12(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack13(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack14(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack15(const uint16_t *__restrict in, uint16_t *__restrict out);
+void __fastpack16(const uint16_t *__restrict in, uint16_t *__restrict out);
+
+// Packs 32 uint32_t values
+void __fastpack0(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack1(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack2(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack3(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack4(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack5(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack6(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack7(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack8(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack9(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack10(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack11(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack12(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack13(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack14(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack15(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack16(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack17(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack18(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack19(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack20(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack21(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack22(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack23(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack24(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack25(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack26(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack27(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack28(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack29(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack30(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack31(const uint32_t *__restrict in, uint32_t *__restrict out);
+void __fastpack32(const uint32_t *__restrict in, uint32_t *__restrict out);
+
+// Packs 32 uint64_t values (packed output is written as uint32_t words)
+void __fastpack0(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack1(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack2(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack3(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack4(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack5(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack6(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack7(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack8(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack9(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack10(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack11(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack12(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack13(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack14(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack15(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack16(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack17(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack18(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack19(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack20(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack21(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack22(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack23(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack24(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack25(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack26(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack27(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack28(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack29(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack30(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack31(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack32(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack33(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack34(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack35(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack36(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack37(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack38(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack39(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack40(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack41(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack42(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack43(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack44(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack45(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack46(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack47(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack48(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack49(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack50(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack51(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack52(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack53(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack54(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack55(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack56(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack57(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack58(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack59(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack60(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack61(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack62(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack63(const uint64_t *__restrict in, uint32_t *__restrict out);
+void __fastpack64(const uint64_t *__restrict in, uint32_t *__restrict out);
+} // namespace internal
+} // namespace fastpforlib
diff --git a/contrib/fastpforlib/bitpackinghelpers.h b/contrib/fastpforlib/bitpackinghelpers.h
new file mode 100644
index 00000000000..913ffa9250e
--- /dev/null
+++ b/contrib/fastpforlib/bitpackinghelpers.h
@@ -0,0 +1,887 @@
+/**
+* This code is released under the
+* Apache License Version 2.0
http://www.apache.org/licenses/. +* +* (c) Daniel Lemire, http://lemire.me/en/ +*/ +#pragma once +#include <stdexcept> + +#include "bitpacking.h" + +namespace fastpforlib +{ + +namespace internal +{ + +// Note that this only unpacks 8 values; `bit` must be in [0, 8], any other width throws std::logic_error +inline void fastunpack_quarter(const uint8_t * __restrict in, uint8_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) + { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +// Note that this only packs 8 values +inline void fastpack_quarter(const uint8_t * __restrict in, uint8_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. 
+ switch (bit) + { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +// Note that this only unpacks 16 values +inline void fastunpack_half(const uint16_t * __restrict in, uint16_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) + { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + case 9: + internal::__fastunpack9(in, out); + break; + case 10: + internal::__fastunpack10(in, out); + break; + case 11: + internal::__fastunpack11(in, out); + break; + case 12: + internal::__fastunpack12(in, out); + break; + case 13: + internal::__fastunpack13(in, out); + break; + case 14: + internal::__fastunpack14(in, out); + break; + case 15: + internal::__fastunpack15(in, out); + break; + case 16: + internal::__fastunpack16(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +// Note 
that this only packs 16 values +inline void fastpack_half(const uint16_t * __restrict in, uint16_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) + { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + case 9: + internal::__fastpack9(in, out); + break; + case 10: + internal::__fastpack10(in, out); + break; + case 11: + internal::__fastpack11(in, out); + break; + case 12: + internal::__fastpack12(in, out); + break; + case 13: + internal::__fastpack13(in, out); + break; + case 14: + internal::__fastpack14(in, out); + break; + case 15: + internal::__fastpack15(in, out); + break; + case 16: + internal::__fastpack16(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} +} // namespace internal + +inline void fastunpack(const uint8_t * __restrict in, uint8_t * __restrict out, const uint32_t bit) +{ + for (uint8_t i = 0; i < 4; i++) + { + internal::fastunpack_quarter(in + (i * bit), out + (i * 8), bit); + } +} + +inline void fastunpack(const uint16_t * __restrict in, uint16_t * __restrict out, const uint32_t bit) +{ + internal::fastunpack_half(in, out, bit); + internal::fastunpack_half(in + bit, out + 16, bit); +} + +inline void fastunpack(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. 
+ // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) + { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + case 9: + internal::__fastunpack9(in, out); + break; + case 10: + internal::__fastunpack10(in, out); + break; + case 11: + internal::__fastunpack11(in, out); + break; + case 12: + internal::__fastunpack12(in, out); + break; + case 13: + internal::__fastunpack13(in, out); + break; + case 14: + internal::__fastunpack14(in, out); + break; + case 15: + internal::__fastunpack15(in, out); + break; + case 16: + internal::__fastunpack16(in, out); + break; + case 17: + internal::__fastunpack17(in, out); + break; + case 18: + internal::__fastunpack18(in, out); + break; + case 19: + internal::__fastunpack19(in, out); + break; + case 20: + internal::__fastunpack20(in, out); + break; + case 21: + internal::__fastunpack21(in, out); + break; + case 22: + internal::__fastunpack22(in, out); + break; + case 23: + internal::__fastunpack23(in, out); + break; + case 24: + internal::__fastunpack24(in, out); + break; + case 25: + internal::__fastunpack25(in, out); + break; + case 26: + internal::__fastunpack26(in, out); + break; + case 27: + internal::__fastunpack27(in, out); + break; + case 28: + internal::__fastunpack28(in, out); + break; + case 29: + internal::__fastunpack29(in, out); + break; + case 30: + internal::__fastunpack30(in, out); + break; + case 31: + internal::__fastunpack31(in, out); + break; + case 32: + 
internal::__fastunpack32(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +inline void fastunpack(const uint32_t * __restrict in, uint64_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) + { + case 0: + internal::__fastunpack0(in, out); + break; + case 1: + internal::__fastunpack1(in, out); + break; + case 2: + internal::__fastunpack2(in, out); + break; + case 3: + internal::__fastunpack3(in, out); + break; + case 4: + internal::__fastunpack4(in, out); + break; + case 5: + internal::__fastunpack5(in, out); + break; + case 6: + internal::__fastunpack6(in, out); + break; + case 7: + internal::__fastunpack7(in, out); + break; + case 8: + internal::__fastunpack8(in, out); + break; + case 9: + internal::__fastunpack9(in, out); + break; + case 10: + internal::__fastunpack10(in, out); + break; + case 11: + internal::__fastunpack11(in, out); + break; + case 12: + internal::__fastunpack12(in, out); + break; + case 13: + internal::__fastunpack13(in, out); + break; + case 14: + internal::__fastunpack14(in, out); + break; + case 15: + internal::__fastunpack15(in, out); + break; + case 16: + internal::__fastunpack16(in, out); + break; + case 17: + internal::__fastunpack17(in, out); + break; + case 18: + internal::__fastunpack18(in, out); + break; + case 19: + internal::__fastunpack19(in, out); + break; + case 20: + internal::__fastunpack20(in, out); + break; + case 21: + internal::__fastunpack21(in, out); + break; + case 22: + internal::__fastunpack22(in, out); + break; + case 23: + internal::__fastunpack23(in, out); + break; + case 24: + internal::__fastunpack24(in, out); + break; + case 25: + internal::__fastunpack25(in, out); + break; + case 26: + internal::__fastunpack26(in, out); + break; + case 27: + 
internal::__fastunpack27(in, out); + break; + case 28: + internal::__fastunpack28(in, out); + break; + case 29: + internal::__fastunpack29(in, out); + break; + case 30: + internal::__fastunpack30(in, out); + break; + case 31: + internal::__fastunpack31(in, out); + break; + case 32: + internal::__fastunpack32(in, out); + break; + case 33: + internal::__fastunpack33(in, out); + break; + case 34: + internal::__fastunpack34(in, out); + break; + case 35: + internal::__fastunpack35(in, out); + break; + case 36: + internal::__fastunpack36(in, out); + break; + case 37: + internal::__fastunpack37(in, out); + break; + case 38: + internal::__fastunpack38(in, out); + break; + case 39: + internal::__fastunpack39(in, out); + break; + case 40: + internal::__fastunpack40(in, out); + break; + case 41: + internal::__fastunpack41(in, out); + break; + case 42: + internal::__fastunpack42(in, out); + break; + case 43: + internal::__fastunpack43(in, out); + break; + case 44: + internal::__fastunpack44(in, out); + break; + case 45: + internal::__fastunpack45(in, out); + break; + case 46: + internal::__fastunpack46(in, out); + break; + case 47: + internal::__fastunpack47(in, out); + break; + case 48: + internal::__fastunpack48(in, out); + break; + case 49: + internal::__fastunpack49(in, out); + break; + case 50: + internal::__fastunpack50(in, out); + break; + case 51: + internal::__fastunpack51(in, out); + break; + case 52: + internal::__fastunpack52(in, out); + break; + case 53: + internal::__fastunpack53(in, out); + break; + case 54: + internal::__fastunpack54(in, out); + break; + case 55: + internal::__fastunpack55(in, out); + break; + case 56: + internal::__fastunpack56(in, out); + break; + case 57: + internal::__fastunpack57(in, out); + break; + case 58: + internal::__fastunpack58(in, out); + break; + case 59: + internal::__fastunpack59(in, out); + break; + case 60: + internal::__fastunpack60(in, out); + break; + case 61: + internal::__fastunpack61(in, out); + break; + case 62: + 
internal::__fastunpack62(in, out); + break; + case 63: + internal::__fastunpack63(in, out); + break; + case 64: + internal::__fastunpack64(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +inline void fastpack(const uint8_t * __restrict in, uint8_t * __restrict out, const uint32_t bit) +{ + for (uint8_t i = 0; i < 4; i++) + { + internal::fastpack_quarter(in + (i * 8), out + (i * bit), bit); + } +} + +inline void fastpack(const uint16_t * __restrict in, uint16_t * __restrict out, const uint32_t bit) +{ + internal::fastpack_half(in, out, bit); + internal::fastpack_half(in + 16, out + bit, bit); +} + +inline void fastpack(const uint32_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) +{ + // Could have used function pointers instead of switch. + // Switch calls do offer the compiler more opportunities for optimization in + // theory. In this case, it makes no difference with a good compiler. + switch (bit) + { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + case 9: + internal::__fastpack9(in, out); + break; + case 10: + internal::__fastpack10(in, out); + break; + case 11: + internal::__fastpack11(in, out); + break; + case 12: + internal::__fastpack12(in, out); + break; + case 13: + internal::__fastpack13(in, out); + break; + case 14: + internal::__fastpack14(in, out); + break; + case 15: + internal::__fastpack15(in, out); + break; + case 16: + internal::__fastpack16(in, out); + break; + case 17: + internal::__fastpack17(in, out); + break; + case 18: + internal::__fastpack18(in, 
out); + break; + case 19: + internal::__fastpack19(in, out); + break; + case 20: + internal::__fastpack20(in, out); + break; + case 21: + internal::__fastpack21(in, out); + break; + case 22: + internal::__fastpack22(in, out); + break; + case 23: + internal::__fastpack23(in, out); + break; + case 24: + internal::__fastpack24(in, out); + break; + case 25: + internal::__fastpack25(in, out); + break; + case 26: + internal::__fastpack26(in, out); + break; + case 27: + internal::__fastpack27(in, out); + break; + case 28: + internal::__fastpack28(in, out); + break; + case 29: + internal::__fastpack29(in, out); + break; + case 30: + internal::__fastpack30(in, out); + break; + case 31: + internal::__fastpack31(in, out); + break; + case 32: + internal::__fastpack32(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} + +inline void fastpack(const uint64_t * __restrict in, uint32_t * __restrict out, const uint32_t bit) +{ + switch (bit) + { + case 0: + internal::__fastpack0(in, out); + break; + case 1: + internal::__fastpack1(in, out); + break; + case 2: + internal::__fastpack2(in, out); + break; + case 3: + internal::__fastpack3(in, out); + break; + case 4: + internal::__fastpack4(in, out); + break; + case 5: + internal::__fastpack5(in, out); + break; + case 6: + internal::__fastpack6(in, out); + break; + case 7: + internal::__fastpack7(in, out); + break; + case 8: + internal::__fastpack8(in, out); + break; + case 9: + internal::__fastpack9(in, out); + break; + case 10: + internal::__fastpack10(in, out); + break; + case 11: + internal::__fastpack11(in, out); + break; + case 12: + internal::__fastpack12(in, out); + break; + case 13: + internal::__fastpack13(in, out); + break; + case 14: + internal::__fastpack14(in, out); + break; + case 15: + internal::__fastpack15(in, out); + break; + case 16: + internal::__fastpack16(in, out); + break; + case 17: + internal::__fastpack17(in, out); + break; + case 18: + internal::__fastpack18(in, 
out); + break; + case 19: + internal::__fastpack19(in, out); + break; + case 20: + internal::__fastpack20(in, out); + break; + case 21: + internal::__fastpack21(in, out); + break; + case 22: + internal::__fastpack22(in, out); + break; + case 23: + internal::__fastpack23(in, out); + break; + case 24: + internal::__fastpack24(in, out); + break; + case 25: + internal::__fastpack25(in, out); + break; + case 26: + internal::__fastpack26(in, out); + break; + case 27: + internal::__fastpack27(in, out); + break; + case 28: + internal::__fastpack28(in, out); + break; + case 29: + internal::__fastpack29(in, out); + break; + case 30: + internal::__fastpack30(in, out); + break; + case 31: + internal::__fastpack31(in, out); + break; + case 32: + internal::__fastpack32(in, out); + break; + case 33: + internal::__fastpack33(in, out); + break; + case 34: + internal::__fastpack34(in, out); + break; + case 35: + internal::__fastpack35(in, out); + break; + case 36: + internal::__fastpack36(in, out); + break; + case 37: + internal::__fastpack37(in, out); + break; + case 38: + internal::__fastpack38(in, out); + break; + case 39: + internal::__fastpack39(in, out); + break; + case 40: + internal::__fastpack40(in, out); + break; + case 41: + internal::__fastpack41(in, out); + break; + case 42: + internal::__fastpack42(in, out); + break; + case 43: + internal::__fastpack43(in, out); + break; + case 44: + internal::__fastpack44(in, out); + break; + case 45: + internal::__fastpack45(in, out); + break; + case 46: + internal::__fastpack46(in, out); + break; + case 47: + internal::__fastpack47(in, out); + break; + case 48: + internal::__fastpack48(in, out); + break; + case 49: + internal::__fastpack49(in, out); + break; + case 50: + internal::__fastpack50(in, out); + break; + case 51: + internal::__fastpack51(in, out); + break; + case 52: + internal::__fastpack52(in, out); + break; + case 53: + internal::__fastpack53(in, out); + break; + case 54: + internal::__fastpack54(in, out); + break; + 
case 55: + internal::__fastpack55(in, out); + break; + case 56: + internal::__fastpack56(in, out); + break; + case 57: + internal::__fastpack57(in, out); + break; + case 58: + internal::__fastpack58(in, out); + break; + case 59: + internal::__fastpack59(in, out); + break; + case 60: + internal::__fastpack60(in, out); + break; + case 61: + internal::__fastpack61(in, out); + break; + case 62: + internal::__fastpack62(in, out); + break; + case 63: + internal::__fastpack63(in, out); + break; + case 64: + internal::__fastpack64(in, out); + break; + default: + throw std::logic_error("Invalid bit width for bitpacking"); + } +} +} // namespace fastpforlib diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index fe28afba8be..baf625965d8 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -199,6 +199,7 @@ target_link_libraries (tiflash_common_io libsymbolization ${RE2_LIBRARY} ${RE2_ST_LIBRARY} + fastpforlib ) target_include_directories (tiflash_common_io BEFORE PRIVATE ${kvClient_SOURCE_DIR}/include) diff --git a/dbms/src/Common/BitpackingPrimitives.h b/dbms/src/Common/BitpackingPrimitives.h new file mode 100644 index 00000000000..f12dc6d017a --- /dev/null +++ b/dbms/src/Common/BitpackingPrimitives.h @@ -0,0 +1,330 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +/** + * class BitpackingPrimitives provides basic functions to do bitpacking. 
+ * Signed integers are not zigzag-encoded here: packGroup hands them to fastpforlib reinterpreted + * as unsigned, and unPackGroup sign-extends them again unless skip_sign_extension is set. + */ +class BitpackingPrimitives +{ +public: + static constexpr const size_t BITPACKING_ALGORITHM_GROUP_SIZE = 32; + static constexpr const bool BYTE_ALIGNED = false; + + // Packs `count` values from src into dst using `width` bits per value, one group of + // BITPACKING_ALGORITHM_GROUP_SIZE values at a time. + // To ensure enough data is available, use getRequiredSize() to determine the correct size for dst buffer + // Note: input should be aligned to BITPACKING_ALGORITHM_GROUP_SIZE for good performance. + template + static void packBuffer(unsigned char * dst, const T * src, size_t count, UInt8 width) + { + if constexpr (ASSUME_INPUT_ALIGNED) + { + for (size_t i = 0; i < count; i += BITPACKING_ALGORITHM_GROUP_SIZE) + { + packGroup(dst + (i * width) / 8, src + i, width); + } + } + else + { + size_t misaligned_count = count % BITPACKING_ALGORITHM_GROUP_SIZE; + count -= misaligned_count; + for (size_t i = 0; i < count; i += BITPACKING_ALGORITHM_GROUP_SIZE) + { + packGroup(dst + (i * width) / 8, src + i, width); + } + // Input was not aligned to BITPACKING_ALGORITHM_GROUP_SIZE, we need a copy + if (misaligned_count) + { + // Zero-initialized so the slots past misaligned_count hold defined values when the whole group is packed. + T tmp_buffer[BITPACKING_ALGORITHM_GROUP_SIZE] = {}; // TODO: maybe faster on the heap? 
+ memcpy(tmp_buffer, src + count, misaligned_count * sizeof(T)); + packGroup(dst + (count * width) / 8, tmp_buffer, width); + } + } + } + + // Unpacks `count` values, one group of BITPACKING_ALGORITHM_GROUP_SIZE values per iteration. + // Whole groups are always written, so dst needs room for count rounded up to the group size + // (see roundUpToAlgorithmGroupSize); src is assumed to hold the matching packed groups. + template + static void unPackBuffer( + unsigned char * dst, + const unsigned char * src, + size_t count, + UInt8 width, + bool skip_sign_extension = false) + { + for (size_t i = 0; i < count; i += BITPACKING_ALGORITHM_GROUP_SIZE) + { + unPackGroup(dst + i * sizeof(T), src + (i * width) / 8, width, skip_sign_extension); + } + } + + // Packs a block of BITPACKING_ALGORITHM_GROUP_SIZE values + template + static void packBlock(unsigned char * dst, const T * src, UInt8 width) + { + return packGroup(dst, src, width); + } + + // Unpacks a block of BITPACKING_ALGORITHM_GROUP_SIZE values + template + static void unPackBlock( + unsigned char * dst, + const unsigned char * src, + UInt8 width, + bool skip_sign_extension = false) + { + return unPackGroup(dst, src, width, skip_sign_extension); + } + + // Calculates the minimum required number of bits that can store the single given value + template ::is_signed> + constexpr static UInt8 minimumBitWidth(T value) + { + return findMinimumBitWidth(value, value); + } + + // Calculates the minimum required number of bits per value that can store all `count` values + template ::is_signed> + constexpr static UInt8 minimumBitWidth(const T * values, size_t count) + { + return findMinimumBitWidth(values, count); + } + + // Calculates the minimum required number of bits per value that can store all values, + // given a predetermined minimum and maximum value of the buffer + template ::is_signed> + constexpr static UInt8 minimumBitWidth(T minimum, T maximum) + { + return findMinimumBitWidth(minimum, maximum); + } + + // Returns the number of bytes needed to pack `count` values at `width` bits each, + // with count first rounded up to a whole group. + constexpr static size_t getRequiredSize(size_t count, UInt8 width) + { + count = roundUpToAlgorithmGroupSize(count); + return ((count * width) / 8); + } + + // round up to nearest 
multiple of BITPACKING_ALGORITHM_GROUP_SIZE + template + constexpr static T roundUpToAlgorithmGroupSize(T num_to_round) + { + static_assert( + (BITPACKING_ALGORITHM_GROUP_SIZE & (BITPACKING_ALGORITHM_GROUP_SIZE - 1)) == 0, + "BITPACKING_ALGORITHM_GROUP_SIZE must be a power of 2"); + constexpr T mask = BITPACKING_ALGORITHM_GROUP_SIZE - 1; + return (num_to_round + mask) & ~mask; + } + +private: + // Scans values[0 .. count) for its maximum (and, for signed types, its minimum) and returns + // the bit width needed to store every element. An empty range yields width 0. + template + constexpr static UInt8 findMinimumBitWidth(const T * values, size_t count) + { + // With no elements, values[0] and the iterator returned by std::max_element must not be dereferenced. + if (count == 0) + { + return 0; + } + T min_value = values[0]; + T max_value = *std::max_element(values, values + count); + if constexpr (is_signed) + { + min_value = *std::min_element(values, values + count); + } + return findMinimumBitWidth(min_value, max_value); + } + + template + constexpr static UInt8 findMinimumBitWidth(T min_value, T max_value) + { + UInt8 bitwidth; + T value; + + if constexpr (is_signed) + { + if (min_value == std::numeric_limits::min()) + { + // handle special case of the minimal value, as it cannot be negated like all other values. 
+ return sizeof(T) * 8; + } + else + { + value = std::max(static_cast(-min_value), max_value); + } + } + else + { + value = max_value; + } + + if (value == 0) + { + return 0; + } + + if constexpr (is_signed) + { + bitwidth = 1; + } + else + { + bitwidth = 0; + } + + while (value) + { + bitwidth++; + value >>= 1; + } + + bitwidth = getEffectiveWidth(bitwidth); + + // Assert results are correct + if (bitwidth < sizeof(T) * 8 && bitwidth != 0) + { + if constexpr (is_signed) + { + RUNTIME_ASSERT(max_value <= (T(1) << (bitwidth - 1)) - 1); + RUNTIME_ASSERT(min_value >= (T(-1) * ((T(1) << (bitwidth - 1)) - 1) - 1)); + } + else + { + RUNTIME_ASSERT(max_value <= (T(1) << (bitwidth)) - 1); + } + } + if constexpr (round_to_next_byte) + { + return (bitwidth / 8 + (bitwidth % 8 != 0)) * 8; + } + return bitwidth; + } + + // Sign bit extension + template ::type> + static void signExtend(unsigned char * dst, UInt8 width) + { + T const mask = static_cast(TU(1) << (width - 1)); + for (size_t i = 0; i < BitpackingPrimitives::BITPACKING_ALGORITHM_GROUP_SIZE; ++i) + { + T value = unalignedLoad(dst + i * sizeof(T)); + value = static_cast(value & ((TU(1) << width) - TU(1))); + T result = (value ^ mask) - mask; + unalignedStore(dst + i * sizeof(T), result); + } + } + + // Prevent compression at widths that are ineffective + // Ineffective means that the width is greater than 7/8 of the original width. 
+ template + constexpr static UInt8 getEffectiveWidth(UInt8 width) + { + UInt8 bits_of_type = sizeof(T) * 8; + UInt8 type_size = sizeof(T); + if (width + type_size > bits_of_type) + { + return bits_of_type; + } + return width; + } + + template + static void packGroup(unsigned char * dst, const T * values, UInt8 width) + { + if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastpack( + reinterpret_cast(values), + reinterpret_cast(dst), + static_cast(width)); + } + else if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastpack( + reinterpret_cast(values), + reinterpret_cast(dst), + static_cast(width)); + } + else if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastpack( + reinterpret_cast(values), + reinterpret_cast(dst), + static_cast(width)); + } + else if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastpack( + reinterpret_cast(values), + reinterpret_cast(dst), + static_cast(width)); + } + else + { + // TODO: use static_assert(false, xxx) instead until the toolchain upgrade to clang 17.0 + static_assert(sizeof(T *) == 0, "Unsupported type for bitpacking"); + } + } + + template + static void unPackGroup( + unsigned char * dst, + const unsigned char * src, + UInt8 width, + bool skip_sign_extension = false) + { + if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastunpack( + reinterpret_cast(src), + reinterpret_cast(dst), + static_cast(width)); + } + else if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastunpack( + reinterpret_cast(src), + reinterpret_cast(dst), + static_cast(width)); + } + else if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastunpack( + reinterpret_cast(src), + reinterpret_cast(dst), + static_cast(width)); + } + else if constexpr (std::is_same::value || std::is_same::value) + { + fastpforlib::fastunpack( + reinterpret_cast(src), + 
reinterpret_cast(dst), + static_cast(width)); + } + else + { + // TODO: use static_assert(false, xxx) instead until the toolchain upgrade to clang 17.0 + static_assert(sizeof(T *) == 0, "Unsupported type for bitpacking"); + } + + if (std::numeric_limits::is_signed && !skip_sign_extension && width > 0 && width < sizeof(T) * 8) + { + signExtend(dst, width); + } + } +}; + +} // namespace DB diff --git a/dbms/src/Common/tests/gtest_bitpacking_primitives.cpp b/dbms/src/Common/tests/gtest_bitpacking_primitives.cpp new file mode 100644 index 00000000000..03a7a0c2a36 --- /dev/null +++ b/dbms/src/Common/tests/gtest_bitpacking_primitives.cpp @@ -0,0 +1,240 @@ +// Copyright 2024 PingCAP, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +/* Tests for BitpackingPrimitives: the three call forms of minimumBitWidth, getRequiredSize, and packBuffer/unPackBuffer round trips. */ + +namespace DB::tests +{ +/* Empty fixture; it only provides the suite name used by the TEST_F cases below. */ +class TestBitpackingPrimitives : public testing::Test +{ +}; +/* Single-value form: 0 is expected to need 0 bits and an unsigned value as many bits as the position of its highest set bit; the last four cases cover signed inputs. */ +TEST_F(TestBitpackingPrimitives, TestMinimumBitWidthSingle) +try +{ + { + const UInt8 value = 0b0000'0000; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 0); + } + { + const UInt8 value = 0b0000'0001; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 1); + } + { + const UInt8 value = 0b0000'0010; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 2); + } + { + const UInt16 value = 0b0000'0010'0000'0100; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 10); + } + { + const UInt32 value = 0b0000'0010'0000'0100'0000'0100'0000'0100; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 26); + } + { + const UInt64 value = 0b1000'0010'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 64); + } + { + const Int8 value = -1; // expected width 2; NOTE(review): presumably 1 magnitude bit plus 1 sign bit - confirm against minimumBitWidth + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 2); + } + { + const Int16 value = -3; // expected width 3; NOTE(review): presumably 2 magnitude bits plus 1 sign bit - confirm against minimumBitWidth + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 3); + } + { + const Int32 value = -7; // just like above + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 4); + } + { + const Int64 value = 10; // just like above + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(value), 5); + } +} +CATCH +/* (min, max) form: checks unsigned and signed ranges, up to the full Int64 range which is expected to need 64 bits. */ +TEST_F(TestBitpackingPrimitives, TestMinimumBitWidthMinMax) +try +{ + { + const UInt8 min = 0b0000'0000; + const UInt8 max = 0b0000'0001; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 1); + } + { + const UInt16 min = 0b0000'0000'0000'0100; + const UInt16 max = 0b1000'0000'0000'0100; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 16); + } + { + const UInt32 min = 0b0000'0000'0000'0101'0000'0100'0000'0100; + const UInt32 max = 
0b1000'0101'0000'0100'0000'0100'0000'0100; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 32); + } + { + const UInt64 min = 0b0000'0010'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100; + const UInt64 max = 0b1000'0010'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 64); + } + { + const Int8 min = -1; + const Int8 max = 10; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 5); + } + { + const Int16 min = -7; + const Int16 max = 10; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 5); + } + { + const Int32 min = 8; + const Int32 max = 10; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 5); + } + { + const Int64 min = std::numeric_limits::min(); + const Int64 max = std::numeric_limits::max(); + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(min, max), 64); + } +} +CATCH +/* (pointer, count) form: in every case the expected result matches the width needed by the widest element of the array. */ +TEST_F(TestBitpackingPrimitives, TestMinimumBitWidthMultiple) +try +{ + { + const UInt8 values[3] = {0b0000'0000, 0b0000'0001, 0b0000'0010}; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 2); + } + { + const UInt16 values[3] = {0b0000'0010'0000'0100, 0b0000'0010'1000'0100, 0b0000'0000'0000'0100}; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 10); + } + { + const UInt32 values[3] = { + 0b0000'0010'0000'0100'0000'0100'0000'0100, + 0b0000'0011'0000'0100'0000'0100'0000'0100, + 0b0000'0000'0000'0100'0000'0100'0000'0100, + }; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 26); + } + { + const UInt64 values[3] = { + 0b1000'0010'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100, + 0b0000'0011'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100, + 0b0000'0000'0000'0100'0000'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000, + }; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 64); + } + { + const Int8 values[3] = {-1, -7, 10}; + 
ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 5); + } + { + const Int16 values[3] = {-1, -7, 10}; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 5); + } + { + const Int32 values[3] = {-1, -7, 10}; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 5); + } + { + const Int64 values[3] = {-1, -7, std::numeric_limits::max()}; + ASSERT_EQ(BitpackingPrimitives::minimumBitWidth(values, 3), 64); + } +} +CATCH +/* getRequiredSize(count, width): a count of 1 or 2 is expected to give the same size as a count of 32, so counts appear to be rounded up to a group of 32 values; the expected numbers equal 32 * width / 8, presumably a size in bytes. */ + +TEST_F(TestBitpackingPrimitives, TestGetRequiredSize) +try +{ + // 32 as a group + ASSERT_EQ(BitpackingPrimitives::getRequiredSize(1, 8), 32); + ASSERT_EQ(BitpackingPrimitives::getRequiredSize(32, 8), 32); + ASSERT_EQ(BitpackingPrimitives::getRequiredSize(2, 7), 28); + ASSERT_EQ(BitpackingPrimitives::getRequiredSize(32, 7), 28); + ASSERT_EQ(BitpackingPrimitives::getRequiredSize(2, 10), 40); + ASSERT_EQ(BitpackingPrimitives::getRequiredSize(32, 10), 40); +} +CATCH +/* Round trip: pack each array with its minimum width, unpack it again, and compare the first size decoded elements with the input. */ +TEST_F(TestBitpackingPrimitives, TestpackBuffer) +try +{ + auto test_func = [](const T * values, size_t size) { + const auto width = BitpackingPrimitives::minimumBitWidth(values, size); + const auto length = BitpackingPrimitives::getRequiredSize(size, width); + unsigned char buffer[length]; /* NOTE(review): length is computed at run time, so this is a variable-length array, which is a compiler extension rather than standard C++; the same applies to decoded below. */ + BitpackingPrimitives::packBuffer(buffer, values, size, width); + // dest buffer should be rounded up to group size + const auto round_count = BitpackingPrimitives::roundUpToAlgorithmGroupSize(size); + unsigned char decoded[sizeof(T) * round_count]; + BitpackingPrimitives::unPackBuffer(decoded, buffer, size, width); + for (size_t i = 0; i < size; ++i) + { + auto decode_value = unalignedLoad(decoded + i * sizeof(T)); + ASSERT_EQ(decode_value, values[i]); + } + }; + + { + const UInt8 values[3] = {0b0000'0000, 0b0000'0001, 0b0000'0010}; + test_func(values, 3); + } + { + const UInt16 values[3] = {0b0000'0010'0000'0100, 0b0000'0010'1000'0100, 0b0000'0000'0000'0100}; + test_func(values, 3); + } + { + const UInt32 values[3] = { + 0b0000'0010'0000'0100'0000'0100'0000'0100, + 
0b0000'0011'0000'0100'0000'0100'0000'0100, + 0b0000'0000'0000'0100'0000'0100'0000'0100, + }; + test_func(values, 3); + } + { + const UInt64 values[3] = { + 0b1000'0010'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100, + 0b0000'0011'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100, + 0b0000'0000'0000'0100'0000'0000'0100'0000'0100'0000'0100'0000'0100'0000'0100'0000, + }; + test_func(values, 3); + } + {/* Signed inputs: negative values are expected to compare equal to the originals after the round trip. */ + const Int8 values[3] = {-1, -7, 10}; + test_func(values, 3); + } + { + const Int16 values[3] = {-1, -7, 10}; + test_func(values, 3); + } + { + const Int32 values[3] = {-1, -7, 10}; + test_func(values, 3); + } + { + const Int64 values[3] = {-1, -7, std::numeric_limits::max()}; + test_func(values, 3); + } +} +CATCH + +} // namespace DB::tests