diff --git a/Crc32.cpp b/Crc32.cpp index 69e501a..e57dc77 100644 --- a/Crc32.cpp +++ b/Crc32.cpp @@ -587,12 +587,12 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) // put operator for one zero bit in odd odd[0] = Polynomial; // CRC-32 polynomial - for (int i = 1; i < CrcBits; i++) + for (uint32_t i = 1; i < CrcBits; i++) odd[i] = 1 << (i - 1); // put operator for two zero bits in even // same as gf2_matrix_square(even, odd); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = odd[i]; even[i] = 0; @@ -602,11 +602,11 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) } // put operator for four zero bits in odd // same as gf2_matrix_square(odd, even); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = even[i]; odd[i] = 0; - for (int j = 0; vec != 0; j++, vec >>= 1) + for (uint32_t j = 0; vec != 0; j++, vec >>= 1) if (vec & 1) odd[i] ^= even[j]; } @@ -618,7 +618,7 @@ uint32_t crc32_combine(uint32_t crcA, uint32_t crcB, size_t lengthB) for (; lengthB > 0; lengthB >>= 1) { // same as gf2_matrix_square(a, b); - for (int i = 0; i < CrcBits; i++) + for (uint32_t i = 0; i < CrcBits; i++) { uint32_t vec = b[i]; a[i] = 0; diff --git a/Crc32TestMultithreaded.cpp b/Crc32TestMultithreaded.cpp index 0878801..559b0eb 100644 --- a/Crc32TestMultithreaded.cpp +++ b/Crc32TestMultithreaded.cpp @@ -103,7 +103,7 @@ bool testCombine(const char* data, size_t maxBytes = 1024) // check results if (crcAtOnce != crcSequential || crcAtOnce != crcCombined) { - printf("FAILED @ %d: %08X %08X %08X %08X %08X\n", lengthA, crcA, crcB, crcAtOnce, crcSequential, crcCombined); + printf("FAILED @ %zu: %08X %08X %08X %08X %08X\n", lengthA, crcA, crcB, crcAtOnce, crcSequential, crcCombined); ok = false; } } diff --git a/Crc32cl.cpp b/Crc32cl.cpp new file mode 100644 index 0000000..e551097 --- /dev/null +++ b/Crc32cl.cpp @@ -0,0 +1,177 @@ +// ////////////////////////////////////////////////////////// +// Crc32cl.cpp +// Author: Sven Gothel +// Copyright (c) 2021 Gothel Software e.K. (same zlib license from Stephan Brumme) +// Copyright (c) 2016-2019 Stephan Brumme. +// see http://create.stephan-brumme.com/disclaimer.html +// + +#include "Crc32.h" +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#ifdef ENABLE_MT + // Multithreading is incomplete + // - CRC differs, combine works but depending on thread count differs :) + // - Should use fixed thread pool and async ringbuffer for data + // so threads can wait until being signaled EOD + // + // C++11 multithreading + #include + #include + + // ////////////////////////////////////////////////////////// + // run a CRC32 algorithm on multiple threads + typedef uint32_t (*Crc32Algorithm)(const void* data, size_t length, uint32_t previousCrc32); + // compute CRC32 of up to maxBlockSize bytes and start recursively a new thread for excess data + static uint32_t asyncCrc32(Crc32Algorithm myCrc32, const void* data, size_t numBytes, size_t maxBlockSize) + { + // last block ? + if (numBytes <= maxBlockSize) { + return myCrc32(data, numBytes, 0); // we're done + } + + // compute CRC of the remaining bytes in a separate thread + auto dataLeft = (const char*)data + maxBlockSize; + auto bytesLeft = numBytes - maxBlockSize; + auto remainder = std::async(std::launch::async, asyncCrc32, myCrc32, dataLeft, bytesLeft, maxBlockSize); + + // compute CRC of the current block + auto currentCrc = myCrc32(data, maxBlockSize, 0); + // get CRC of the remainder + auto remainderCrc = remainder.get(); + // and merge both + return crc32_combine(currentCrc, remainderCrc, bytesLeft); + } + // call: run(crc32_8bytes, data, NumBytes, 8) if you have an octocore CPU + static uint32_t run(Crc32Algorithm myCrc32, const void* data, size_t numBytes, size_t numThreads = 0) + { + // run on all cores + if (numThreads == 0) { + numThreads = std::thread::hardware_concurrency(); + } + + // split data evenly, rounding up + auto defaultBlocksize = (numBytes + numThreads - 1) / numThreads; + + return asyncCrc32(myCrc32, data, numBytes, defaultBlocksize); + } +#endif // ENABLE_MT + + +// timing +static double seconds() +{ + timespec now; + clock_gettime(CLOCK_REALTIME, &now); + return now.tv_sec + now.tv_nsec / 1000000000.0; +} + +void print_usage(const char*progname) { +#ifdef ENABLE_MT + fprintf(stderr, "Usage %s [-v] [-bsz ] [-threads ] \n", progname); +#else // ENABLE_MT + fprintf(stderr, "Usage %s [-v] [-bsz ] \n", progname); +#endif // ENABLE_MT +} + +int main(int argc, char**argv) +{ + int argi=0; + + bool verbose = false; + int thread_count = 1; // minimum + size_t buffer_size = 20*4096; // minimum 81920 (empiric most efficient on arm64, raspi4 single thread) + + for(int i=1; i= argc ) { + print_usage(argv[0]); + return -1; + } + std::string input_file(argv[++argi]); + std::ifstream in(input_file, std::ios::binary); + if( !in ) { + fprintf(stderr, "Error reading file %s\n", input_file.c_str()); + return -1; + } + std::vector buffer; + + buffer.reserve(buffer_size); + + double startTime; + if( verbose ) { + startTime = seconds(); + fprintf(stderr, " Using: buffer_size %zu, threads %d, ", buffer_size, thread_count); + #ifdef CRC32_USE_LOOKUP_TABLE_SLICING_BY_16 + fprintf(stderr, "LOOKUP_TABLE_SLICING_BY_16\n"); + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_8) + fprintf(stderr, "LOOKUP_TABLE_SLICING_BY_8\n"); + #elif defined(CRC32_USE_LOOKUP_TABLE_SLICING_BY_4) + fprintf(stderr, "LOOKUP_TABLE_SLICING_BY_4\n"); + #elif defined(CRC32_USE_LOOKUP_TABLE_BYTE) + fprintf(stderr, "LOOKUP_TABLE_BYTE\n"); + #else + fprintf(stderr, "HALFBYTE\n"); + #endif + } else { + startTime = 0; + } + + uint32_t res = 0; + uint64_t total = 0; + bool has_more = in.good(); + uint64_t loops = 0; + while( has_more ) { + buffer.resize(buffer.capacity()); + + in.read(reinterpret_cast(buffer.data()), buffer.capacity()); + const size_t got = static_cast(in.gcount()); + buffer.resize(got); + total += got; + has_more = in.good(); +#ifdef ENABLE_MT + if( 1 >= thread_count ) { + res = crc32_fast(reinterpret_cast(buffer.data()), got, res); + } else { + res = run(crc32_fast, reinterpret_cast(buffer.data()), got, thread_count); + } +#else // ENABLE_MT + res = crc32_fast(reinterpret_cast(buffer.data()), got, res); +#endif // ENABLE_MT + loops++; + } + if ( verbose ) { + double duration = seconds() - startTime; + fprintf(stderr, " Duration %.3fs, %.3f MiB/s, %" PRIu64 " KiB/loop, %" PRIu64 " loops\n", duration, (total / (1024*1024)) / duration, (total / 1024) / loops, loops); + } + printf("%08x\n", res); + return 0; +} diff --git a/Makefile b/Makefile index e72d1ea..94b4285 100644 --- a/Makefile +++ b/Makefile @@ -3,24 +3,35 @@ CPP = g++ # files PROGRAM = Crc32Test -LIBS = -lrt +LIBS = +LIBS_mt = -lpthread HEADERS = Crc32.h OBJECTS = Crc32.o Crc32Test.o +OBJECTS2 = Crc32.o Crc32TestMultithreaded.o + +OBJECTS3 = Crc32.o Crc32cl.o + # flags FLAGS = -O3 -Wall -pedantic -s -default: $(PROGRAM) +default: $(PROGRAM) Crc32TestMultithreaded crc32 all: default $(PROGRAM): $(OBJECTS) Makefile $(CPP) $(OBJECTS) $(FLAGS) $(LIBS) -o $(PROGRAM) +Crc32TestMultithreaded: $(OBJECTS2) Makefile + $(CPP) $(OBJECTS2) $(FLAGS) $(LIBS_mt) -o Crc32TestMultithreaded + +crc32: $(OBJECTS3) Makefile + $(CPP) $(OBJECTS3) $(FLAGS) $(LIBS) -o crc32 + %.o: %.cpp $(HEADERS) Makefile $(CPP) $(FLAGS) -c $< -o $@ clean: - -rm -f $(OBJECTS) $(PROGRAM) + -rm -f $(OBJECTS) $(OBJECTS2) $(OBJECTS3) $(PROGRAM) Crc32TestMultithreaded crc32 run: $(PROGRAM) ./$(PROGRAM)