Slow comparison to STL (shouldn't be...) #231

jrcavani · 2024-01-13T00:49:55Z

jrcavani
Jan 13, 2024

Hi. I have been a user of this repo since 2022. Today for a different project, I finally sat down and ran some quick benchmarks on 100M insertions. But to my surprise, the STL unordered_map is a lot faster than the flat_hash_map or parallel_flat_hash_map. I must be doing something wrong here.

#include <iostream>
#include <unordered_map>
#include <parallel_hashmap/phmap.h>

using namespace std;

static int N = 100000000;

int main() {
    {
        std::unordered_map<int, int> map;
        map.reserve(N);
        auto t = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < N; i++) {
            map[i] = i;
        }
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t).count();
        std::cout << "STL: Done in " << duration << "s." << std::endl;
    }

    {
        phmap::flat_hash_map<int, int> map;
        map.reserve(N);
        auto t = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < N; i++) {
            map[i] = i;
        }
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t).count();
        std::cout << "flat_hash_map: Done in " << duration << "s." << std::endl;
    }

    {
        phmap::parallel_flat_hash_map<
                        int,
                        int,
                        phmap::priv::hash_default_hash<int>,
                        phmap::priv::hash_default_eq<int>,
                        phmap::priv::Allocator<phmap::priv::Pair<const int, int>>,
                        6,
                        std::mutex> map;
        map.reserve(N);
        auto t = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < N; i++) {
            map[i] = i;
        }
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t).count();
        std::cout << "parallel_flat_hash_map: Done in " << duration << "s." << std::endl;
    }

    return 0;
}

Added to CMakeLists.txt:

# Example programs. Everything in this block is configured only when
# PHMAP_BUILD_EXAMPLES is enabled.
if (PHMAP_BUILD_EXAMPLES)
    # Append compiler warning options to the global CMAKE_CXX_FLAGS:
    # a GCC/Clang-style -W list for non-MSVC toolchains, /W4 for MSVC.
    if(NOT MSVC)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wall -Wextra -Wcast-align -Wcast-qual -Wdisabled-optimization -Winit-self -Wlogical-op -Wmissing-include-dirs -Woverloaded-virtual -Wredundant-decls -Wshadow -Wstrict-null-sentinel  -Wswitch-default -Wno-unused -Wno-unknown-warning-option -Wno-gnu-zero-variadic-macro-arguments")
    else()
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /Zc:__cplusplus")
    endif()

    # Locate the platform thread library; it provides the Threads::Threads
    # imported target that is linked further down.
    set(THREADS_PREFER_PTHREAD_FLAG ON)
    find_package(Threads REQUIRED)

    # One executable per example source. phmap.natvis is listed as a source
    # of every target.
    add_executable(ex_allmaps examples/allmaps.cc phmap.natvis)
    add_executable(ex_basic examples/basic.cc phmap.natvis)
    add_executable(ex_bench examples/bench.cc phmap.natvis)
    add_executable(ex_emplace examples/emplace.cc phmap.natvis)
    # ex_lazy_emplace_l is only built with MSVC.
    if (MSVC)
        add_executable(ex_lazy_emplace_l examples/lazy_emplace_l.cc phmap.natvis)
    endif()
    add_executable(ex_serialize examples/serialize.cc phmap.natvis)
    # ex_serialize additionally searches ../cereal/include (relative to this
    # source directory) for headers.
    target_include_directories(ex_serialize PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../cereal/include>)
    add_executable(ex_hash_std examples/hash_std.cc phmap.natvis)
    add_executable(ex_hash_value examples/hash_value.cc phmap.natvis)
    add_executable(ex_hash examples/hash.cc phmap.natvis)
    add_executable(ex_two_files examples/f1.cc examples/f2.cc phmap.natvis)
    add_executable(ex_insert_bench examples/insert_bench.cc phmap.natvis)
    add_executable(ex_knucleotide examples/knucleotide.cc phmap.natvis)
    add_executable(ex_dump_load examples/dump_load.cc phmap.natvis)
    add_executable(ex_btree examples/btree.cc phmap.natvis)
    add_executable(ex_matt examples/matt.cc phmap.natvis)

    # Only these two examples are linked against the thread library.
    target_link_libraries(ex_knucleotide Threads::Threads)
    target_link_libraries(ex_bench Threads::Threads)

    # Benchmark target built from test_benchmark.cpp at the top of the source
    # tree. It is not linked against Threads::Threads.
    add_executable(test_benchmark test_benchmark.cpp phmap.natvis)
endif()

cmake -DCMAKE_BUILD_TYPE=Release ..

parallel-hashmap-1.3.8/build# ./test_benchmark
STL: Done in 2s.
flat_hash_map: Done in 5s.
parallel_flat_hash_map: Done in 8s.

I tested using version 1.3.8, but I assume newer versions show similar behavior.

greg7mdp · 2024-01-22T01:23:03Z

greg7mdp
Jan 22, 2024
Maintainer

Interesting. You happen to use a very special case in your test, where the key space maps exactly to the map (indeed, you could use a std::vector instead of the std::unordered_map and it would be even faster).

The regular std::hash hashing function hashes the integer as itself, meaning that in your example there are no collisions in the unordered_map case.

phmap adds extra mixing, which is very helpful if the hash function is not very good, but that causes it to not benefit from this interesting property.

I have slightly modified your example, where instead of inserting keys 0 through N, I insert random integer keys, which is probably a more realistic example.

When doing this, phmap is faster.

STL: Done in 24s.
flat_hash_map: Done in 12s.
parallel_flat_hash_map: Done in 15s.

#include <iostream>
#include <unordered_map>
#include <parallel_hashmap/phmap.h>
#include <random>

using namespace std;

static int N = 100000000;

// Inserts N randomly keyed entries into std::unordered_map,
// phmap::flat_hash_map and phmap::parallel_flat_hash_map in turn, and prints
// the whole-second duration of each insertion loop to std::cout.
//
// Capacity is reserved before each clock starts, so only the insertions are
// timed. Each container lives in its own scope so it is destroyed before the
// next one is built. All three loops draw from the same generator object, so
// each container sees a different stretch of the random stream rather than an
// identical key sequence.
int main() {


    std::random_device rd;  // a seed source for the random number engine
    std::mt19937 gen(rd()); // mersenne_twister_engine seeded with rd()
    // Widen N before multiplying: evaluated in `int`, 100000000 * 27 exceeds
    // INT_MAX, and signed integer overflow is undefined behaviour.
    std::uniform_int_distribution<size_t> u (0, static_cast<size_t>(N) * 27);

    {
        std::unordered_map<int, int> map;
        map.reserve(N);
        auto t = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < N; i++) {
            // The draw can exceed INT_MAX; the explicit cast documents the
            // narrowing to the int key type (such values wrap to negative
            // keys on mainstream compilers, and are still spread out).
            map[static_cast<int>(u(gen))] = i;
        }
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t).count();
        std::cout << "STL: Done in " << duration << "s." << std::endl;
    }

    {
        phmap::flat_hash_map<int, int> map;
        map.reserve(N);
        auto t = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < N; i++) {
            map[static_cast<int>(u(gen))] = i;
        }
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t).count();
        std::cout << "flat_hash_map: Done in " << duration << "s." << std::endl;
    }

    {
        // NOTE(review): 6 and std::mutex are presumably the submap-count
        // exponent and the per-submap lock type; confirm against the phmap
        // documentation.
        phmap::parallel_flat_hash_map<
                        int,
                        int,
                        phmap::priv::hash_default_hash<int>,
                        phmap::priv::hash_default_eq<int>,
                        phmap::priv::Allocator<phmap::priv::Pair<const int, int>>,
                        6,
                        std::mutex> map;
        map.reserve(N);
        auto t = std::chrono::high_resolution_clock::now();
        for (int i = 0; i < N; i++) {
            map[static_cast<int>(u(gen))] = i;
        }
        auto duration = std::chrono::duration_cast<std::chrono::seconds>(std::chrono::high_resolution_clock::now() - t).count();
        std::cout << "parallel_flat_hash_map: Done in " << duration << "s." << std::endl;
    }

    return 0;
}

2 replies

greg7mdp Jan 22, 2024
Maintainer

BTW, thanks for using phmap. It is designed to be a solid hash container that works well in real-life use; it is not optimized for benchmarks.

jrcavani Jan 22, 2024
Author

Ah, good to consider the key distribution pattern. Thank you!

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Slow comparison to STL (shouldn't be...) #231

{{title}}

Replies: 1 comment 2 replies

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

Select a reply

Slow comparison to STL (shouldn't be...) #231

jrcavani Jan 13, 2024

Replies: 1 comment · 2 replies

greg7mdp Jan 22, 2024 Maintainer

greg7mdp Jan 22, 2024 Maintainer

jrcavani Jan 22, 2024 Author

jrcavani
Jan 13, 2024

Replies: 1 comment 2 replies

greg7mdp
Jan 22, 2024
Maintainer

greg7mdp Jan 22, 2024
Maintainer

jrcavani Jan 22, 2024
Author