From 5c5ba90555424fe972b3c1a36ca97348b900da6f Mon Sep 17 00:00:00 2001 From: Larry Gritz Date: Fri, 19 Jul 2024 11:32:38 -0400 Subject: [PATCH] feat(ustring): ustring hash collision protection The gist is that the ustring::strhash(str) function is modified to strip out the MSB from Strutil::strhash. The rep entry is filed in the ustring table based on this hash. So effectively, the computed hash is 63 bits, not 64. But rep->hashed field consists of the lower 63 bits being the computed hash, and the MSB indicates whether this is the 2nd (or more) entry in the table that had the same 63 bit hash. ustring::hash() then is modified as follows: If the MSB is 0, the computed hash is the hash. If the MSB is 1, though, we DON'T use that hash, and instead we use the pointer to the unique characters, but with the MSB set (that's an invalid address by itself). Note that the computed hashes never have MSB set, and the char*+MSB always have MSB set, so therefore ustring::hash() will never have the same value for two different ustrings. But -- please note! -- that ustring::strhash(str) and ustring(str).hash() will only match (and also be the same value on every execution) if the ustring is the first to receive that hash, which should be approximately always. Probably always, in practice. But in the very improbable case of a hash collision, one of them (the second to be turned into a ustring) will be using the alternate hash based on the character address, which is both not the same as ustring::strhash(chars), nor is it expected to be the same constant on every program execution. Signed-off-by: Larry Gritz --- src/include/OpenImageIO/ustring.h | 68 ++++++-- src/libutil/ustring.cpp | 256 +++++++++++++----------------- src/libutil/ustring_test.cpp | 19 ++- 3 files changed, 174 insertions(+), 169 deletions(-) diff --git a/src/include/OpenImageIO/ustring.h b/src/include/OpenImageIO/ustring.h index fb45c075d5..82785cd999 100644 --- a/src/include/OpenImageIO/ustring.h +++ b/src/include/OpenImageIO/ustring.h @@ -32,6 +32,7 @@ OIIO_NAMESPACE_BEGIN #define OIIO_USTRING_HAS_CTR_FROM_USTRINGHASH 1 #define OIIO_USTRING_HAS_STDHASH 1 #define OIIO_HAS_USTRINGHASH_FORMATTER 1 +#define OIIO_USTRING_SAFE_HASH 1 class ustringhash; // forward declaration @@ -123,6 +124,16 @@ class ustringhash; // forward declaration /// - if you don't need to do a lot of string assignment or equality /// testing, but lots of more complex string manipulation. /// +/// The ustring implementation guarantees that no two ustrings return the same +/// value for hash() (despite the slim probability that two strings could +/// numerically hash to the same value). For the first ustring added with a +/// given hash, u.hash() will be the same value as ustring::strhash(chars), +/// and will deterministically be the same on every execution. In the very +/// improbable case of a hash collision, subsequent ustrings with the same +/// numeric hash will use an alternate hash based on the character address, +/// which is both not the same as ustring::strhash(chars), nor is it expected +/// to be the same constant on every program execution. + class OIIO_UTIL_API ustring { public: using rep_t = const char*; ///< The underlying representation type @@ -288,11 +299,7 @@ class OIIO_UTIL_API ustring { /// Return a C++ std::string representation of a ustring. const std::string& string() const noexcept { - if (m_chars) { - const TableRep* rep = (const TableRep*)m_chars - 1; - return rep->str; - } else - return empty_std_string; + return m_chars ? rep()->str : empty_std_string; } /// Reset to an empty string. @@ -303,17 +310,27 @@ class OIIO_UTIL_API ustring { { if (!m_chars) return 0; - const TableRep* rep = ((const TableRep*)m_chars) - 1; - return rep->length; + return rep()->length; } - /// Return a hashed version of the string + /// ustring::strhash() uses Strutil::strhash but clears the MSB. + static OIIO_HOSTDEVICE constexpr hash_t strhash(string_view str) + { + return Strutil::strhash(str) & hash_mask; + } + + /// Return a hashed version of the string. To guarantee unique hashes, + /// we check if the "duplicate bit" of the hash is set. If not, then + /// we just return the hash which we know is unique. But if that bit + /// is set, we utilize the unique character address. hash_t hash() const noexcept { if (!m_chars) return 0; - const TableRep* rep = ((const TableRep*)m_chars) - 1; - return rep->hashed; + hash_t h = rep()->hashed; + return OIIO_LIKELY((h & duplicate_bit) == 0) + ? h + : hash_t(m_chars) | duplicate_bit; } /// Return a hashed version of the string @@ -749,6 +766,8 @@ class OIIO_UTIL_API ustring { // if you know the rep, the chars are at (char *)(rep+1), and if you // know the chars, the rep is at ((TableRep *)chars - 1). struct TableRep { + // hashed has the MSB set if and only if this is the second or + // greater ustring to have the same hash. hash_t hashed; // precomputed Hash value std::string str; // String representation size_t length; // Length of the string; must be right before cap @@ -757,10 +776,29 @@ class OIIO_UTIL_API ustring { TableRep(string_view strref, hash_t hash); ~TableRep(); const char* c_str() const noexcept { return (const char*)(this + 1); } + constexpr bool unique_hash() const + { + return (hashed & duplicate_bit) == 0; + } }; + // duplicate_bit is a 1 in the MSB, which if set indicates a hash that + // is a duplicate. + static constexpr hash_t duplicate_bit = hash_t(1) << 63; + // hash_mask is what you `&` with hashed to get the real hash (clearing + // the duplicate bit). +#if 1 + static constexpr hash_t hash_mask = ~duplicate_bit; +#else + // Alternate to force lots of hash collisions for testing purposes: + static constexpr hash_t hash_mask = ~duplicate_bit & 0xffff; +#endif + bool has_unique_hash() const { return rep()->unique_hash(); } + private: static std::string empty_std_string; + + const TableRep* rep() const { return ((const TableRep*)m_chars) - 1; } }; @@ -811,7 +849,7 @@ class OIIO_UTIL_API ustringhash { OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str) #ifdef __CUDA_ARCH__ // GPU: just compute the hash. This can be constexpr! - : m_hash(Strutil::strhash(str)) + : m_hash(ustring::strhash(str)) #else // CPU: make ustring, get its hash. Note that ustring ctr can't be // constexpr because it has to modify the internal ustring table. @@ -823,7 +861,7 @@ class OIIO_UTIL_API ustringhash { OIIO_DEVICE_CONSTEXPR explicit ustringhash(const char* str, size_t len) #ifdef __CUDA_ARCH__ // GPU: just compute the hash. This can be constexpr! - : m_hash(Strutil::strhash(len, str)) + : m_hash(ustring::strhash(len, str)) #else // CPU: make ustring, get its hash. Note that ustring ctr can't be // constexpr because it has to modify the internal ustring table. @@ -837,7 +875,7 @@ class OIIO_UTIL_API ustringhash { OIIO_DEVICE_CONSTEXPR explicit ustringhash(string_view str) #ifdef __CUDA_ARCH__ // GPU: just compute the hash. This can be constexpr! - : m_hash(Strutil::strhash(str)) + : m_hash(ustring::strhash(str)) #else // CPU: make ustring, get its hash. Note that ustring ctr can't be // constexpr because it has to modify the internal ustring table. @@ -931,13 +969,13 @@ class OIIO_UTIL_API ustringhash { /// Test for equality with a char*. constexpr bool operator==(const char* str) const noexcept { - return m_hash == Strutil::strhash(str); + return m_hash == ustring::strhash(str); } /// Test for inequality with a char*. constexpr bool operator!=(const char* str) const noexcept { - return m_hash != Strutil::strhash(str); + return m_hash != ustring::strhash(str); } #ifndef __CUDA_ARCH__ diff --git a/src/libutil/ustring.cpp b/src/libutil/ustring.cpp index 211c944e67..6a0e1d48c6 100644 --- a/src/libutil/ustring.cpp +++ b/src/libutil/ustring.cpp @@ -22,12 +22,46 @@ typedef spin_rw_write_lock ustring_write_lock_t; #define PREVENT_HASH_COLLISIONS 1 +// Explanation of ustring hash non-collision guarantees: +// +// The gist is that the ustring::strhash(str) function is modified to +// strip out the MSB from Strutil::strhash. The rep entry is filed in +// the ustring table based on this hash. So effectively, the computed +// hash is 63 bits, not 64. +// +// But rep->hashed field consists of the lower 63 bits being the computed +// hash, and the MSB indicates whether this is the 2nd (or more) entry in +// the table that had the same 63 bit hash. +// +// ustring::hash() then is modified as follows: If the MSB is 0, the +// computed hash is the hash. If the MSB is 1, though, we DON'T use that +// hash, and instead we use the pointer to the unique characters, but +// with the MSB set (that's an invalid address by itself). Note that the +// computed hashes never have MSB set, and the char*+MSB always have MSB +// set, so therefore ustring::hash() will never have the same value for +// two different ustrings. +// +// But -- please note! -- that ustring::strhash(str) and +// ustring(str).hash() will only match (and also be the same value on +// every execution) if the ustring is the first to receive that hash, +// which should be approximately always. Probably always, in practice. +// +// But in the very improbable case of a hash collision, one of them (the +// second to be turned into a ustring) will be using the alternate hash +// based on the character address, which is both not the same as +// ustring::strhash(chars), nor is it expected to be the same constant on +// every program execution. + template struct identity { constexpr T operator()(T val) const noexcept { return val; } }; +namespace { +atomic_ll total_ustring_hash_collisions(0); +} + // #define USTRING_TRACK_NUM_LOOKUPS @@ -71,6 +105,10 @@ template struct TableRepMap { const char* lookup(string_view str, uint64_t hash) { + if (OIIO_UNLIKELY(hash & ustring::duplicate_bit)) { + // duplicate bit is set -- the hash is related to the chars! + return OIIO::bitcast(hash & ustring::hash_mask); + } ustring_read_lock_t lock(mutex); #ifdef USTRING_TRACK_NUM_LOOKUPS // NOTE: this simple increment adds a substantial amount of overhead @@ -81,13 +119,12 @@ template struct TableRepMap { #endif size_t pos = hash & mask, dist = 0; for (;;) { - if (entries[pos] == 0) + ustring::TableRep* e = entries[pos]; + if (e == 0) return 0; - if (entries[pos]->hashed == hash - && entries[pos]->length == str.length() - && strncmp(entries[pos]->c_str(), str.data(), str.length()) - == 0) - return entries[pos]->c_str(); + if (e->hashed == hash && e->length == str.length() + && !strncmp(e->c_str(), str.data(), str.length())) + return e->c_str(); ++dist; pos = (pos + dist) & mask; // quadratic probing } @@ -98,6 +135,10 @@ template struct TableRepMap { // the hash. const char* lookup(uint64_t hash) { + if (OIIO_UNLIKELY(hash & ustring::duplicate_bit)) { + // duplicate bit is set -- the hash is related to the chars! + return OIIO::bitcast(hash & ustring::hash_mask); + } ustring_read_lock_t lock(mutex); #ifdef USTRING_TRACK_NUM_LOOKUPS // NOTE: this simple increment adds a substantial amount of overhead @@ -108,10 +149,11 @@ template struct TableRepMap { #endif size_t pos = hash & mask, dist = 0; for (;;) { - if (entries[pos] == 0) + ustring::TableRep* e = entries[pos]; + if (e == 0) return 0; - if (entries[pos]->hashed == hash) - return entries[pos]->c_str(); + if (e->hashed == hash) + return e->c_str(); ++dist; pos = (pos + dist) & mask; // quadratic probing } @@ -119,27 +161,52 @@ template struct TableRepMap { const char* insert(string_view str, uint64_t hash) { + OIIO_ASSERT((hash & ustring::duplicate_bit) == 0); // can't happen? ustring_write_lock_t lock(mutex); size_t pos = hash & mask, dist = 0; + bool duplicate_hash = false; for (;;) { - if (entries[pos] == 0) + ustring::TableRep* e = entries[pos]; + if (e == 0) break; // found insert pos - if (entries[pos]->hashed == hash - && entries[pos]->length == str.length() - && !strncmp(entries[pos]->c_str(), str.data(), str.length())) { - // same string is already inserted, return the one that is - // already in the table - return entries[pos]->c_str(); + if (e->hashed == hash) { + duplicate_hash = true; + if (e->length == str.length() + && !strncmp(e->c_str(), str.data(), str.length())) { + // same string is already inserted, return the one that is + // already in the table + return e->c_str(); + } } ++dist; pos = (pos + dist) & mask; // quadratic probing } ustring::TableRep* rep = make_rep(str, hash); - entries[pos] = rep; + + // If we encountered another ustring with the same hash (if one + // exists, it would have hashed to the same address so we would have + // seen it), set the duplicate bit in the rep's hashed field. + if (duplicate_hash) { + ++total_ustring_hash_collisions; +#if PREVENT_HASH_COLLISIONS + rep->hashed |= ustring::duplicate_bit; +#endif +#if !defined(NDEBUG) + print("DUPLICATE ustring '{}' hash {:x} c_str {:p} strhash {:x}\n", + rep->c_str(), rep->hashed, rep->c_str(), + ustring::strhash(str)); +#endif + } + + entries[pos] = rep; ++num_entries; if (2 * num_entries > mask) - grow(); // maintain 0.5 load factor + grow(); // maintain 0.5 load factor + // ensure low bit clear + OIIO_DASSERT((size_t(rep->c_str()) & 1) == 0); + // ensure low bit clear + OIIO_DASSERT((size_t(rep->c_str()) & ustring::duplicate_bit) == 0); return rep->c_str(); // rep is now in the table } @@ -283,11 +350,6 @@ struct UstringTable { // This string is here so that we can return sensible values of str when the ustring's pointer is NULL std::string ustring::empty_std_string; -// The reverse map that lets you look up a string by its initial hash. -using ReverseMap - = unordered_map_concurrent, - std::equal_to, 256 /*bins*/>; - namespace { // anonymous @@ -298,19 +360,6 @@ ustring_table() return table; } - -static ReverseMap& -reverse_map() -{ - static OIIO_CACHE_ALIGN ReverseMap rm; - return rm; -} - - -// Keep track of any collisions -static std::vector> all_hash_collisions; -OIIO_CACHE_ALIGN static std::mutex collision_mutex; - } // end anonymous namespace @@ -384,6 +433,7 @@ ustring::TableRep::TableRep(string_view strref, ustring::hash_t hash) && defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI // NEW gcc ABI // FIXME -- do something smart with this. + str = strref; #elif defined(__GNUC__) && !defined(_LIBCPP_VERSION) // OLD gcc ABI @@ -402,7 +452,6 @@ ustring::TableRep::TableRep(string_view strref, ustring::hash_t hash) dummy_refcount = 1; // so it never frees *(const char**)&str = c_str(); OIIO_DASSERT(str.c_str() == c_str() && str.size() == length); - return; #elif defined(_LIBCPP_VERSION) && !defined(__aarch64__) // FIXME -- we seem to do the wrong thing with libcpp on Mac M1. Disable @@ -426,15 +475,17 @@ ustring::TableRep::TableRep(string_view strref, ustring::hash_t hash) ((libcpp_string__long*)&str)->__size_ = length; ((libcpp_string__long*)&str)->__data_ = (char*)c_str(); OIIO_DASSERT(str.c_str() == c_str() && str.size() == length); - return; + } else { + // Short string -- just assign it, since there is no extra allocation. + str = strref; } -#endif - +#else // Remaining cases - just assign the internal string. This may result // in double allocation for the chars. If you care about that, do // something special for your platform, much like we did for gcc and // libc++ above. (Windows users, I'm talking to you.) str = strref; +#endif } @@ -459,11 +510,8 @@ ustring::make_unique(string_view strref) if (!strref.data()) strref = string_view("", 0); - hash_t hash = Strutil::strhash64(strref); - // This line, if uncommented, lets you force lots of hash collisions: - // hash &= ~hash_t(0xffffff); + hash_t hash = ustring::strhash(strref); -#if !PREVENT_HASH_COLLISIONS // Check the ustring table to see if this string already exists. If so, // construct from its canonical representation. // NOTE: all locking is performed internally to the table implementation @@ -477,95 +525,13 @@ ustring::make_unique(string_view strref) // OIIO_ASSERT(strref.find('\0') == string_view::npos && // "ustring::make_unique() does not support embedded nulls"); strref = strref.substr(0, nul); - hash = Strutil::strhash64(strref); + hash = ustring::strhash(strref); result = table.lookup(strref, hash); if (result) return result; } // Strutil::print("ADDED ustring \"{}\" {:08x}\n", strref, hash); return table.insert(strref, hash); - -#else - // Check the ustring table to see if this string already exists with the - // default hash. If so, we're done. This is by far the common case -- - // most lookups already exist in the table, and hash collisions are - // extremely rare. - const char* result = table.lookup(strref, hash); - if (result) - return result; - - // ustring doesn't allow strings with embedded nul characters. Before we - // go any further, trim beyond any nul and rehash. - auto nul = strref.find('\0'); - if (nul != string_view::npos) { - // Strutil::print("ustring::make_unique: string contains nulls @{}/{}: \"{}\"\n", - // strref.find('\0'), strref.size(), strref); - // OIIO_ASSERT(strref.find('\0') == string_view::npos && - // "ustring::make_unique() does not support embedded nulls"); - strref = strref.substr(0, nul); - hash = Strutil::strhash64(strref); - result = table.lookup(strref, hash); - if (result) - return result; - } - - // We did not find it. There are two possibilities: (1) the string is in - // the table but has a different hash because it collided; or (2) the - // string is not yet in the table. - - // Thread safety by locking reverse_map's bin corresponding to our - // original hash. This will prevent any potentially colliding ustring - // from being added to either table. But ustrings whose hashes go to - // different bins of the reverse map (which by definition cannot clash) - // are allowed to be added concurrently. - auto& rm(reverse_map()); - size_t bin = rm.lock_bin(hash); - - hash_t orighash = hash; - size_t binmask = orighash & (~rm.nobin_mask()); - size_t num_rehashes = 0; - - while (1) { - auto rev = rm.find(hash, false); - // rev now either holds an iterator into the reverse map for a - // record that has this hash, or else it's end(). - if (rev == rm.end()) { - // That hash is unused, insert the string with that hash into - // the ustring table, and insert the hash with the unique char - // pointer into the reverse_map. - result = table.insert(strref, hash); - bool ok = rm.insert(hash, result, false); - // Strutil::print("ADDED \"{}\" {:08x}\n", strref, hash); - OIIO_ASSERT(ok && "thread safety failure"); - break; - } - // Something uses this hash. Is it our string? - if (!strncmp(rev->second, strref.data(), strref.size())) { - // It is our string, already in this hash slot! - result = rev->second; - break; - } - // Rehash, but keep the bin bits identical so we always rehash into - // the same (locked) bin. - hash = (hash & binmask) - | (farmhash::Fingerprint(hash) & rm.nobin_mask()); - ++num_rehashes; - // Strutil::print("COLLISION \"{}\" {:08x} vs \"{}\"\n", - // strref, orighash, rev->second); - { - std::lock_guard lock(collision_mutex); - all_hash_collisions.emplace_back(rev->second, rev->first); - } - } - rm.unlock_bin(bin); - - if (num_rehashes) { - std::lock_guard lock(collision_mutex); - all_hash_collisions.emplace_back(result, orighash); - } - - return result; -#endif } @@ -607,27 +573,20 @@ ustring::getstats(bool verbose) size_t n_e = total_ustrings(); size_t mem = memory(); if (verbose) { - out << "ustring statistics:\n"; + print(out, "ustring statistics:\n"); #ifdef USTRING_TRACK_NUM_LOOKUPS - out << " ustring requests: " << ustring_table().get_num_lookups() - << "\n"; -#endif - out << " unique strings: " << n_e << "\n"; - out << " ustring memory: " << Strutil::memformat(mem) << "\n"; -#ifndef NDEBUG - std::vector collisions; - hash_collisions(&collisions); - if (collisions.size()) { - out << " Hash collisions: " << collisions.size() << "\n"; - for (auto c : collisions) - out << Strutil::fmt::format(" {} \"{}\"\n", c.hash(), c); - } + print(out, " ustring requests: {}\n", + ustring_table().get_num_lookups()); #endif + print(out, " unique strings: {}\n", n_e); + print(out, " ustring memory: {}\n", Strutil::memformat(mem)); + print(out, " total ustring hash collisions: {}\n", + (int)total_ustring_hash_collisions); } else { #ifdef USTRING_TRACK_NUM_LOOKUPS - out << "requests: " << ustring_table().get_num_lookups() << ", "; + print(out, "requests: {}, ", ustring_table().get_num_lookups()); #endif - out << "unique " << n_e << ", " << Strutil::memformat(mem); + print(out, "unique {}, {}\n", n_e, Strutil::memformat(mem)); } return out.str(); } @@ -637,11 +596,12 @@ ustring::getstats(bool verbose) size_t ustring::hash_collisions(std::vector* collisions) { - std::lock_guard lock(collision_mutex); - if (collisions) - for (const auto& c : all_hash_collisions) - collisions->emplace_back(ustring::from_unique(c.first)); - return all_hash_collisions.size(); + if (collisions) { + // Disabled for now + // for (const auto& c : all_hash_collisions) + // collisions->emplace_back(ustring::from_unique(c.first)); + } + return size_t(total_ustring_hash_collisions); } diff --git a/src/libutil/ustring_test.cpp b/src/libutil/ustring_test.cpp index 6e0a883401..8c54676828 100644 --- a/src/libutil/ustring_test.cpp +++ b/src/libutil/ustring_test.cpp @@ -265,14 +265,14 @@ create_lotso_ustrings(int iterations) { OIIO_DASSERT(size_t(iterations) <= strings.size()); if (verbose) - Strutil::print("thread {}\n", std::this_thread::get_id()); + print("thread {}\n", std::this_thread::get_id()); size_t h = 0; for (int i = 0; i < iterations; ++i) { ustring s(strings[i].data()); h += s.hash(); } if (verbose) - Strutil::printf("checksum %08x\n", unsigned(h)); + print("checksum {:08x}\n", h); } @@ -310,10 +310,17 @@ verify_no_collisions() size_t ncollisions = ustring::hash_collisions(&collisions); OIIO_CHECK_ASSERT(ncollisions == 0); if (ncollisions) { - Strutil::print(" Hash collisions: {}\n", ncollisions); + print(" Hash collisions: {}\n", ncollisions); for (auto c : collisions) - Strutil::print(" \"{}\" (orig {:08x} rehashed {:08x})\n", c, - Strutil::strhash(c), c.hash()); + print(" \"{}\" (orig {:016x} rehashed {:016x})\n", c, + Strutil::strhash(c), c.hash()); + } + + for (int i = 0; i < 200; ++i) { + ustring u = ustring::fmtformat("{}", i); + print("{}: {:016x} uh {:016x} sh {:016x} (addr {:p}){}\n", u, u.hash(), + ustring::strhash(u.c_str()), Strutil::strhash(u.c_str()), + u.c_str(), u.has_unique_hash() ? "" : " DUP"); } } @@ -330,7 +337,7 @@ main(int argc, char* argv[]) benchmark_threaded_ustring_creation(); verify_no_collisions(); - std::cout << "\n" << ustring::getstats(true) << "\n"; + print("\n{}\n", ustring::getstats(true)); return unit_test_failures; }