stotko · stotko · Feb 18, 2020 · Feb 18, 2020 · Feb 18, 2020 · Feb 18, 2020
diff --git a/src/stdgpu/bit.h b/src/stdgpu/bit.h
@@ -38,6 +38,24 @@ template <typename T, typename = typename std::enable_if<std::is_unsigned<T>::va
 STDGPU_HOST_DEVICE bool
 ispow2(const T number);
 
+/**
+ * \brief Computes the smallest power of two which is greater than or equal to the given number
+ * \param[in] number A number
+ * \return The smallest power of two which is greater than or equal to the given number
+ */
+template <typename T, typename = typename std::enable_if<std::is_unsigned<T>::value>::type>
+STDGPU_HOST_DEVICE T
+ceil2(const T number);
+
+/**
+ * \brief Computes the largest power of two which is less than or equal to the given number
+ * \param[in] number A number
+ * \return The largest power of two which is less than or equal to the given number, or 0 if the number is 0
+ */
+template <typename T, typename = typename std::enable_if<std::is_unsigned<T>::value>::type>
+STDGPU_HOST_DEVICE T
+floor2(const T number);
+
 /**
  * \brief Computes the modulus of the given number and a power of two divider
  * \param[in] number A number

diff --git a/src/stdgpu/impl/bit_detail.h b/src/stdgpu/impl/bit_detail.h
@@ -68,6 +68,50 @@ ispow2(const T number)
 }
 
 
+template <typename T, typename>
+STDGPU_HOST_DEVICE T
+ceil2(const T number)
+{
+    // Zero is mapped to one so that the decrement below cannot wrap around
+    T result = (number == 0) ? static_cast<T>(1) : number;
+
+    // Smear the highest set bit of (result - 1) into all lower bits; doubling
+    // the shift width needs only log2(digits) steps instead of one step per bit
+    result--;
+    for (index_t shift = 1; shift < stdgpu::numeric_limits<T>::digits; shift *= 2)
+    {
+        result |= result >> shift;
+    }
+    result++;
+
+    // If the result is not representable in T, the increment above wraps around to 0,
+    // so 0 is accepted here in addition to proper powers of two
+    STDGPU_ENSURES(result == 0 || ispow2(result));
+
+    return result;
+}
+
+
+template <typename T, typename>
+STDGPU_HOST_DEVICE T
+floor2(const T number)
+{
+    // Zero has no set bit and therefore no power of two below it; return it unchanged
+    if (number == 0) return 0;
+
+    // Smear the highest set bit into all lower bits using doubling shift widths
+    T result = number;
+    for (index_t shift = 1; shift < stdgpu::numeric_limits<T>::digits; shift *= 2)
+    {
+        result |= result >> shift;
+    }
+    result &= ~(result >> 1); // Clear everything below the highest set bit
+
+    STDGPU_ENSURES(ispow2(result));
+    return result;
+}
+
+
 template <typename T, typename>
 STDGPU_HOST_DEVICE T
 mod2(const T number,

diff --git a/src/stdgpu/impl/unordered_base_detail.cuh b/src/stdgpu/impl/unordered_base_detail.cuh
@@ -43,20 +43,6 @@ namespace stdgpu
 namespace detail
 {
 
-inline index_t
-next_pow2(const index_t capacity)
-{
-    STDGPU_EXPECTS(capacity > 0);
-
-    index_t result = static_cast<index_t>(1) << static_cast<index_t>(std::ceil(std::log2(capacity)));
-
-    STDGPU_ENSURES(result >= capacity);
-    STDGPU_ENSURES(ispow2<std::size_t>(static_cast<std::size_t>(result)));
-
-    return result;
-}
-
-
 inline index_t
 expected_collisions(const index_t bucket_count,
                     const index_t capacity)
@@ -1038,7 +1024,7 @@ unordered_base<Key, Value, KeyFromValue, Hash, KeyEqual>::createDeviceObject(con
     STDGPU_EXPECTS(capacity > 0);
 
     // bucket count depends on default max load factor
-    index_t bucket_count = next_pow2(static_cast<index_t>(std::ceil(static_cast<float>(capacity) / default_max_load_factor())));
+    index_t bucket_count = static_cast<index_t>(stdgpu::ceil2(static_cast<std::size_t>(std::ceil(static_cast<float>(capacity) / default_max_load_factor()))));
 
     // excess count is estimated by the expected collision count and conservatively lowered since entries falling into regular buckets are already included here
     index_t excess_count = std::max<index_t>(1, expected_collisions(bucket_count, capacity) * 2 / 3);

diff --git a/test/stdgpu/bit.cpp b/test/stdgpu/bit.cpp
@@ -51,6 +51,14 @@ template
 STDGPU_HOST_DEVICE bool
 ispow2<unsigned int>(const unsigned int);
 
+template
+STDGPU_HOST_DEVICE unsigned int
+ceil2<unsigned int>(const unsigned int);
+
+template
+STDGPU_HOST_DEVICE unsigned int
+floor2<unsigned int>(const unsigned int);
+
 template
 STDGPU_HOST_DEVICE unsigned int
 mod2<unsigned int>(const unsigned int,
@@ -80,17 +88,17 @@ popcount<unsigned long long int>(const unsigned long long int);
 
 void
 thread_ispow2_random(const stdgpu::index_t iterations,
-                     const std::unordered_set<size_t>& pow2_list)
+                     const std::unordered_set<std::size_t>& pow2_list)
 {
     // Generate true random numbers
-    size_t seed = test_utils::random_thread_seed();
+    std::size_t seed = test_utils::random_thread_seed();
 
     std::default_random_engine rng(seed);
-    std::uniform_int_distribution<size_t> dist(std::numeric_limits<size_t>::lowest(), std::numeric_limits<size_t>::max());
+    std::uniform_int_distribution<std::size_t> dist(std::numeric_limits<std::size_t>::lowest(), std::numeric_limits<std::size_t>::max());
 
     for (stdgpu::index_t i = 0; i < iterations; ++i)
     {
-        size_t number = dist(rng);
+        std::size_t number = dist(rng);
 
         if (pow2_list.find(number) == pow2_list.end())
         {
@@ -102,10 +110,10 @@ thread_ispow2_random(const stdgpu::index_t iterations,
 
 TEST_F(stdgpu_bit, ispow2)
 {
-    std::unordered_set<size_t> pow2_list;
-    for (size_t i = 0; i < 63; ++i)
+    std::unordered_set<std::size_t> pow2_list;
+    for (std::size_t i = 0; i < std::numeric_limits<std::size_t>::digits; ++i)
     {
-        size_t pow2_i = static_cast<size_t>(1) << i;
+        std::size_t pow2_i = static_cast<std::size_t>(1) << i;
 
         ASSERT_TRUE(stdgpu::ispow2(pow2_i));
 
@@ -121,27 +129,104 @@ TEST_F(stdgpu_bit, ispow2)
 }
 
 
+void
+thread_ceil2_random(const stdgpu::index_t iterations)
+{
+    // Generate true random numbers
+    std::size_t seed = test_utils::random_thread_seed();
+
+    // Sample only inputs whose result is representable: larger numbers would overflow,
+    // and zero (covered by ceil2_zero) does not satisfy the strict lower bound below
+    const std::size_t max_number = static_cast<std::size_t>(1) << (std::numeric_limits<std::size_t>::digits - 1);
+
+    std::default_random_engine rng(seed);
+    std::uniform_int_distribution<std::size_t> dist(1, max_number);
+
+    for (stdgpu::index_t i = 0; i < iterations; ++i)
+    {
+        std::size_t number = dist(rng);
+        std::size_t result = stdgpu::ceil2(number);
+
+        EXPECT_TRUE(stdgpu::ispow2(result));
+        EXPECT_GE(result, number);
+        EXPECT_LT(result / 2, number);
+    }
+}
+
+
+TEST_F(stdgpu_bit, ceil2_random)
+{
+    // Iteration count passed along for thread_ceil2_random: 2^19
+    const stdgpu::index_t sample_count = static_cast<stdgpu::index_t>(pow(2, 19));
+
+    test_utils::for_each_concurrent_thread(&thread_ceil2_random, sample_count);
+}
+
+
+TEST_F(stdgpu_bit, ceil2_zero)
+{
+    EXPECT_EQ(stdgpu::ceil2(static_cast<std::size_t>(0)), static_cast<std::size_t>(1)); // Zero rounds up to 1 == 2^0
+}
+
+
+void
+thread_floor2_random(const stdgpu::index_t iterations)
+{
+    // Generate true random numbers
+    std::size_t seed = test_utils::random_thread_seed();
+
+    // Zero is excluded: floor2(0) == 0 would violate the strict lower bound below (see floor2_zero)
+    std::default_random_engine rng(seed);
+    std::uniform_int_distribution<std::size_t> dist(1, std::numeric_limits<std::size_t>::max());
+
+    for (stdgpu::index_t i = 0; i < iterations; ++i)
+    {
+        std::size_t number = dist(rng);
+        std::size_t result = stdgpu::floor2(number);
+
+        EXPECT_TRUE(stdgpu::ispow2(result));
+        EXPECT_LE(result, number);
+        EXPECT_GT(result, number / 2);
+    }
+}
+
+
+TEST_F(stdgpu_bit, floor2_random)
+{
+    // Iteration count passed along for thread_floor2_random: 2^19
+    const stdgpu::index_t sample_count = static_cast<stdgpu::index_t>(pow(2, 19));
+
+    test_utils::for_each_concurrent_thread(&thread_floor2_random, sample_count);
+}
+
+
+TEST_F(stdgpu_bit, floor2_zero)
+{
+    EXPECT_EQ(stdgpu::floor2(static_cast<std::size_t>(0)), static_cast<std::size_t>(0)); // Zero is passed through as 0
+}
+
+
 void
 thread_mod2_random(const stdgpu::index_t iterations,
-                   const size_t divider)
+                   const std::size_t divider)
 {
     // Generate true random numbers
-    size_t seed = test_utils::random_thread_seed();
+    std::size_t seed = test_utils::random_thread_seed();
 
     std::default_random_engine rng(seed);
-    std::uniform_int_distribution<size_t> dist(std::numeric_limits<size_t>::lowest(), std::numeric_limits<size_t>::max());
+    std::uniform_int_distribution<std::size_t> dist(std::numeric_limits<std::size_t>::lowest(), std::numeric_limits<std::size_t>::max());
 
     for (stdgpu::index_t i = 0; i < iterations; ++i)
     {
-        size_t number = dist(rng);
+        std::size_t number = dist(rng);
         EXPECT_EQ(stdgpu::mod2(number, divider), number % divider);
     }
 }
 
 
 TEST_F(stdgpu_bit, mod2_random)
 {
-    const size_t divider = static_cast<size_t>(pow(2, 21));
+    const std::size_t divider = static_cast<std::size_t>(pow(2, 21));
     stdgpu::index_t iterations_per_thread = static_cast<stdgpu::index_t>(pow(2, 19));
 
     test_utils::for_each_concurrent_thread(&thread_mod2_random,
@@ -152,49 +237,49 @@ TEST_F(stdgpu_bit, mod2_random)
 
 TEST_F(stdgpu_bit, mod2_one_positive)
 {
-    size_t number       = 42;
-    size_t divider      = 1;
-    EXPECT_EQ(stdgpu::mod2(number, divider), static_cast<size_t>(0));
+    std::size_t number       = 42;
+    std::size_t divider      = 1;
+    EXPECT_EQ(stdgpu::mod2(number, divider), static_cast<std::size_t>(0));
 }
 
 
 TEST_F(stdgpu_bit, mod2_one_zero)
 {
-    size_t number       = 0;
-    size_t divider      = 1;
-    EXPECT_EQ(stdgpu::mod2(number, divider), static_cast<size_t>(0));
+    std::size_t number       = 0;
+    std::size_t divider      = 1;
+    EXPECT_EQ(stdgpu::mod2(number, divider), static_cast<std::size_t>(0));
 }
 
 
 TEST_F(stdgpu_bit, log2pow2)
 {
-    for (size_t i = 0; i < std::numeric_limits<size_t>::digits; ++i)
+    for (std::size_t i = 0; i < std::numeric_limits<std::size_t>::digits; ++i)
     {
-        EXPECT_EQ(stdgpu::log2pow2(static_cast<size_t>(1) << i), static_cast<size_t>(i));
+        EXPECT_EQ(stdgpu::log2pow2(static_cast<std::size_t>(1) << i), static_cast<std::size_t>(i));
     }
 }
 
 
 TEST_F(stdgpu_bit, popcount_zero)
 {
-    EXPECT_EQ(stdgpu::popcount(static_cast<size_t>(0)), 0);
+    EXPECT_EQ(stdgpu::popcount(static_cast<std::size_t>(0)), 0);
 }
 
 
 TEST_F(stdgpu_bit, popcount_pow2)
 {
-    for (size_t i = 0; i < std::numeric_limits<size_t>::digits; ++i)
+    for (std::size_t i = 0; i < std::numeric_limits<std::size_t>::digits; ++i)
     {
-        EXPECT_EQ(stdgpu::popcount(static_cast<size_t>(1) << i), 1);
+        EXPECT_EQ(stdgpu::popcount(static_cast<std::size_t>(1) << i), 1);
     }
 }
 
 
 TEST_F(stdgpu_bit, popcount_pow2m1)
 {
-    for (size_t i = 0; i < std::numeric_limits<size_t>::digits; ++i)
+    for (std::size_t i = 0; i < std::numeric_limits<std::size_t>::digits; ++i)
     {
-        EXPECT_EQ(stdgpu::popcount((static_cast<size_t>(1) << i) - 1), i);
+        EXPECT_EQ(stdgpu::popcount((static_cast<std::size_t>(1) << i) - 1), i);
     }
 }