diff --git a/LICENSE.txt b/LICENSE.txt
index 2e6ce20ae21a0..5c9aaddc14ff8 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -261,42 +261,6 @@ comments, complaints, performance data, etc to dl@cs.oswego.edu
 
 --------------------------------------------------------------------------------
 
-src/plasma/thirdparty/xxhash: BSD 2-Clause License
-
-xxHash - Fast Hash algorithm
-Copyright (C) 2012-2016, Yann Collet
-
-BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
-
-Redistribution and use in source and binary forms, with or without
-modification, are permitted provided that the following conditions are
-met:
-
-* Redistributions of source code must retain the above copyright
-notice, this list of conditions and the following disclaimer.
-* Redistributions in binary form must reproduce the above
-copyright notice, this list of conditions and the following disclaimer
-in the documentation and/or other materials provided with the
-distribution.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
-"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
-A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
-OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
-SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
-LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
-DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
-THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
-(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-You can contact the author at :
-- xxHash homepage: http://www.xxhash.com
-- xxHash source repository : https://github.com/Cyan4973/xxHash
-
--------------------------------------------------------------------------------
-
 src/plasma/common.cc (some portions)
 
 Copyright (c) Austin Appleby (aappleby (AT) gmail)
@@ -797,3 +761,37 @@ SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE
 LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT
 OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
 OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+--------------------------------------------------------------------------------
+
+The files in cpp/src/arrow/util/xxhash/ have the following license
+(BSD 2-Clause License)
+
+xxHash Library
+Copyright (c) 2012-2014, Yann Collet
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+You can contact the author at :
+- xxHash homepage: http://www.xxhash.com
+- xxHash source repository : https://github.com/Cyan4973/xxHash
diff --git a/cpp/build-support/lint_cpp_cli.py b/cpp/build-support/lint_cpp_cli.py
index 2333d7f368a17..4c26927740dbb 100644
--- a/cpp/build-support/lint_cpp_cli.py
+++ b/cpp/build-support/lint_cpp_cli.py
@@ -71,9 +71,13 @@ def lint_file(path):
 EXCLUSIONS = [
     'arrow/python/iterators.h',
     'arrow/util/date.h',
+    'arrow/util/hashing.h',
     'arrow/util/macros.h',
     'arrow/util/parallel.h',
     'arrow/util/string_view/string_view.hpp',
+    'arrow/util/xxhash/xxhash.c',
+    'arrow/util/xxhash/xxhash.h',
+    'arrow/visitor_inline.h',
     'gandiva/cache.h',
     'gandiva/jni',
     'test',
diff --git a/cpp/cmake_modules/SetupCxxFlags.cmake b/cpp/cmake_modules/SetupCxxFlags.cmake
index 1727de519550c..6f379f9e01d21 100644
--- a/cpp/cmake_modules/SetupCxxFlags.cmake
+++ b/cpp/cmake_modules/SetupCxxFlags.cmake
@@ -19,7 +19,7 @@
 # instruction sets that would boost performance.
 include(CheckCXXCompilerFlag)
 # x86/amd64 compiler flags
-CHECK_CXX_COMPILER_FLAG("-msse3" CXX_SUPPORTS_SSE3)
+CHECK_CXX_COMPILER_FLAG("-msse4.2" CXX_SUPPORTS_SSE4_2)
 # power compiler flags
 CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC)
 
@@ -212,8 +212,8 @@ if (BUILD_WARNING_FLAGS)
 endif(BUILD_WARNING_FLAGS)
 
 # Only enable additional instruction sets if they are supported
-if (CXX_SUPPORTS_SSE3 AND ARROW_SSE3)
-  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse3")
+if (CXX_SUPPORTS_SSE4_2)
+  set(CXX_COMMON_FLAGS "${CXX_COMMON_FLAGS} -msse4.2")
 endif()
 
 if (CXX_SUPPORTS_ALTIVEC AND ARROW_ALTIVEC)
diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 8c4138beff6b8..4b5db0a67de6c 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -48,7 +48,6 @@ set(ARROW_SRCS
   util/compression.cc
   util/cpu-info.cc
   util/decimal.cc
-  util/hash.cc
   util/io-util.cc
   util/logging.cc
   util/key_value_metadata.cc
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index ceed7ecba70f8..0274c15f74f61 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -168,6 +168,26 @@ struct ARROW_EXPORT ArrayData {
 
   std::shared_ptr<ArrayData> Copy() const { return std::make_shared<ArrayData>(*this); }
 
+  // Access a buffer's data as a typed C pointer
+  template <typename T>
+  inline const T* GetValues(int i) const {
+    if (buffers[i]) {
+      return reinterpret_cast<const T*>(buffers[i]->data()) + offset;
+    } else {
+      return NULLPTR;
+    }
+  }
+
+  // Access a buffer's data as a typed C pointer
+  template <typename T>
+  inline T* GetMutableValues(int i) {
+    if (buffers[i]) {
+      return reinterpret_cast<T*>(buffers[i]->mutable_data()) + offset;
+    } else {
+      return NULLPTR;
+    }
+  }
+
   std::shared_ptr<DataType> type;
   int64_t length;
   int64_t null_count;
diff --git a/cpp/src/arrow/builder-benchmark.cc b/cpp/src/arrow/builder-benchmark.cc
index 809e6ffab85e4..4acede1ccd51c 100644
--- a/cpp/src/arrow/builder-benchmark.cc
+++ b/cpp/src/arrow/builder-benchmark.cc
@@ -15,11 +15,19 @@
 // specific language governing permissions and limitations
 // under the License.
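// Aside: a minimal usage sketch (not part of the patch) for the new
// ArrayData::GetValues<T>() / GetMutableValues<T>() accessors added in
// array.h above. They replace the free GetValues/GetMutableValues helpers
// previously used by the cast kernels, and fold the array offset into the
// returned pointer. The kernel name below is hypothetical:
//
//   // Double every value of an int32 array; values live in buffers[1].
//   void DoubleInt32Values(const arrow::ArrayData& input, arrow::ArrayData* output) {
//     const int32_t* in = input.GetValues<int32_t>(1);     // offset already applied
//     int32_t* out = output->GetMutableValues<int32_t>(1);
//     for (int64_t i = 0; i < input.length; ++i) {
//       out[i] = in[i] * 2;
//     }
//   }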
+#include +#include +#include +#include +#include +#include + #include "benchmark/benchmark.h" #include "arrow/builder.h" #include "arrow/memory_pool.h" #include "arrow/test-util.h" +#include "arrow/util/bit-util.h" namespace arrow { @@ -148,7 +156,6 @@ static void BM_BuildBinaryArray(benchmark::State& state) { // NOLINT non-const std::shared_ptr out; ABORT_NOT_OK(builder.Finish(&out)); } - // Assuming a string here needs on average 2 bytes state.SetBytesProcessed(state.iterations() * iterations * value.size()); } @@ -171,18 +178,195 @@ static void BM_BuildFixedSizeBinaryArray( state.SetBytesProcessed(state.iterations() * iterations * width); } -BENCHMARK(BM_BuildPrimitiveArrayNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildVectorNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); +// ---------------------------------------------------------------------- +// DictionaryBuilder benchmarks + +// Testing with different distributions of integer values helps stress +// the hash table's robustness. + +// Make a vector out of `n_distinct` sequential int values +template +static std::vector MakeSequentialIntDictFodder(int32_t n_values, + int32_t n_distinct) { + std::default_random_engine gen(42); + std::vector values(n_values); + { + std::uniform_int_distribution values_dist(0, n_distinct - 1); + std::generate(values.begin(), values.end(), [&]() { return values_dist(gen); }); + } + return values; +} + +// Make a vector out of `n_distinct` int values with potentially colliding hash +// entries as only their highest bits differ. +template +static std::vector MakeSimilarIntDictFodder(int32_t n_values, + int32_t n_distinct) { + std::default_random_engine gen(42); + std::vector values(n_values); + { + std::uniform_int_distribution values_dist(0, n_distinct - 1); + auto max_int = std::numeric_limits::max(); + auto multiplier = static_cast(BitUtil::NextPower2(max_int / n_distinct / 2)); + std::generate(values.begin(), values.end(), + [&]() { return multiplier * values_dist(gen); }); + } + return values; +} + +// Make a vector out of `n_distinct` random int values +template +static std::vector MakeRandomIntDictFodder(int32_t n_values, + int32_t n_distinct) { + std::default_random_engine gen(42); + std::vector values_dict(n_distinct); + std::vector values(n_values); + + { + std::uniform_int_distribution values_dist( + 0, std::numeric_limits::max()); + std::generate(values_dict.begin(), values_dict.end(), + [&]() { return static_cast(values_dist(gen)); }); + } + { + std::uniform_int_distribution indices_dist(0, n_distinct - 1); + std::generate(values.begin(), values.end(), + [&]() { return values_dict[indices_dist(gen)]; }); + } + return values; +} + +// Make a vector out of `n_distinct` string values +static std::vector MakeStringDictFodder(int32_t n_values, + int32_t n_distinct) { + std::default_random_engine gen(42); + std::vector values_dict(n_distinct); + std::vector values(n_values); -BENCHMARK(BM_BuildBooleanArrayNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); + { + auto it = values_dict.begin(); + // Add empty string + *it++ = ""; + // Add a few similar strings + *it++ = "abc"; + *it++ = "abcdef"; + *it++ = "abcfgh"; + // Add random strings + std::uniform_int_distribution length_dist(2, 20); + std::independent_bits_engine bytes_gen(42); -BENCHMARK(BM_BuildAdaptiveIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); + std::generate(it, values_dict.end(), [&]() { + auto length = length_dist(gen); + std::string s(length, 'X'); + for (int32_t i = 0; i < 
length; ++i) { + s[i] = bytes_gen(); + } + return s; + }); + } + { + std::uniform_int_distribution indices_dist(0, n_distinct - 1); + std::generate(values.begin(), values.end(), + [&]() { return values_dict[indices_dist(gen)]; }); + } + return values; +} + +template +static void BenchmarkScalarDictionaryArray( + benchmark::State& state, // NOLINT non-const reference + const std::vector& fodder) { + while (state.KeepRunning()) { + DictionaryBuilder builder(default_memory_pool()); + for (const auto value : fodder) { + ABORT_NOT_OK(builder.Append(value)); + } + std::shared_ptr out; + ABORT_NOT_OK(builder.Finish(&out)); + } + state.SetBytesProcessed(state.iterations() * fodder.size() * sizeof(Scalar)); +} + +static void BM_BuildInt64DictionaryArrayRandom( + benchmark::State& state) { // NOLINT non-const reference + const auto fodder = MakeRandomIntDictFodder(10000, 100); + BenchmarkScalarDictionaryArray>(state, fodder); +} + +static void BM_BuildInt64DictionaryArraySequential( + benchmark::State& state) { // NOLINT non-const reference + const auto fodder = MakeSequentialIntDictFodder(10000, 100); + BenchmarkScalarDictionaryArray>(state, fodder); +} + +static void BM_BuildInt64DictionaryArraySimilar( + benchmark::State& state) { // NOLINT non-const reference + const auto fodder = MakeSimilarIntDictFodder(10000, 100); + BenchmarkScalarDictionaryArray>(state, fodder); +} + +static void BM_BuildStringDictionaryArray( + benchmark::State& state) { // NOLINT non-const reference + const auto fodder = MakeStringDictFodder(10000, 100); + auto type = binary(); + auto fodder_size = + std::accumulate(fodder.begin(), fodder.end(), 0, + [&](size_t acc, const std::string& s) { return acc + s.size(); }); + + while (state.KeepRunning()) { + BinaryDictionaryBuilder builder(default_memory_pool()); + for (const auto& value : fodder) { + ABORT_NOT_OK(builder.Append(value)); + } + std::shared_ptr out; + ABORT_NOT_OK(builder.Finish(&out)); + } + state.SetBytesProcessed(state.iterations() * fodder_size); +} + +// ---------------------------------------------------------------------- +// Benchmark declarations + +static constexpr int32_t kRepetitions = 2; + +BENCHMARK(BM_BuildPrimitiveArrayNoNulls) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildVectorNoNulls) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_BuildBooleanArrayNoNulls) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_BuildAdaptiveIntNoNulls) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); BENCHMARK(BM_BuildAdaptiveIntNoNullsScalarAppend) ->Repetitions(3) ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildAdaptiveUIntNoNulls)->Repetitions(3)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildAdaptiveUIntNoNulls) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildBinaryArray)->Repetitions(3)->Unit(benchmark::kMicrosecond); -BENCHMARK(BM_BuildFixedSizeBinaryArray)->Repetitions(3)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildBinaryArray)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildFixedSizeBinaryArray) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_BuildInt64DictionaryArrayRandom) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildInt64DictionaryArraySequential) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); +BENCHMARK(BM_BuildInt64DictionaryArraySimilar) + ->Repetitions(kRepetitions) + 
->Unit(benchmark::kMicrosecond); + +BENCHMARK(BM_BuildStringDictionaryArray) + ->Repetitions(kRepetitions) + ->Unit(benchmark::kMicrosecond); } // namespace arrow diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 3e99308ba4732..6aa415bbed2f3 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -33,16 +33,9 @@ #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" #include "arrow/util/decimal.h" -#include "arrow/util/hash-util.h" -#include "arrow/util/hash.h" +#include "arrow/util/hashing.h" #include "arrow/util/logging.h" -#ifdef ARROW_USE_SSE -#define SSE4_FLAG true -#else -#define SSE4_FLAG false -#endif - namespace arrow { using internal::AdaptiveIntBuilderBase; @@ -759,147 +752,42 @@ Status BooleanBuilder::AppendValues(const std::vector& values) { // ---------------------------------------------------------------------- // DictionaryBuilder -using internal::DictionaryScalar; - -namespace { - -// A helper class to manage a hash table embedded in a typed Builder. -template -struct DictionaryHashHelper {}; - -// DictionaryHashHelper implementation for primitive types template -struct DictionaryHashHelper> { - using Builder = typename TypeTraits::BuilderType; - using Scalar = typename DictionaryScalar::type; - - // Get the dictionary value at the given builder index - static Scalar GetDictionaryValue(const Builder& builder, int64_t index) { - return builder.GetValue(index); - } - - // Compute the hash of a scalar value - static int64_t HashValue(const Scalar& value, int byte_width) { - return HashUtil::Hash(&value, sizeof(Scalar), 0); - } - - // Return whether the dictionary value at the given builder index is unequal to value - static bool SlotDifferent(const Builder& builder, int64_t index, const Scalar& value) { - return GetDictionaryValue(builder, index) != value; - } - - // Append a value to the builder - static Status AppendValue(Builder& builder, const Scalar& value) { - return builder.Append(value); - } - - // Append another builder's contents to the builder - static Status AppendArray(Builder& builder, const Array& in_array) { - const auto& array = checked_cast(in_array); - return builder.AppendValues(reinterpret_cast(array.values()->data()), - array.length(), nullptr); - } +class DictionaryBuilder::MemoTableImpl + : public internal::HashTraits::MemoTableType { + public: + using MemoTableType = typename internal::HashTraits::MemoTableType; + using MemoTableType::MemoTableType; }; -// DictionaryHashHelper implementation for StringType / BinaryType template -struct DictionaryHashHelper> { - using Builder = typename TypeTraits::BuilderType; - using Scalar = typename DictionaryScalar::type; - - static Scalar GetDictionaryValue(const Builder& builder, int64_t index) { - return builder.GetView(index); - } - - static int64_t HashValue(const Scalar& value, int byte_width) { - return HashUtil::Hash(value.data(), static_cast(value.length()), - 0); - } - - static bool SlotDifferent(const Builder& builder, int64_t index, const Scalar& value) { - const Scalar other = GetDictionaryValue(builder, index); - return value.length() != other.length() || - memcmp(value.data(), other.data(), other.length()) != 0; - } - - static Status AppendValue(Builder& builder, const Scalar& value) { - return builder.Append(value); - } - - static Status AppendArray(Builder& builder, const Array& in_array) { - const auto& array = checked_cast(in_array); - for (uint64_t index = 0, limit = array.length(); index < limit; ++index) { - 
RETURN_NOT_OK(builder.Append(array.GetView(index))); - } - return Status::OK(); - } -}; - -// DictionaryHashHelper implementation for FixedSizeBinaryType -template -struct DictionaryHashHelper> { - using Builder = typename TypeTraits::BuilderType; - using Scalar = typename DictionaryScalar::type; - - static Scalar GetDictionaryValue(const Builder& builder, int64_t index) { - return builder.GetValue(index); - } - - static int64_t HashValue(const Scalar& value, int byte_width) { - return HashUtil::Hash(value, byte_width, 0); - } - - static bool SlotDifferent(const Builder& builder, int64_t index, const uint8_t* value) { - const int32_t width = builder.byte_width(); - const uint8_t* other_value = builder.GetValue(index); - return memcmp(value, other_value, width) != 0; - } - - static Status AppendValue(Builder& builder, const Scalar& value) { - return builder.Append(value); - } - - static Status AppendArray(Builder& builder, const Array& in_array) { - const auto& array = checked_cast(in_array); - for (uint64_t index = 0, limit = array.length(); index < limit; ++index) { - const Scalar value = array.GetValue(index); - RETURN_NOT_OK(builder.Append(value)); - } - return Status::OK(); - } -}; - -} // namespace +DictionaryBuilder::~DictionaryBuilder() {} template DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), - hash_slots_(nullptr), - dict_builder_(type, pool), - overflow_dict_builder_(type, pool), - values_builder_(pool), - byte_width_(-1) {} + : ArrayBuilder(type, pool), byte_width_(-1), values_builder_(pool) { + DCHECK_EQ(T::type_id, type->id()) << "inconsistent type passed to DictionaryBuilder"; +} DictionaryBuilder::DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool) - : ArrayBuilder(type, pool), values_builder_(pool) {} + : ArrayBuilder(type, pool), values_builder_(pool) { + DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to DictionaryBuilder"; +} template <> DictionaryBuilder::DictionaryBuilder( const std::shared_ptr& type, MemoryPool* pool) : ArrayBuilder(type, pool), - hash_slots_(nullptr), - dict_builder_(type, pool), - overflow_dict_builder_(type, pool), - values_builder_(pool), byte_width_(checked_cast(*type).byte_width()) {} template void DictionaryBuilder::Reset() { - dict_builder_.Reset(); - overflow_dict_builder_.Reset(); + ArrayBuilder::Reset(); values_builder_.Reset(); + memo_table_.reset(); + delta_offset_ = 0; } template @@ -909,14 +797,10 @@ Status DictionaryBuilder::Resize(int64_t capacity) { } if (capacity_ == 0) { - // Fill the initial hash table - RETURN_NOT_OK(internal::NewHashTable(kInitialHashTableSize, pool_, &hash_table_)); - hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); - hash_table_size_ = kInitialHashTableSize; - entry_id_offset_ = 0; - mod_bitmask_ = kInitialHashTableSize - 1; - hash_table_load_threshold_ = - static_cast(static_cast(capacity) * kMaxHashTableLoad); + // Initialize hash table + // XXX should we let the user pass additional size heuristics? 
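// Aside: the hand-rolled open-addressing table is replaced by a memo table
// from arrow/util/hashing.h, which interns each distinct value and hands out
// dense int32 indices. A toy sketch of the idea (illustrative only; the real
// MemoTableType is selected per value type by internal::HashTraits and does
// not use std::unordered_map):
//
//   #include <cstdint>
//   #include <unordered_map>
//   #include <vector>
//
//   struct ToyMemoTable {
//     std::unordered_map<int64_t, int32_t> index_;  // distinct value -> dense id
//     std::vector<int64_t> values_;                 // dense id -> value
//
//     int32_t GetOrInsert(int64_t v) {
//       auto it = index_.find(v);
//       if (it != index_.end()) return it->second;  // already interned
//       const int32_t id = static_cast<int32_t>(values_.size());
//       index_.emplace(v, id);
//       values_.push_back(v);
//       return id;                                  // freshly assigned id
//     }
//     int32_t size() const { return static_cast<int32_t>(values_.size()); }
//   };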
+ memo_table_.reset(new MemoTableImpl(0)); + delta_offset_ = 0; } RETURN_NOT_OK(values_builder_.Resize(capacity)); return ArrayBuilder::Resize(capacity); @@ -930,67 +814,12 @@ Status DictionaryBuilder::Resize(int64_t capacity) { return ArrayBuilder::Resize(capacity); } -template -int64_t DictionaryBuilder::HashValue(const Scalar& value) { - return DictionaryHashHelper::HashValue(value, byte_width_); -} - -template -typename DictionaryBuilder::Scalar DictionaryBuilder::GetDictionaryValue( - typename TypeTraits::BuilderType& dictionary_builder, int64_t index) { - return DictionaryHashHelper::GetDictionaryValue(dictionary_builder, index); -} - -template -bool DictionaryBuilder::SlotDifferent(hash_slot_t index, const Scalar& value) { - DCHECK_GE(index, 0); - if (index >= entry_id_offset_) { - // Lookup delta dictionary - DCHECK_LT(index - entry_id_offset_, dict_builder_.length()); - return DictionaryHashHelper::SlotDifferent( - dict_builder_, static_cast(index - entry_id_offset_), value); - } else { - DCHECK_LT(index, overflow_dict_builder_.length()); - return DictionaryHashHelper::SlotDifferent(overflow_dict_builder_, - static_cast(index), value); - } -} - -template -Status DictionaryBuilder::AppendDictionary(const Scalar& value) { - return DictionaryHashHelper::AppendValue(dict_builder_, value); -} - template Status DictionaryBuilder::Append(const Scalar& value) { RETURN_NOT_OK(Reserve(1)); - // Based on DictEncoder::Put - int64_t j = HashValue(value) & mod_bitmask_; - hash_slot_t index = hash_slots_[j]; - - // Find an empty slot - while (kHashSlotEmpty != index && SlotDifferent(index, value)) { - // Linear probing - ++j; - if (j == hash_table_size_) { - j = 0; - } - index = hash_slots_[j]; - } - if (index == kHashSlotEmpty) { - // Not in the hash table, so we insert it now - index = static_cast(dict_builder_.length() + entry_id_offset_); - hash_slots_[j] = index; - RETURN_NOT_OK(AppendDictionary(value)); - - if (ARROW_PREDICT_FALSE(static_cast(dict_builder_.length()) > - hash_table_load_threshold_)) { - RETURN_NOT_OK(DoubleTableSize()); - } - } - - RETURN_NOT_OK(values_builder_.Append(index)); + auto memo_index = memo_table_->GetOrInsert(value); + RETURN_NOT_OK(values_builder_.Append(memo_index)); return Status::OK(); } @@ -1022,48 +851,24 @@ Status DictionaryBuilder::AppendArray(const Array& array) { return Status::OK(); } -template <> -Status DictionaryBuilder::AppendArray(const Array& array) { - if (!type_->Equals(*array.type())) { - return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); - } - - const auto& typed_array = checked_cast(array); - for (int64_t i = 0; i < array.length(); i++) { - if (array.IsNull(i)) { - RETURN_NOT_OK(AppendNull()); - } else { - RETURN_NOT_OK(Append(typed_array.GetValue(i))); - } - } - return Status::OK(); -} - -template -Status DictionaryBuilder::DoubleTableSize() { -#define INNER_LOOP \ - int64_t j = HashValue(GetDictionaryValue(dict_builder_, index)) & new_mod_bitmask - - DOUBLE_TABLE_SIZE(, INNER_LOOP); - - return Status::OK(); -} - template Status DictionaryBuilder::FinishInternal(std::shared_ptr* out) { + // Finalize indices array + RETURN_NOT_OK(values_builder_.FinishInternal(out)); + + // Generate dictionary array from hash table contents std::shared_ptr dictionary; - entry_id_offset_ += dict_builder_.length(); - RETURN_NOT_OK(dict_builder_.Finish(&dictionary)); + std::shared_ptr dictionary_data; - // Store current dict entries for further uses of this DictionaryBuilder - RETURN_NOT_OK( - 
DictionaryHashHelper::AppendArray(overflow_dict_builder_, *dictionary)); - DCHECK_EQ(entry_id_offset_, overflow_dict_builder_.length()); + RETURN_NOT_OK(internal::DictionaryTraits::GetDictionaryArrayData( + pool_, type_, *memo_table_, delta_offset_, &dictionary_data)); + dictionary = MakeArray(dictionary_data); - RETURN_NOT_OK(values_builder_.FinishInternal(out)); + // Set type of array data to the right dictionary type (*out)->type = std::make_shared((*out)->type, dictionary); - dict_builder_.Reset(); + // Update internals for further uses of this DictionaryBuilder + delta_offset_ = memo_table_->size(); values_builder_.Reset(); return Status::OK(); @@ -1101,6 +906,23 @@ Status DictionaryBuilder::FinishInternal(std::shared_ptr* o BINARY_DICTIONARY_SPECIALIZATIONS(StringType); BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType); +template <> +Status DictionaryBuilder::AppendArray(const Array& array) { + if (!type_->Equals(*array.type())) { + return Status::Invalid("Cannot append FixedSizeBinary array with non-matching type"); + } + + const auto& typed_array = checked_cast(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(typed_array.GetValue(i))); + } + } + return Status::OK(); +} + template class DictionaryBuilder; template class DictionaryBuilder; template class DictionaryBuilder; @@ -1419,6 +1241,12 @@ FixedSizeBinaryBuilder::FixedSizeBinaryBuilder(const std::shared_ptr& byte_width_(checked_cast(*type).byte_width()), byte_builder_(pool) {} +#ifndef NDEBUG +void FixedSizeBinaryBuilder::CheckValueSize(int64_t size) { + DCHECK_EQ(size, byte_width_) << "Appending wrong size to FixedSizeBinaryBuilder"; +} +#endif + Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); @@ -1426,10 +1254,6 @@ Status FixedSizeBinaryBuilder::AppendValues(const uint8_t* data, int64_t length, return byte_builder_.Append(data, length * byte_width_); } -Status FixedSizeBinaryBuilder::Append(const std::string& value) { - return Append(reinterpret_cast(value.c_str())); -} - Status FixedSizeBinaryBuilder::AppendNull() { RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(false); diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 6ddc0e9e01710..9977833b2be47 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -35,7 +35,6 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" -#include "arrow/util/hash.h" #include "arrow/util/macros.h" #include "arrow/util/string_view.h" #include "arrow/util/type_traits.h" @@ -454,6 +453,8 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder { uint8_t int_size_; }; +// TODO investigate AdaptiveIntBuilder / AdaptiveUIntBuilder performance + // Check if we would need to expand the underlying storage type inline uint8_t ExpandedIntSize(int64_t val, uint8_t current_int_size) { if (current_int_size == 8 || @@ -971,10 +972,25 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { UnsafeAppendToBitmap(true); return byte_builder_.Append(value, byte_width_); } + Status Append(const char* value) { return Append(reinterpret_cast(value)); } + Status Append(const util::string_view& view) { +#ifndef NDEBUG + CheckValueSize(static_cast(view.size())); +#endif + return Append(reinterpret_cast(view.data())); + } + + Status Append(const std::string& s) { +#ifndef NDEBUG + CheckValueSize(static_cast(s.size())); +#endif + return 
Append(reinterpret_cast(s.data())); + } + template Status Append(const std::array& value) { ARROW_RETURN_NOT_OK(Reserve(1)); @@ -984,7 +1000,6 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { Status AppendValues(const uint8_t* data, int64_t length, const uint8_t* valid_bytes = NULLPTR); - Status Append(const std::string& value); Status AppendNull(); void Reset() override; @@ -1009,6 +1024,10 @@ class ARROW_EXPORT FixedSizeBinaryBuilder : public ArrayBuilder { protected: int32_t byte_width_; BufferBuilder byte_builder_; + +#ifndef NDEBUG + void CheckValueSize(int64_t size); +#endif }; class ARROW_EXPORT Decimal128Builder : public FixedSizeBinaryBuilder { @@ -1094,7 +1113,7 @@ struct DictionaryScalar { template <> struct DictionaryScalar { - using type = const uint8_t*; + using type = util::string_view; }; } // namespace internal @@ -1112,6 +1131,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { public: using Scalar = typename internal::DictionaryScalar::type; + // WARNING: the type given below is the value type, not the DictionaryType. + // The DictionaryType is instantiated on the Finish() call. DictionaryBuilder(const std::shared_ptr& type, MemoryPool* pool); template @@ -1119,9 +1140,25 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { typename std::enable_if::is_parameter_free, MemoryPool*>::type pool) : DictionaryBuilder(TypeTraits::type_singleton(), pool) {} + ~DictionaryBuilder(); + /// \brief Append a scalar value Status Append(const Scalar& value); + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const uint8_t*>::type value) { + return Append(util::string_view(reinterpret_cast(value), byte_width_)); + } + + /// \brief Append a fixed-width string (only for FixedSizeBinaryType) + template + Status Append(typename std::enable_if::value, + const char*>::type value) { + return Append(util::string_view(value, byte_width_)); + } + /// \brief Append a scalar null value Status AppendNull(); @@ -1133,45 +1170,17 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { Status FinishInternal(std::shared_ptr* out) override; /// is the dictionary builder in the delta building mode - bool is_building_delta() { return entry_id_offset_ > 0; } + bool is_building_delta() { return delta_offset_ > 0; } protected: - // Hash table implementation helpers - Status DoubleTableSize(); - Scalar GetDictionaryValue(typename TypeTraits::BuilderType& dictionary_builder, - int64_t index); - int64_t HashValue(const Scalar& value); - // Check whether the dictionary entry in *slot* is equal to the given *value* - bool SlotDifferent(hash_slot_t slot, const Scalar& value); - Status AppendDictionary(const Scalar& value); - - std::shared_ptr hash_table_; - int32_t* hash_slots_; - - /// Size of the table. Must be a power of 2. - int64_t hash_table_size_; - - // Offset for the dictionary entries in dict_builder_. - // Increased on every Finish call by the number of current entries - // in the dictionary. - int64_t entry_id_offset_; - - // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j % - // hash_table_size_, but uses far fewer CPU cycles - int64_t mod_bitmask_; - - // This builder accumulates new dictionary entries since the last Finish call - // (or since the beginning if Finish hasn't been called). - // In other words, it contains the current delta dictionary. 
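// Aside: a hypothetical usage sketch (not part of the patch) showing how the
// retained memo table plus delta_offset_ preserve the delta-dictionary
// behavior across Finish() calls:
//
//   arrow::DictionaryBuilder<arrow::Int64Type> builder(arrow::default_memory_pool());
//   std::shared_ptr<arrow::Array> out;
//   RETURN_NOT_OK(builder.Append(1));
//   RETURN_NOT_OK(builder.Append(2));
//   RETURN_NOT_OK(builder.Finish(&out));  // dictionary [1, 2], indices [0, 1]
//   RETURN_NOT_OK(builder.Append(2));     // still interned from the first batch
//   RETURN_NOT_OK(builder.Append(3));     // new entry, lands in the delta
//   RETURN_NOT_OK(builder.Finish(&out));  // delta dictionary [3], indices [1, 2]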
- typename TypeTraits::BuilderType dict_builder_; - // This builder stores dictionary entries encountered before the last Finish call. - typename TypeTraits::BuilderType overflow_dict_builder_; + class MemoTableImpl; + std::unique_ptr memo_table_; - AdaptiveIntBuilder values_builder_; + int32_t delta_offset_; + // Only used for FixedSizeBinaryType int32_t byte_width_; - /// Size at which we decide to resize - int64_t hash_table_load_threshold_; + AdaptiveIntBuilder values_builder_; }; template <> diff --git a/cpp/src/arrow/compute/compute-test.cc b/cpp/src/arrow/compute/compute-test.cc index 505dfed14be7f..ea91021e5a84b 100644 --- a/cpp/src/arrow/compute/compute-test.cc +++ b/cpp/src/arrow/compute/compute-test.cc @@ -1330,7 +1330,7 @@ TEST_F(TestHashKernel, ChunkedArrayInvoke) { ASSERT_OK(DictionaryEncode(&this->ctx_, Datum(carr), &encoded_out)); ASSERT_EQ(Datum::CHUNKED_ARRAY, encoded_out.kind()); - ASSERT_TRUE(encoded_out.chunked_array()->Equals(*dict_carr)); + AssertChunkedEqual(*dict_carr, *encoded_out.chunked_array()); } using BinaryKernelFunc = diff --git a/cpp/src/arrow/compute/kernels/cast.cc b/cpp/src/arrow/compute/kernels/cast.cc index 766740b33aa7d..97bc1414dda9b 100644 --- a/cpp/src/arrow/compute/kernels/cast.cc +++ b/cpp/src/arrow/compute/kernels/cast.cc @@ -153,7 +153,7 @@ struct CastFunctor> { internal::BitmapReader bit_reader(input.buffers[1]->data(), input.offset, input.length); - auto out = GetMutableValues(output, 1); + auto out = output->GetMutableValues(1); for (int64_t i = 0; i < input.length; ++i) { *out++ = bit_reader.IsSet() ? kOne : kZero; bit_reader.Next(); @@ -215,7 +215,7 @@ struct CastFunctor::value>::type> { void operator()(FunctionContext* ctx, const CastOptions& options, const ArrayData& input, ArrayData* output) { - auto in_data = GetValues(input, 1); + auto in_data = input.GetValues(1); const auto generate = [&in_data]() -> bool { return *in_data++ != 0; }; internal::GenerateBitsUnrolled(output->buffers[1]->mutable_data(), output->offset, input.length, generate); @@ -232,8 +232,8 @@ struct CastFunctor(input, 1); - auto out_data = GetMutableValues(output, 1); + const in_type* in_data = input.GetValues(1); + auto out_data = output->GetMutableValues(1); if (!options.allow_int_overflow) { constexpr in_type kMax = static_cast(std::numeric_limits::max()); @@ -275,8 +275,8 @@ struct CastFunctor::value> using out_type = typename O::c_type; auto in_offset = input.offset; - const in_type* in_data = GetValues(input, 1); - auto out_data = GetMutableValues(output, 1); + const in_type* in_data = input.GetValues(1); + auto out_data = output->GetMutableValues(1); if (options.allow_float_truncate) { // unsafe cast @@ -321,8 +321,8 @@ struct CastFunctor(input, 1); - auto out_data = GetMutableValues(output, 1); + const in_type* in_data = input.GetValues(1); + auto out_data = output->GetMutableValues(1); for (int64_t i = 0; i < input.length; ++i) { *out_data++ = static_cast(*in_data++); } @@ -335,8 +335,8 @@ struct CastFunctor void ShiftTime(FunctionContext* ctx, const CastOptions& options, const bool is_multiply, const int64_t factor, const ArrayData& input, ArrayData* output) { - const in_type* in_data = GetValues(input, 1); - auto out_data = GetMutableValues(output, 1); + const in_type* in_data = input.GetValues(1); + auto out_data = output->GetMutableValues(1); if (factor == 1) { for (int64_t i = 0; i < input.length; i++) { @@ -450,7 +450,7 @@ struct CastFunctor { output); // Ensure that intraday milliseconds have been zeroed out - auto out_data = 
GetMutableValues(output, 1); + auto out_data = output->GetMutableValues(1); if (input.null_count != 0) { internal::BitmapReader bit_reader(input.buffers[0]->data(), input.offset, @@ -582,7 +582,7 @@ void UnpackFixedSizeBinaryDictionary(FunctionContext* ctx, const Array& indices, ArrayData* output) { using index_c_type = typename IndexType::c_type; - const index_c_type* in = GetValues(*indices.data(), 1); + const index_c_type* in = indices.data()->GetValues(1); int32_t byte_width = checked_cast(*output->type).byte_width(); @@ -655,7 +655,7 @@ Status UnpackBinaryDictionary(FunctionContext* ctx, const Array& indices, RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), output->type, &builder)); BinaryBuilder* binary_builder = checked_cast(builder.get()); - const index_c_type* in = GetValues(*indices.data(), 1); + const index_c_type* in = indices.data()->GetValues(1); if (indices.null_count() != 0) { internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); @@ -732,7 +732,7 @@ void UnpackPrimitiveDictionary(const Array& indices, const c_type* dictionary, internal::BitmapReader valid_bits_reader(indices.null_bitmap_data(), indices.offset(), indices.length()); - auto in = GetValues(*indices.data(), 1); + auto in = indices.data()->GetValues(1); for (int64_t i = 0; i < indices.length(); ++i) { if (valid_bits_reader.IsSet()) { out[i] = dictionary[in[i]]; @@ -758,9 +758,9 @@ struct CastFunctortype)) << "Dictionary type: " << values_type << " target type: " << (*output->type); - const c_type* dictionary = GetValues(*type.dictionary()->data(), 1); + const c_type* dictionary = type.dictionary()->data()->GetValues(1); - auto out = GetMutableValues(output, 1); + auto out = output->GetMutableValues(1); const Array& indices = *dict_array.indices(); switch (indices.type()->id()) { case Type::INT8: @@ -794,7 +794,7 @@ struct CastFunctor> { using out_type = typename O::c_type; StringArray input_array(input.Copy()); - auto out_data = GetMutableValues(output, 1); + auto out_data = output->GetMutableValues(1); internal::StringConverter converter; for (int64_t i = 0; i < input.length; ++i, ++out_data) { diff --git a/cpp/src/arrow/compute/kernels/hash.cc b/cpp/src/arrow/compute/kernels/hash.cc index c004429f866bd..c057ea5736139 100644 --- a/cpp/src/arrow/compute/kernels/hash.cc +++ b/cpp/src/arrow/compute/kernels/hash.cc @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -39,26 +38,24 @@ #include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/checked_cast.h" -#include "arrow/util/hash-util.h" -#include "arrow/util/hash.h" +#include "arrow/util/hashing.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" +#include "arrow/util/string_view.h" +#include "arrow/visitor_inline.h" namespace arrow { class MemoryPool; using internal::checked_cast; +using internal::DictionaryTraits; +using internal::HashTraits; namespace compute { -// TODO(wesm): Enable top-level dispatch to SSE4 hashing if it is enabled -#define HASH_USE_SSE false - namespace { -enum class SIMDMode : char { NOSIMD, SSE4, AVX2 }; - #define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE) \ if (!KERNEL) { \ std::stringstream ss; \ @@ -66,754 +63,213 @@ enum class SIMDMode : char { NOSIMD, SSE4, AVX2 }; return Status::NotImplemented(ss.str()); \ } -// This is a slight design concession -- some hash actions have the possibility -// of failure. 
Rather than introduce extra error checking into all actions, we -// will raise an internal exception so that only the actions where errors can -// occur will experience the extra overhead -class HashException : public std::exception { - public: - explicit HashException(const std::string& msg, StatusCode code = StatusCode::Invalid) - : msg_(msg), code_(code) {} - - ~HashException() throw() override {} - - const char* what() const throw() override; - - StatusCode code() const { return code_; } - - private: - std::string msg_; - StatusCode code_; -}; - -const char* HashException::what() const throw() { return msg_.c_str(); } +// ---------------------------------------------------------------------- +// Unique implementation -class HashTable { +class UniqueAction { public: - HashTable(const std::shared_ptr& type, MemoryPool* pool) - : type_(type), - pool_(pool), - initialized_(false), - hash_table_(nullptr), - hash_slots_(nullptr), - hash_table_size_(0), - mod_bitmask_(0) {} + UniqueAction(const std::shared_ptr& type, MemoryPool* pool) {} - virtual ~HashTable() {} - - virtual Status Append(const ArrayData& input) = 0; - virtual Status Flush(Datum* out) = 0; - virtual Status GetDictionary(std::shared_ptr* out) = 0; - - protected: - Status Init(int64_t elements); + Status Reset() { return Status::OK(); } - std::shared_ptr type_; - MemoryPool* pool_; - bool initialized_; + Status Reserve(const int64_t length) { return Status::OK(); } - // The hash table contains integer indices that reference the set of observed - // distinct values - std::shared_ptr hash_table_; - hash_slot_t* hash_slots_; + void ObserveNull() {} - /// Size of the table. Must be a power of 2. - int64_t hash_table_size_; + template + void ObserveFound(Index index) {} - /// Size at which we decide to resize - int64_t hash_table_load_threshold_; + template + void ObserveNotFound(Index index) {} - // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j % - // hash_table_size_, but uses far fewer CPU cycles - int64_t mod_bitmask_; + Status Flush(Datum* out) { return Status::OK(); } }; -Status HashTable::Init(int64_t elements) { - DCHECK_EQ(elements, BitUtil::NextPower2(elements)); - RETURN_NOT_OK(internal::NewHashTable(elements, pool_, &hash_table_)); - hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); - hash_table_size_ = elements; - hash_table_load_threshold_ = - static_cast(static_cast(elements) * kMaxHashTableLoad); - mod_bitmask_ = elements - 1; - initialized_ = true; - return Status::OK(); -} - -template -class HashTableKernel : public HashTable {}; - -// Types of hash actions -// -// unique: append to dictionary when not found, no-op with slot -// dictionary-encode: append to dictionary when not found, append slot # -// match: raise or set null when not found, otherwise append slot # -// isin: set false when not found, otherwise true -// value counts: append to dictionary when not found, increment count for slot - -template -class HashDictionary {}; - // ---------------------------------------------------------------------- -// Hash table pass for nulls +// Dictionary encode implementation -template -class HashTableKernel> : public HashTable { +class DictEncodeAction { public: - using HashTable::HashTable; - - Status Init() { - // No-op, do not even need to initialize hash table - return Status::OK(); - } + DictEncodeAction(const std::shared_ptr& type, MemoryPool* pool) + : indices_builder_(pool) {} - Status Append(const ArrayData& arr) override { - if (!initialized_) { - RETURN_NOT_OK(Init()); - 
} - auto action = checked_cast(this); - RETURN_NOT_OK(action->Reserve(arr.length)); - for (int64_t i = 0; i < arr.length; ++i) { - action->ObserveNull(); - } + Status Reset() { + indices_builder_.Reset(); return Status::OK(); } - Status GetDictionary(std::shared_ptr* out) override { - // TODO(wesm): handle null being a valid dictionary value - auto null_array = std::make_shared(0); - *out = null_array->data(); - return Status::OK(); - } -}; - -// ---------------------------------------------------------------------- -// Hash table pass for primitive types - -template -struct HashDictionary> { - using T = typename Type::c_type; + Status Reserve(const int64_t length) { return indices_builder_.Reserve(length); } - explicit HashDictionary(MemoryPool* pool) : pool(pool), size(0), capacity(0) {} + void ObserveNull() { indices_builder_.UnsafeAppendNull(); } - Status Init() { - this->size = 0; - RETURN_NOT_OK(AllocateResizableBuffer(this->pool, 0, &this->buffer)); - return Resize(kInitialHashTableSize); + template + void ObserveFound(Index index) { + indices_builder_.UnsafeAppend(index); } - Status DoubleSize() { return Resize(this->size * 2); } - - Status Resize(const int64_t elements) { - RETURN_NOT_OK(this->buffer->Resize(elements * sizeof(T))); + template + void ObserveNotFound(Index index) { + return ObserveFound(index); + } - this->capacity = elements; - this->values = reinterpret_cast(this->buffer->mutable_data()); + Status Flush(Datum* out) { + std::shared_ptr result; + RETURN_NOT_OK(indices_builder_.FinishInternal(&result)); + out->value = std::move(result); return Status::OK(); } - MemoryPool* pool; - std::shared_ptr buffer; - T* values; - int64_t size; - int64_t capacity; + private: + Int32Builder indices_builder_; }; -#define GENERIC_HASH_PASS(HASH_INNER_LOOP) \ - if (arr.null_count != 0) { \ - internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); \ - for (int64_t i = 0; i < arr.length; ++i) { \ - const bool is_null = valid_reader.IsNotSet(); \ - valid_reader.Next(); \ - \ - if (is_null) { \ - action->ObserveNull(); \ - continue; \ - } \ - \ - HASH_INNER_LOOP(); \ - } \ - } else { \ - for (int64_t i = 0; i < arr.length; ++i) { \ - HASH_INNER_LOOP(); \ - } \ - } +// ---------------------------------------------------------------------- +// Base class for all hash kernel implementations -template -class HashTableKernel< - Type, Action, - typename std::enable_if::value && !is_8bit_int::value>::type> - : public HashTable { +class HashKernelImpl : public HashKernel { public: - using T = typename Type::c_type; - - HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) - : HashTable(type, pool), dict_(pool) {} - - Status Init() { - RETURN_NOT_OK(dict_.Init()); - return HashTable::Init(kInitialHashTableSize); - } - - Status Append(const ArrayData& arr) override { - if (!initialized_) { - RETURN_NOT_OK(Init()); - } - - const T* values = GetValues(arr, 1); - auto action = checked_cast(this); - - RETURN_NOT_OK(action->Reserve(arr.length)); - -#define HASH_INNER_LOOP() \ - const T value = values[i]; \ - int64_t j = HashValue(value) & mod_bitmask_; \ - hash_slot_t slot = hash_slots_[j]; \ - \ - while (kHashSlotEmpty != slot && dict_.values[slot] != value) { \ - ++j; \ - if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \ - j = 0; \ - } \ - slot = hash_slots_[j]; \ - } \ - \ - if (slot == kHashSlotEmpty) { \ - if (!Action::allow_expand) { \ - throw HashException("Encountered new dictionary value"); \ - } \ - \ - slot = static_cast(dict_.size); \ - 
hash_slots_[j] = slot; \ - dict_.values[dict_.size++] = value; \ - \ - action->ObserveNotFound(slot); \ - \ - if (ARROW_PREDICT_FALSE(dict_.size > hash_table_load_threshold_)) { \ - RETURN_NOT_OK(action->DoubleSize()); \ - } \ - } else { \ - action->ObserveFound(slot); \ + Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { + DCHECK_EQ(Datum::ARRAY, input.kind()); + RETURN_NOT_OK(Append(ctx, *input.array())); + return Flush(out); } - GENERIC_HASH_PASS(HASH_INNER_LOOP); - -#undef HASH_INNER_LOOP - - return Status::OK(); + Status Append(FunctionContext* ctx, const ArrayData& input) override { + std::lock_guard guard(lock_); + return Append(input); } - Status GetDictionary(std::shared_ptr* out) override { - // TODO(wesm): handle null being in the dictionary - auto dict_data = dict_.buffer; - RETURN_NOT_OK(dict_data->Resize(dict_.size * sizeof(T), false)); - dict_data->ZeroPadding(); - - *out = ArrayData::Make(type_, dict_.size, {nullptr, dict_data}, 0); - return Status::OK(); - } + virtual Status Append(const ArrayData& arr) = 0; protected: - int64_t HashValue(const T& value) const { - // TODO(wesm): Use faster hash function for C types - return HashUtil::Hash(&value, sizeof(T), 0); - } - - Status DoubleTableSize() { -#define PRIMITIVE_INNER_LOOP \ - const T value = dict_.values[index]; \ - int64_t j = HashValue(value) & new_mod_bitmask; - - DOUBLE_TABLE_SIZE(, PRIMITIVE_INNER_LOOP); - -#undef PRIMITIVE_INNER_LOOP - - return dict_.Resize(hash_table_size_); - } - - HashDictionary dict_; + std::mutex lock_; }; // ---------------------------------------------------------------------- -// Hash table for boolean types +// Base class for all "regular" hash kernel implementations +// (NullType has a separate implementation) -template -class HashTableKernel> : public HashTable { +template +class RegularHashKernelImpl : public HashKernelImpl { public: - HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) - : HashTable(type, pool) { - std::fill(table_, table_ + 2, kHashSlotEmpty); + RegularHashKernelImpl(const std::shared_ptr& type, MemoryPool* pool) + : pool_(pool), type_(type), action_(type, pool) {} + + Status Reset() override { + memo_table_.reset(new MemoTable(0)); + return action_.Reset(); } Status Append(const ArrayData& arr) override { - auto action = checked_cast(this); - - RETURN_NOT_OK(action->Reserve(arr.length)); - - internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length); - -#define HASH_INNER_LOOP() \ - if (slot == kHashSlotEmpty) { \ - if (!Action::allow_expand) { \ - throw HashException("Encountered new dictionary value"); \ - } \ - table_[j] = slot = static_cast(dict_.size()); \ - dict_.push_back(value); \ - action->ObserveNotFound(slot); \ - } else { \ - action->ObserveFound(slot); \ + RETURN_NOT_OK(action_.Reserve(arr.length)); + return ArrayDataVisitor::Visit(arr, this); } - if (arr.null_count != 0) { - internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length); - for (int64_t i = 0; i < arr.length; ++i) { - const bool is_null = valid_reader.IsNotSet(); - valid_reader.Next(); - if (is_null) { - value_reader.Next(); - action->ObserveNull(); - continue; - } - const bool value = value_reader.IsSet(); - value_reader.Next(); - const int j = value ? 1 : 0; - hash_slot_t slot = table_[j]; - HASH_INNER_LOOP(); - } - } else { - for (int64_t i = 0; i < arr.length; ++i) { - const bool value = value_reader.IsSet(); - value_reader.Next(); - const int j = value ? 
1 : 0; - hash_slot_t slot = table_[j]; - HASH_INNER_LOOP(); - } - } - -#undef HASH_INNER_LOOP - - return Status::OK(); - } + Status Flush(Datum* out) override { return action_.Flush(out); } Status GetDictionary(std::shared_ptr* out) override { - BooleanBuilder builder(pool_); - for (const bool value : dict_) { - RETURN_NOT_OK(builder.Append(value)); - } - return builder.FinishInternal(out); - } - - private: - hash_slot_t table_[2]; - std::vector dict_; -}; - -// ---------------------------------------------------------------------- -// Hash table pass for variable-length binary types - -template -class HashTableKernel> : public HashTable { - public: - HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) - : HashTable(type, pool), dict_offsets_(pool), dict_data_(pool), dict_size_(0) {} - - Status Init() { - RETURN_NOT_OK(dict_offsets_.Resize(kInitialHashTableSize)); - - // We append the end offset after each append to the dictionary, so this - // sets the initial condition for the length-0 case - // - // initial offsets (dict size == 0): 0 - // after 1st dict entry of length 3: 0 3 - // after 2nd dict entry of length 4: 0 3 7 - RETURN_NOT_OK(dict_offsets_.Append(0)); - return HashTable::Init(kInitialHashTableSize); - } - - Status Append(const ArrayData& arr) override { - constexpr uint8_t empty_value = 0; - if (!initialized_) { - RETURN_NOT_OK(Init()); - } - - const int32_t* offsets = GetValues(arr, 1); - const uint8_t* data; - if (arr.buffers[2].get() == nullptr) { - data = &empty_value; - } else { - data = GetValues(arr, 2); - } - - auto action = checked_cast(this); - RETURN_NOT_OK(action->Reserve(arr.length)); - -#define HASH_INNER_LOOP() \ - const int32_t position = offsets[i]; \ - const int32_t length = offsets[i + 1] - position; \ - const uint8_t* value = data + position; \ - \ - int64_t j = HashValue(value, length) & mod_bitmask_; \ - hash_slot_t slot = hash_slots_[j]; \ - \ - const int32_t* dict_offsets = dict_offsets_.data(); \ - const uint8_t* dict_data = dict_data_.data(); \ - while (kHashSlotEmpty != slot && \ - !((dict_offsets[slot + 1] - dict_offsets[slot]) == length && \ - 0 == memcmp(value, dict_data + dict_offsets[slot], length))) { \ - ++j; \ - if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \ - j = 0; \ - } \ - slot = hash_slots_[j]; \ - } \ - \ - if (slot == kHashSlotEmpty) { \ - if (!Action::allow_expand) { \ - throw HashException("Encountered new dictionary value"); \ - } \ - \ - slot = dict_size_++; \ - hash_slots_[j] = slot; \ - \ - RETURN_NOT_OK(dict_data_.Append(value, length)); \ - RETURN_NOT_OK(dict_offsets_.Append(static_cast(dict_data_.length()))); \ - \ - action->ObserveNotFound(slot); \ - \ - if (ARROW_PREDICT_FALSE(dict_size_ > hash_table_load_threshold_)) { \ - RETURN_NOT_OK(action->DoubleSize()); \ - } \ - } else { \ - action->ObserveFound(slot); \ + return DictionaryTraits::GetDictionaryArrayData(pool_, type_, *memo_table_, + 0 /* start_offset */, out); } - GENERIC_HASH_PASS(HASH_INNER_LOOP); - -#undef HASH_INNER_LOOP - + Status VisitNull() { + action_.ObserveNull(); return Status::OK(); } - Status GetDictionary(std::shared_ptr* out) override { - // TODO(wesm): handle null being in the dictionary - BufferVector buffers = {nullptr, nullptr, nullptr}; - - RETURN_NOT_OK(dict_offsets_.Finish(&buffers[1])); - RETURN_NOT_OK(dict_data_.Finish(&buffers[2])); - - *out = ArrayData::Make(type_, dict_size_, std::move(buffers), 0); + Status VisitValue(const Scalar& value) { + auto on_found = [this](int32_t memo_index) { action_.ObserveFound(memo_index); }; 
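// Note: MemoTableType::GetOrInsert(value, on_found, on_not_found) drives both
// kernels: on_found fires with the existing memo index when the value was seen
// before, on_not_found fires with the freshly assigned index otherwise.
// UniqueAction ignores both callbacks (the memo table itself is the result);
// DictEncodeAction appends the index to its Int32Builder either way.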
+ auto on_not_found = [this](int32_t memo_index) { + action_.ObserveNotFound(memo_index); + }; + memo_table_->GetOrInsert(value, on_found, on_not_found); return Status::OK(); } protected: - int64_t HashValue(const uint8_t* data, int32_t length) const { - return HashUtil::Hash(data, length, 0); - } - - Status DoubleTableSize() { -#define VARBYTES_SETUP \ - const int32_t* dict_offsets = dict_offsets_.data(); \ - const uint8_t* dict_data = dict_data_.data() - -#define VARBYTES_COMPUTE_HASH \ - const int32_t length = dict_offsets[index + 1] - dict_offsets[index]; \ - const uint8_t* value = dict_data + dict_offsets[index]; \ - int64_t j = HashValue(value, length) & new_mod_bitmask - - DOUBLE_TABLE_SIZE(VARBYTES_SETUP, VARBYTES_COMPUTE_HASH); + using MemoTable = typename HashTraits::MemoTableType; -#undef VARBYTES_SETUP -#undef VARBYTES_COMPUTE_HASH - - return Status::OK(); - } - - TypedBufferBuilder dict_offsets_; - TypedBufferBuilder dict_data_; - int32_t dict_size_; + MemoryPool* pool_; + std::shared_ptr type_; + Action action_; + std::unique_ptr memo_table_; }; // ---------------------------------------------------------------------- -// Hash table pass for fixed size binary types +// Hash kernel implementation for nulls -template -class HashTableKernel> - : public HashTable { +template +class NullHashKernelImpl : public HashKernelImpl { public: - HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) - : HashTable(type, pool), dict_data_(pool), dict_size_(0) { - const auto& fw_type = checked_cast(*type); - byte_width_ = fw_type.bit_width() / 8; - } + NullHashKernelImpl(const std::shared_ptr& type, MemoryPool* pool) + : pool_(pool), type_(type), action_(type, pool) {} - Status Init() { - RETURN_NOT_OK(dict_data_.Resize(kInitialHashTableSize * byte_width_)); - return HashTable::Init(kInitialHashTableSize); - } + Status Reset() override { return action_.Reset(); } Status Append(const ArrayData& arr) override { - if (!initialized_) { - RETURN_NOT_OK(Init()); + RETURN_NOT_OK(action_.Reserve(arr.length)); + for (int64_t i = 0; i < arr.length; ++i) { + action_.ObserveNull(); } - - const uint8_t* data = GetValues(arr, 1); - - auto action = checked_cast(this); - RETURN_NOT_OK(action->Reserve(arr.length)); - -#define HASH_INNER_LOOP() \ - const uint8_t* value = data + i * byte_width_; \ - int64_t j = HashValue(value) & mod_bitmask_; \ - hash_slot_t slot = hash_slots_[j]; \ - \ - const uint8_t* dict_data = dict_data_.data(); \ - while (kHashSlotEmpty != slot && \ - !(0 == memcmp(value, dict_data + slot * byte_width_, byte_width_))) { \ - ++j; \ - if (ARROW_PREDICT_FALSE(j == hash_table_size_)) { \ - j = 0; \ - } \ - slot = hash_slots_[j]; \ - } \ - \ - if (slot == kHashSlotEmpty) { \ - if (!Action::allow_expand) { \ - throw HashException("Encountered new dictionary value"); \ - } \ - \ - slot = dict_size_++; \ - hash_slots_[j] = slot; \ - \ - RETURN_NOT_OK(dict_data_.Append(value, byte_width_)); \ - \ - action->ObserveNotFound(slot); \ - \ - if (ARROW_PREDICT_FALSE(dict_size_ > hash_table_load_threshold_)) { \ - RETURN_NOT_OK(action->DoubleSize()); \ - } \ - } else { \ - action->ObserveFound(slot); \ - } - - GENERIC_HASH_PASS(HASH_INNER_LOOP); - -#undef HASH_INNER_LOOP - return Status::OK(); } - Status GetDictionary(std::shared_ptr* out) override { - // TODO(wesm): handle null being in the dictionary - BufferVector buffers = {nullptr, nullptr}; - RETURN_NOT_OK(dict_data_.Finish(&buffers[1])); + Status Flush(Datum* out) override { return action_.Flush(out); } - *out = ArrayData::Make(type_, 
dict_size_, std::move(buffers), 0); + Status GetDictionary(std::shared_ptr* out) override { + // TODO(wesm): handle null being a valid dictionary value + auto null_array = std::make_shared(0); + *out = null_array->data(); return Status::OK(); } protected: - int64_t HashValue(const uint8_t* data) const { - return HashUtil::Hash(data, byte_width_, 0); - } - - Status DoubleTableSize() { -#define FIXED_BYTES_SETUP const uint8_t* dict_data = dict_data_.data() - -#define FIXED_BYTES_COMPUTE_HASH \ - int64_t j = HashValue(dict_data + index * byte_width_) & new_mod_bitmask - - DOUBLE_TABLE_SIZE(FIXED_BYTES_SETUP, FIXED_BYTES_COMPUTE_HASH); - -#undef FIXED_BYTES_SETUP -#undef FIXED_BYTES_COMPUTE_HASH - - return Status::OK(); - } - - int32_t byte_width_; - TypedBufferBuilder dict_data_; - int32_t dict_size_; + MemoryPool* pool_; + std::shared_ptr type_; + Action action_; }; // ---------------------------------------------------------------------- -// Hash table pass for uint8 and int8 - -template -inline int Hash8Bit(const T val) { - return 0; -} - -template <> -inline int Hash8Bit(const uint8_t val) { - return val; -} +// Kernel wrapper for generic hash table kernels -template <> -inline int Hash8Bit(const int8_t val) { - return val + 128; -} +template +struct HashKernelTraits {}; template -class HashTableKernel> : public HashTable { - public: - using T = typename Type::c_type; - - HashTableKernel(const std::shared_ptr& type, MemoryPool* pool) - : HashTable(type, pool) { - std::fill(table_, table_ + 256, kHashSlotEmpty); - } - - Status Append(const ArrayData& arr) override { - const T* values = GetValues(arr, 1); - auto action = checked_cast(this); - RETURN_NOT_OK(action->Reserve(arr.length)); - -#define HASH_INNER_LOOP() \ - const T value = values[i]; \ - const int hash = Hash8Bit(value); \ - hash_slot_t slot = table_[hash]; \ - \ - if (slot == kHashSlotEmpty) { \ - if (!Action::allow_expand) { \ - throw HashException("Encountered new dictionary value"); \ - } \ - \ - slot = static_cast(dict_.size()); \ - table_[hash] = slot; \ - dict_.push_back(value); \ - action->ObserveNotFound(slot); \ - } else { \ - action->ObserveFound(slot); \ - } - - GENERIC_HASH_PASS(HASH_INNER_LOOP); - -#undef HASH_INNER_LOOP - - return Status::OK(); - } - - Status GetDictionary(std::shared_ptr* out) override { - using BuilderType = typename TypeTraits::BuilderType; - BuilderType builder(pool_); - - for (const T value : dict_) { - RETURN_NOT_OK(builder.Append(value)); - } - - return builder.FinishInternal(out); - } - - private: - hash_slot_t table_[256]; - std::vector dict_; +struct HashKernelTraits> { + using HashKernelImpl = NullHashKernelImpl; }; -// ---------------------------------------------------------------------- -// Unique implementation - -template -class UniqueImpl : public HashTableKernel> { - public: - static constexpr bool allow_expand = true; - using Base = HashTableKernel>; - using Base::Base; - - Status Reserve(const int64_t length) { return Status::OK(); } - - void ObserveFound(const hash_slot_t slot) {} - void ObserveNull() {} - void ObserveNotFound(const hash_slot_t slot) {} - - Status DoubleSize() { return Base::DoubleTableSize(); } - - Status Append(const ArrayData& input) override { return Base::Append(input); } - - Status Flush(Datum* out) override { - // No-op - return Status::OK(); - } +template +struct HashKernelTraits> { + using HashKernelImpl = RegularHashKernelImpl; }; -// ---------------------------------------------------------------------- -// Dictionary encode implementation - 
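// Aside: a sketch (not part of the patch) of how the kernel factories below
// assemble an implementation. HashKernelTraits<Type, Action> selects
// NullHashKernelImpl for NullType and RegularHashKernelImpl<Type, Action>
// otherwise; the Action parameter decides what happens on each lookup:
//
//   using Kernel =
//       typename HashKernelTraits<arrow::Int32Type, DictEncodeAction>::HashKernelImpl;
//   // i.e. RegularHashKernelImpl<arrow::Int32Type, DictEncodeAction>
//   std::unique_ptr<HashKernel> kernel(new Kernel(arrow::int32(), pool));
//   RETURN_NOT_OK(kernel->Reset());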
-template -class DictEncodeImpl : public HashTableKernel> { - public: - static constexpr bool allow_expand = true; - using Base = HashTableKernel; - - DictEncodeImpl(const std::shared_ptr& type, MemoryPool* pool) - : Base(type, pool), indices_builder_(pool) {} - - Status Reserve(const int64_t length) { return indices_builder_.Reserve(length); } - - void ObserveNull() { indices_builder_.UnsafeAppendNull(); } - - void ObserveFound(const hash_slot_t slot) { indices_builder_.UnsafeAppend(slot); } - - void ObserveNotFound(const hash_slot_t slot) { return ObserveFound(slot); } - - Status DoubleSize() { return Base::DoubleTableSize(); } - - Status Flush(Datum* out) override { - std::shared_ptr result; - RETURN_NOT_OK(indices_builder_.FinishInternal(&result)); - out->value = std::move(result); - return Status::OK(); - } - - using Base::Append; - - private: - Int32Builder indices_builder_; +template +struct HashKernelTraits> { + using HashKernelImpl = RegularHashKernelImpl; }; -// ---------------------------------------------------------------------- -// Kernel wrapper for generic hash table kernels - -class HashKernelImpl : public HashKernel { - public: - explicit HashKernelImpl(std::unique_ptr hasher) - : hasher_(std::move(hasher)) {} - - Status Call(FunctionContext* ctx, const Datum& input, Datum* out) override { - DCHECK_EQ(Datum::ARRAY, input.kind()); - RETURN_NOT_OK(Append(ctx, *input.array())); - return Flush(out); - } - - Status Append(FunctionContext* ctx, const ArrayData& input) override { - std::lock_guard guard(lock_); - try { - RETURN_NOT_OK(hasher_->Append(input)); - } catch (const HashException& e) { - return Status(e.code(), e.what()); - } - return Status::OK(); - } - - Status Flush(Datum* out) override { return hasher_->Flush(out); } - - Status GetDictionary(std::shared_ptr* out) override { - return hasher_->GetDictionary(out); - } +template +struct HashKernelTraits> { + using HashKernelImpl = RegularHashKernelImpl; +}; - private: - std::mutex lock_; - std::unique_ptr hasher_; +template +struct HashKernelTraits> { + using HashKernelImpl = RegularHashKernelImpl; }; } // namespace Status GetUniqueKernel(FunctionContext* ctx, const std::shared_ptr& type, std::unique_ptr* out) { - std::unique_ptr hasher; + std::unique_ptr kernel; -#define UNIQUE_CASE(InType) \ - case InType::type_id: \ - hasher.reset(new UniqueImpl(type, ctx->memory_pool())); \ +#define UNIQUE_CASE(InType) \ + case InType::type_id: \ + kernel.reset(new typename HashKernelTraits::HashKernelImpl( \ + type, ctx->memory_pool())); \ break switch (type->id()) { @@ -844,19 +300,22 @@ Status GetUniqueKernel(FunctionContext* ctx, const std::shared_ptr& ty #undef UNIQUE_CASE - CHECK_IMPLEMENTED(hasher, "unique", type); - out->reset(new HashKernelImpl(std::move(hasher))); + CHECK_IMPLEMENTED(kernel, "unique", type); + RETURN_NOT_OK(kernel->Reset()); + *out = std::move(kernel); return Status::OK(); } Status GetDictionaryEncodeKernel(FunctionContext* ctx, const std::shared_ptr& type, std::unique_ptr* out) { - std::unique_ptr hasher; + std::unique_ptr kernel; -#define DICTIONARY_ENCODE_CASE(InType) \ - case InType::type_id: \ - hasher.reset(new DictEncodeImpl(type, ctx->memory_pool())); \ +#define DICTIONARY_ENCODE_CASE(InType) \ + case InType::type_id: \ + kernel.reset(new \ + typename HashKernelTraits::HashKernelImpl( \ + type, ctx->memory_pool())); \ break switch (type->id()) { @@ -887,8 +346,9 @@ Status GetDictionaryEncodeKernel(FunctionContext* ctx, #undef DICTIONARY_ENCODE_CASE - CHECK_IMPLEMENTED(hasher, "dictionary-encode", 
type); - out->reset(new HashKernelImpl(std::move(hasher))); + CHECK_IMPLEMENTED(kernel, "dictionary-encode", type); + RETURN_NOT_OK(kernel->Reset()); + *out = std::move(kernel); return Status::OK(); } diff --git a/cpp/src/arrow/compute/kernels/hash.h b/cpp/src/arrow/compute/kernels/hash.h index 9e556c666b557..6bbe3cfb447e9 100644 --- a/cpp/src/arrow/compute/kernels/hash.h +++ b/cpp/src/arrow/compute/kernels/hash.h @@ -38,6 +38,8 @@ class FunctionContext; /// values. Implementations should be thread-safe class ARROW_EXPORT HashKernel : public UnaryKernel { public: + // XXX why are those methods exposed? + virtual Status Reset() = 0; virtual Status Append(FunctionContext* ctx, const ArrayData& input) = 0; virtual Status Flush(Datum* out) = 0; virtual Status GetDictionary(std::shared_ptr* out) = 0; diff --git a/cpp/src/arrow/compute/kernels/util-internal.h b/cpp/src/arrow/compute/kernels/util-internal.h index 95dfed9eea541..23ed4fd7ee7d7 100644 --- a/cpp/src/arrow/compute/kernels/util-internal.h +++ b/cpp/src/arrow/compute/kernels/util-internal.h @@ -32,16 +32,6 @@ namespace compute { class FunctionContext; -template -inline const T* GetValues(const ArrayData& data, int i) { - return reinterpret_cast(data.buffers[i]->data()) + data.offset; -} - -template -inline T* GetMutableValues(const ArrayData* data, int i) { - return reinterpret_cast(data->buffers[i]->mutable_data()) + data->offset; -} - static inline void CopyData(const ArrayData& input, ArrayData* output) { output->length = input.length; output->null_count = input.null_count; diff --git a/cpp/src/arrow/util/CMakeLists.txt b/cpp/src/arrow/util/CMakeLists.txt index 5d813ecdfa8da..122c551bf42e7 100644 --- a/cpp/src/arrow/util/CMakeLists.txt +++ b/cpp/src/arrow/util/CMakeLists.txt @@ -37,7 +37,7 @@ install(FILES date.h decimal.h hash-util.h - hash.h + hashing.h io-util.h key_value_metadata.h lazy.h @@ -89,6 +89,7 @@ ADD_ARROW_TEST(bit-util-test) ADD_ARROW_TEST(checked-cast-test) ADD_ARROW_TEST(compression-test) ADD_ARROW_TEST(decimal-test) +ADD_ARROW_TEST(hashing-test) ADD_ARROW_TEST(key-value-metadata-test) ADD_ARROW_TEST(lazy-test) ADD_ARROW_TEST(logging-test) @@ -102,6 +103,7 @@ ADD_ARROW_TEST(utf8-util-test) ADD_ARROW_BENCHMARK(bit-util-benchmark) ADD_ARROW_BENCHMARK(compression-benchmark) ADD_ARROW_BENCHMARK(decimal-benchmark) +ADD_ARROW_BENCHMARK(hashing-benchmark) ADD_ARROW_BENCHMARK(lazy-benchmark) ADD_ARROW_BENCHMARK(number-parsing-benchmark) ADD_ARROW_BENCHMARK(utf8-util-benchmark) diff --git a/cpp/src/arrow/util/hash-util.h b/cpp/src/arrow/util/hash-util.h index da23b8f509b35..3f7e4048bdf10 100644 --- a/cpp/src/arrow/util/hash-util.h +++ b/cpp/src/arrow/util/hash-util.h @@ -20,9 +20,9 @@ #ifndef ARROW_UTIL_HASH_UTIL_H #define ARROW_UTIL_HASH_UTIL_H +#include #include -#include "arrow/util/cpu-info.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/sse-util.h" @@ -32,27 +32,33 @@ namespace arrow { /// Utility class to compute hash values. class HashUtil { public: +#ifdef ARROW_HAVE_SSE4_2 + static constexpr bool have_hardware_crc32 = true; +#else + static constexpr bool have_hardware_crc32 = false; +#endif + /// Compute the Crc32 hash for data using SSE4 instructions. The input hash /// parameter is the current hash/seed value. /// This should only be called if SSE is supported. /// This is ~4x faster than Fnv/Boost Hash. /// TODO: crc32 hashes with different seeds do not result in different hash functions. /// The resulting hashes are correlated. 
- /// TODO: update this to also use SSE4_crc32_u64 and SSE4_crc32_u16 where appropriate.
- static uint32_t CrcHash(const void* data, int32_t bytes, uint32_t hash) {
- uint32_t words = static_cast(bytes / sizeof(uint32_t));
- bytes = static_cast(bytes % sizeof(uint32_t));
-
- const uint32_t* p = reinterpret_cast(data);
- while (words--) {
- hash = SSE4_crc32_u32(hash, *p);
- ++p;
- }
+ static uint32_t CrcHash(const void* data, int32_t nbytes, uint32_t hash) {
+ const uint8_t* p = reinterpret_cast(data);
+ const uint8_t* end = p + nbytes;
 
- const uint8_t* s = reinterpret_cast(p);
- while (bytes--) {
- hash = SSE4_crc32_u8(hash, *s);
- ++s;
+ while (p <= end - 8) {
+ hash = SSE4_crc32_u64(hash, *reinterpret_cast(p));
+ p += 8;
+ }
+ while (p <= end - 4) {
+ hash = SSE4_crc32_u32(hash, *reinterpret_cast(p));
+ p += 4;
+ }
+ while (p < end) {
+ hash = SSE4_crc32_u8(hash, *p);
+ ++p;
 }
 
 // The lower half of the CRC hash has poor uniformity, so swap the halves
@@ -61,6 +67,54 @@ class HashUtil {
 return hash;
 }
 
+ /// A variant of CRC32 hashing that computes two independent running CRCs
+ /// over interleaved halves of the input, yielding a 64-bit integer.
+ /// The result's quality should be improved by a finalization step.
+ ///
+ /// In addition to producing more bits of output, this should be twice
+ /// as fast as CrcHash on CPUs that can overlap several independent
+ /// CRC computations.
+ static uint64_t DoubleCrcHash(const void* data, int32_t nbytes, uint64_t hash) {
+ const uint8_t* p = reinterpret_cast(data);
+
+ uint32_t h1 = static_cast(hash >> 32);
+ uint32_t h2 = static_cast(hash);
+
+ while (nbytes >= 16) {
+ h1 = SSE4_crc32_u64(h1, *reinterpret_cast(p));
+ h2 = SSE4_crc32_u64(h2, *reinterpret_cast(p + 8));
+ nbytes -= 16;
+ p += 16;
+ }
+ if (nbytes >= 8) {
+ h1 = SSE4_crc32_u32(h1, *reinterpret_cast(p));
+ h2 = SSE4_crc32_u32(h2, *reinterpret_cast(p + 4));
+ nbytes -= 8;
+ p += 8;
+ }
+ if (nbytes >= 4) {
+ h1 = SSE4_crc32_u16(h1, *reinterpret_cast(p));
+ h2 = SSE4_crc32_u16(h2, *reinterpret_cast(p + 2));
+ nbytes -= 4;
+ p += 4;
+ }
+ switch (nbytes) {
+ case 3:
+ h1 = SSE4_crc32_u8(h1, p[2]); // fall through
+ case 2:
+ h2 = SSE4_crc32_u8(h2, p[1]); // fall through
+ case 1:
+ h1 = SSE4_crc32_u8(h1, p[0]); // fall through
+ case 0:
+ break;
+ default:
+ assert(0);
+ }
+
+ // A finalization step is recommended to mix up the result's bits
+ return (static_cast(h1) << 32) + h2;
+ }
+
 /// CrcHash() specialized for 1-byte data
 static inline uint32_t CrcHash1(const void* v, uint32_t hash) {
 const uint8_t* s = reinterpret_cast(v);
diff --git a/cpp/src/arrow/util/hash.cc b/cpp/src/arrow/util/hash.cc
deleted file mode 100644
index ce79710f70b96..0000000000000
--- a/cpp/src/arrow/util/hash.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License. 
- -#include "arrow/util/hash.h" - -#include - -#include "arrow/buffer.h" -#include "arrow/status.h" - -namespace arrow { -namespace internal { - -Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr* out) { - std::shared_ptr hash_table; - RETURN_NOT_OK(AllocateBuffer(pool, sizeof(hash_slot_t) * size, &hash_table)); - - auto slots = reinterpret_cast(hash_table->mutable_data()); - std::fill(slots, slots + size, kHashSlotEmpty); - - *out = hash_table; - return Status::OK(); -} - -} // namespace internal -} // namespace arrow diff --git a/cpp/src/arrow/util/hash.h b/cpp/src/arrow/util/hash.h deleted file mode 100644 index 3a444781574e1..0000000000000 --- a/cpp/src/arrow/util/hash.h +++ /dev/null @@ -1,88 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_UTIL_HASH_H -#define ARROW_UTIL_HASH_H - -#include -#include -#include - -namespace arrow { - -class Buffer; -class MemoryPool; -class Status; - -typedef int32_t hash_slot_t; -static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits::max(); - -// Initially 1024 elements -static constexpr int kInitialHashTableSize = 1 << 10; - -// The maximum load factor for the hash table before resizing. -static constexpr double kMaxHashTableLoad = 0.5; - -namespace internal { - -// TODO this ugliness should be rewritten as an inline function with -// a callable argument. 
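That TODO is what this patch resolves: the DOUBLE_TABLE_SIZE macro just below
is superseded by HashTable::Upsize in the new hashing.h, an inline function
taking a callable. A rough sketch of the macro-to-function rewrite
(DoubleTableSize and ComputeIndex are hypothetical names, and the linear
probing is a simplification of the macro's logic):

#include <cstdint>
#include <vector>

using hash_slot_t = int32_t;
constexpr hash_slot_t kHashSlotEmpty = INT32_MAX;

// The SETUP_CODE / COMPUTE_HASH macro parameters become a callable that
// recomputes the slot index of an occupied entry against the new bitmask.
template <typename ComputeIndex>
void DoubleTableSize(std::vector<hash_slot_t>* slots, ComputeIndex&& compute_index) {
  const int64_t new_size = static_cast<int64_t>(slots->size()) * 2;
  const int64_t new_mod_bitmask = new_size - 1;
  std::vector<hash_slot_t> new_slots(new_size, kHashSlotEmpty);
  for (hash_slot_t entry : *slots) {
    if (entry == kHashSlotEmpty) continue;
    int64_t j = compute_index(entry, new_mod_bitmask);
    while (new_slots[j] != kHashSlotEmpty) {
      j = (j + 1) & new_mod_bitmask;  // probe for a free slot
    }
    new_slots[j] = entry;
  }
  slots->swap(new_slots);
}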
- -#define DOUBLE_TABLE_SIZE(SETUP_CODE, COMPUTE_HASH) \ - do { \ - int64_t new_size = hash_table_size_ * 2; \ - \ - std::shared_ptr new_hash_table; \ - RETURN_NOT_OK(internal::NewHashTable(new_size, pool_, &new_hash_table)); \ - int32_t* new_hash_slots = \ - reinterpret_cast(new_hash_table->mutable_data()); \ - int64_t new_mod_bitmask = new_size - 1; \ - \ - SETUP_CODE; \ - \ - for (int i = 0; i < hash_table_size_; ++i) { \ - hash_slot_t index = hash_slots_[i]; \ - \ - if (index == kHashSlotEmpty) { \ - continue; \ - } \ - \ - COMPUTE_HASH; \ - while (kHashSlotEmpty != new_hash_slots[j]) { \ - ++j; \ - if (ARROW_PREDICT_FALSE(j == new_size)) { \ - j = 0; \ - } \ - } \ - \ - new_hash_slots[j] = index; \ - } \ - \ - hash_table_ = new_hash_table; \ - hash_slots_ = reinterpret_cast(hash_table_->mutable_data()); \ - hash_table_size_ = new_size; \ - hash_table_load_threshold_ = \ - static_cast(static_cast(new_size) * kMaxHashTableLoad); \ - mod_bitmask_ = new_size - 1; \ - } while (false) - -Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr* out); - -} // namespace internal -} // namespace arrow - -#endif // ARROW_UTIL_HASH_H diff --git a/cpp/src/arrow/util/hashing-benchmark.cc b/cpp/src/arrow/util/hashing-benchmark.cc new file mode 100644 index 0000000000000..7d91f0f536ac1 --- /dev/null +++ b/cpp/src/arrow/util/hashing-benchmark.cc @@ -0,0 +1,126 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
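The benchmarks below follow the stock Google Benchmark pattern; for reference,
a minimal standalone harness looks roughly like this (BM_Example is
hypothetical, and in the Arrow build the main function is presumably supplied
by the shared benchmark integration rather than by this file):

#include <cstdint>
#include "benchmark/benchmark.h"

static void BM_Example(benchmark::State& state) {
  uint64_t total = 0;
  while (state.KeepRunning()) {
    total += 1;                       // the measured work goes here
    benchmark::DoNotOptimize(total);  // keep the loop from being optimized away
  }
  // Lets the framework report throughput, as the hashing benchmarks below do.
  state.SetBytesProcessed(state.iterations() * sizeof(uint64_t));
}
BENCHMARK(BM_Example);
BENCHMARK_MAIN();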
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "benchmark/benchmark.h"
+
+#include "arrow/test-util.h"
+#include "arrow/util/hashing.h"
+
+namespace arrow {
+namespace internal {
+
+template 
+static std::vector MakeIntegers(int32_t n_values) {
+ std::vector values(n_values);
+
+ std::default_random_engine gen(42);
+ std::uniform_int_distribution values_dist(0,
+ std::numeric_limits::max());
+ std::generate(values.begin(), values.end(),
+ [&]() { return static_cast(values_dist(gen)); });
+ return values;
+}
+
+static std::vector MakeStrings(int32_t n_values, int32_t min_length,
+ int32_t max_length) {
+ std::default_random_engine gen(42);
+ std::vector values(n_values);
+
+ // Generate strings of min_length to max_length bytes
+ std::uniform_int_distribution length_dist(min_length, max_length);
+ std::independent_bits_engine bytes_gen(42);
+
+ std::generate(values.begin(), values.end(), [&]() {
+ auto length = length_dist(gen);
+ std::string s(length, 'X');
+ for (int32_t i = 0; i < length; ++i) {
+ s[i] = bytes_gen();
+ }
+ return s;
+ });
+ return values;
+}
+
+static void BM_HashIntegers(benchmark::State& state) { // NOLINT non-const reference
+ const std::vector values = MakeIntegers(10000);
+
+ while (state.KeepRunning()) {
+ hash_t total = 0;
+ for (const int64_t v : values) {
+ total += ScalarHelper::ComputeHash(v);
+ total += ScalarHelper::ComputeHash(v);
+ }
+ benchmark::DoNotOptimize(total);
+ }
+ state.SetBytesProcessed(2 * state.iterations() * values.size() * sizeof(int64_t));
+}
+
+static void BenchmarkStringHashing(benchmark::State& state, // NOLINT non-const reference
+ const std::vector& values) {
+ uint64_t total_size = 0;
+ for (const std::string& v : values) {
+ total_size += v.size();
+ }
+
+ while (state.KeepRunning()) {
+ hash_t total = 0;
+ for (const std::string& v : values) {
+ total += ComputeStringHash<0>(v.data(), static_cast(v.size()));
+ total += ComputeStringHash<1>(v.data(), static_cast(v.size()));
+ }
+ benchmark::DoNotOptimize(total);
+ }
+ state.SetBytesProcessed(2 * state.iterations() * total_size);
+}
+
+static void BM_HashSmallStrings(benchmark::State& state) { // NOLINT non-const reference
+ const std::vector values = MakeStrings(10000, 2, 20);
+ BenchmarkStringHashing(state, values);
+}
+
+static void BM_HashMediumStrings(benchmark::State& state) { // NOLINT non-const reference
+ const std::vector values = MakeStrings(10000, 20, 120);
+ BenchmarkStringHashing(state, values);
+}
+
+static void BM_HashLargeStrings(benchmark::State& state) { // NOLINT non-const reference
+ const std::vector values = MakeStrings(1000, 120, 2000);
+ BenchmarkStringHashing(state, values);
+}
+
+// ----------------------------------------------------------------------
+// Benchmark declarations
+
+static constexpr int32_t kRepetitions = 1;
+
+BENCHMARK(BM_HashIntegers)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_HashSmallStrings)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_HashMediumStrings)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
+
+BENCHMARK(BM_HashLargeStrings)->Repetitions(kRepetitions)->Unit(benchmark::kMicrosecond);
+
+} // namespace internal
+} // namespace arrow
diff --git a/cpp/src/arrow/util/hashing-test.cc b/cpp/src/arrow/util/hashing-test.cc
new file mode 100644
index 0000000000000..cc80283532241
--- /dev/null
+++ b/cpp/src/arrow/util/hashing-test.cc
@@ -0,0 +1,406 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/test-util.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/hashing.h" +#include "arrow/util/logging.h" + +namespace arrow { +namespace internal { + +template +static std::unordered_set MakeDistinctIntegers(int32_t n_values) { + std::default_random_engine gen(42); + std::uniform_int_distribution values_dist(0, + std::numeric_limits::max()); + + std::unordered_set values; + values.reserve(n_values); + + while (values.size() < static_cast(n_values)) { + values.insert(static_cast(values_dist(gen))); + } + return values; +} + +template +static std::unordered_set MakeSequentialIntegers(int32_t n_values) { + std::unordered_set values; + values.reserve(n_values); + + for (int32_t i = 0; i < n_values; ++i) { + values.insert(static_cast(i)); + } + DCHECK_EQ(values.size(), static_cast(n_values)); + return values; +} + +static std::unordered_set MakeDistinctStrings(int32_t n_values) { + std::unordered_set values; + values.reserve(n_values); + + // Generate strings between 0 and 24 bytes, with ASCII characters + std::default_random_engine gen(42); + std::uniform_int_distribution length_dist(0, 24); + std::uniform_int_distribution char_dist('0', 'z'); + + while (values.size() < static_cast(n_values)) { + auto length = length_dist(gen); + std::string s(length, 'X'); + for (int32_t i = 0; i < length; ++i) { + s[i] = static_cast(char_dist(gen)); + } + values.insert(std::move(s)); + } + return values; +} + +template +static void CheckScalarHashQuality(const std::unordered_set& distinct_values) { + std::unordered_set hashes; + for (const auto v : distinct_values) { + hashes.insert(ScalarHelper::ComputeHash(v)); + hashes.insert(ScalarHelper::ComputeHash(v)); + } + ASSERT_GE(static_cast(hashes.size()), + 0.96 * static_cast(2 * distinct_values.size())); +} + +TEST(HashingQuality, Int64) { +#ifdef ARROW_VALGRIND + const int32_t n_values = 500; +#else + const int32_t n_values = 10000; +#endif + { + const auto values = MakeDistinctIntegers(n_values); + CheckScalarHashQuality(values); + } + { + const auto values = MakeSequentialIntegers(n_values); + CheckScalarHashQuality(values); + } +} + +TEST(HashingQuality, Strings) { +#ifdef ARROW_VALGRIND + const int32_t n_values = 500; +#else + const int32_t n_values = 10000; +#endif + const auto values = MakeDistinctStrings(n_values); + + std::unordered_set hashes; + for (const auto& v : values) { + hashes.insert(ComputeStringHash<0>(v.data(), static_cast(v.size()))); + hashes.insert(ComputeStringHash<1>(v.data(), static_cast(v.size()))); + } + ASSERT_GE(static_cast(hashes.size()), + 0.96 * static_cast(2 * values.size())); +} + +TEST(ScalarMemoTable, Int64) { + const int64_t A = 1234, B = 0, C = -98765321, D = 12345678901234LL, E = -1, F = 
1, + G = 9223372036854775807LL, H = -9223372036854775807LL - 1; + + ScalarMemoTable table(0); + ASSERT_EQ(table.size(), 0); + ASSERT_EQ(table.Get(A), -1); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.Get(B), -1); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(E), 4); + + ASSERT_EQ(table.Get(A), 0); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.Get(E), 4); + ASSERT_EQ(table.GetOrInsert(E), 4); + + ASSERT_EQ(table.GetOrInsert(F), 5); + ASSERT_EQ(table.GetOrInsert(G), 6); + ASSERT_EQ(table.GetOrInsert(H), 7); + + ASSERT_EQ(table.GetOrInsert(G), 6); + ASSERT_EQ(table.GetOrInsert(F), 5); + ASSERT_EQ(table.GetOrInsert(E), 4); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(A), 0); + + ASSERT_EQ(table.size(), 8); + { + std::vector expected({A, B, C, D, E, F, G, H}); + std::vector values(expected.size()); + table.CopyValues(values.data()); + ASSERT_EQ(values, expected); + } + { + std::vector expected({D, E, F, G, H}); + std::vector values(expected.size()); + table.CopyValues(3 /* start offset */, values.data()); + ASSERT_EQ(values, expected); + } +} + +TEST(ScalarMemoTable, UInt16) { + const uint16_t A = 1234, B = 0, C = 65535, D = 32767, E = 1; + + ScalarMemoTable table(0); + ASSERT_EQ(table.size(), 0); + ASSERT_EQ(table.Get(A), -1); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.Get(B), -1); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(E), 4); + + ASSERT_EQ(table.Get(A), 0); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.Get(E), 4); + ASSERT_EQ(table.GetOrInsert(E), 4); + + ASSERT_EQ(table.size(), 5); + std::vector expected({A, B, C, D, E}); + std::vector values(table.size()); + table.CopyValues(values.data()); + ASSERT_EQ(values, expected); +} + +TEST(SmallScalarMemoTable, Int8) { + const int8_t A = 1, B = 0, C = -1, D = -128, E = 127; + + SmallScalarMemoTable table(0); + ASSERT_EQ(table.size(), 0); + ASSERT_EQ(table.Get(A), -1); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.Get(B), -1); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(E), 4); + + ASSERT_EQ(table.Get(A), 0); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.Get(E), 4); + ASSERT_EQ(table.GetOrInsert(E), 4); + + ASSERT_EQ(table.size(), 5); + std::vector expected({A, B, C, D, E}); + std::vector values(table.size()); + table.CopyValues(values.data()); + ASSERT_EQ(values, expected); +} + +TEST(SmallScalarMemoTable, Bool) { + SmallScalarMemoTable table(0); + ASSERT_EQ(table.size(), 0); + ASSERT_EQ(table.Get(true), -1); + ASSERT_EQ(table.GetOrInsert(true), 0); + ASSERT_EQ(table.Get(false), -1); + ASSERT_EQ(table.GetOrInsert(false), 1); + + ASSERT_EQ(table.Get(true), 0); + ASSERT_EQ(table.GetOrInsert(true), 0); + ASSERT_EQ(table.Get(false), 1); + ASSERT_EQ(table.GetOrInsert(false), 1); + + ASSERT_EQ(table.size(), 2); + std::vector expected({true, false}); + ASSERT_EQ(table.values(), expected); + // NOTE std::vector doesn't have a data() method +} + +TEST(ScalarMemoTable, 
Float64) { + const double A = 0.0, B = 1.5, C = -0.0, D = std::numeric_limits::infinity(), + E = -D, F = std::nan(""); + + ScalarMemoTable table(0); + ASSERT_EQ(table.size(), 0); + ASSERT_EQ(table.Get(A), -1); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.Get(B), -1); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(E), 4); + ASSERT_EQ(table.GetOrInsert(F), 5); + + ASSERT_EQ(table.Get(A), 0); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.Get(E), 4); + ASSERT_EQ(table.GetOrInsert(E), 4); + ASSERT_EQ(table.Get(F), 5); + ASSERT_EQ(table.GetOrInsert(F), 5); + + ASSERT_EQ(table.size(), 6); + std::vector expected({A, B, C, D, E, F}); + std::vector values(table.size()); + table.CopyValues(values.data()); + for (uint32_t i = 0; i < expected.size(); ++i) { + auto u = expected[i]; + auto v = values[i]; + if (std::isnan(u)) { + ASSERT_TRUE(std::isnan(v)); + } else { + ASSERT_EQ(u, v); + } + } +} + +TEST(ScalarMemoTable, StressInt64) { + std::default_random_engine gen(42); + std::uniform_int_distribution value_dist(-50, 50); +#ifdef ARROW_VALGRIND + const int32_t n_repeats = 500; +#else + const int32_t n_repeats = 10000; +#endif + + ScalarMemoTable table(0); + std::unordered_map map; + + for (int32_t i = 0; i < n_repeats; ++i) { + int64_t value = value_dist(gen); + int32_t expected; + auto it = map.find(value); + if (it == map.end()) { + expected = static_cast(map.size()); + map[value] = expected; + } else { + expected = it->second; + } + ASSERT_EQ(table.GetOrInsert(value), expected); + } + ASSERT_EQ(table.size(), map.size()); +} + +TEST(BinaryMemoTable, Basics) { + std::string A = "", B = "a", C = "foo", D = "bar", E, F; + E += '\0'; + F += '\0'; + F += "trailing"; + + BinaryMemoTable table(0); + ASSERT_EQ(table.size(), 0); + ASSERT_EQ(table.Get(A), -1); + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.Get(B), -1); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(E), 4); + ASSERT_EQ(table.GetOrInsert(F), 5); + + ASSERT_EQ(table.GetOrInsert(A), 0); + ASSERT_EQ(table.GetOrInsert(B), 1); + ASSERT_EQ(table.GetOrInsert(C), 2); + ASSERT_EQ(table.GetOrInsert(D), 3); + ASSERT_EQ(table.GetOrInsert(E), 4); + ASSERT_EQ(table.GetOrInsert(F), 5); + + ASSERT_EQ(table.size(), 6); + ASSERT_EQ(table.values_size(), 17); + + { + std::vector expected({0, 0, 1, 4, 7, 8, 17}); + std::vector offsets(expected.size()); + table.CopyOffsets(offsets.data()); + ASSERT_EQ(offsets, expected); + + std::string expected_values; + expected_values += "afoobar"; + expected_values += '\0'; + expected_values += '\0'; + expected_values += "trailing"; + std::string values(17, 'X'); + table.CopyValues(reinterpret_cast(&values[0])); + ASSERT_EQ(values, expected_values); + } + { + std::vector expected({0, 1, 10}); + std::vector offsets(expected.size()); + table.CopyOffsets(4 /* start offset */, offsets.data()); + ASSERT_EQ(offsets, expected); + + std::string expected_values; + expected_values += '\0'; + expected_values += '\0'; + expected_values += "trailing"; + std::string values(10, 'X'); + table.CopyValues(4 /* start offset */, reinterpret_cast(&values[0])); + ASSERT_EQ(values, expected_values); + } +} + +TEST(BinaryMemoTable, Stress) { +#ifdef ARROW_VALGRIND + const int32_t n_values = 20; + const int32_t n_repeats 
= 20;
+#else
+ const int32_t n_values = 100;
+ const int32_t n_repeats = 100;
+#endif
+
+ const auto values = MakeDistinctStrings(n_values);
+
+ BinaryMemoTable table(0);
+ std::unordered_map map;
+
+ for (int32_t i = 0; i < n_repeats; ++i) {
+ for (const auto& value : values) {
+ int32_t expected;
+ auto it = map.find(value);
+ if (it == map.end()) {
+ expected = static_cast(map.size());
+ map[value] = expected;
+ } else {
+ expected = it->second;
+ }
+ ASSERT_EQ(table.GetOrInsert(value), expected);
+ }
+ }
+ ASSERT_EQ(table.size(), map.size());
+}
+
+} // namespace internal
+} // namespace arrow
diff --git a/cpp/src/arrow/util/hashing.h b/cpp/src/arrow/util/hashing.h
new file mode 100644
index 0000000000000..24325e81eb4fd
--- /dev/null
+++ b/cpp/src/arrow/util/hashing.h
@@ -0,0 +1,805 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Private header, not to be exported
+
+#ifndef ARROW_UTIL_HASHING_H
+#define ARROW_UTIL_HASHING_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "arrow/array.h"
+#include "arrow/buffer.h"
+#include "arrow/type.h"
+#include "arrow/type_traits.h"
+#include "arrow/util/bit-util.h"
+#include "arrow/util/checked_cast.h"
+#include "arrow/util/hash-util.h"
+#include "arrow/util/macros.h"
+#include "arrow/util/string_view.h"
+
+namespace arrow {
+namespace internal {
+
+// XXX would it help to have a 32-bit hash value on large datasets?
+typedef uint64_t hash_t;
+
+// Notes about the choice of a hash function.
+// - xxHash64 is extremely fast on large enough data
+// - for small- to medium-sized data, there are better choices
+// (see comprehensive benchmark results at
+// https://aras-p.info/blog/2016/08/09/More-Hash-Function-Tests/)
+// - for very small fixed-size data (<= 16 bytes, e.g. Decimal128), it is
+// beneficial to define specialized hash functions
+// - while xxHash and others have good statistical properties, we can relax those
+// a bit if it helps performance (especially if the hash table implementation
+// has a good collision resolution strategy)
+
+template 
+inline hash_t ComputeStringHash(const void* data, int64_t length);
+
+template 
+struct ScalarHelperBase {
+ static bool CompareScalars(Scalar u, Scalar v) { return u == v; }
+
+ static hash_t ComputeHash(const Scalar& value) {
+ // Generic hash computation for scalars. Simply apply the string hash
+ // to the bit representation of the value.
+
+ // XXX in the case of FP values, we'd like equal values to have the same hash,
+ // even if they have different bit representations...
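+ // (For instance, 0.0 and -0.0 compare equal as doubles but differ in
+ // their sign bit, so this bit-based hash gives them different values;
+ // the ScalarMemoTable Float64 test in hashing-test.cc above indeed
+ // expects them to be memoized as distinct entries.)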
+ return ComputeStringHash(&value, sizeof(value));
+ }
+};
+
+template 
+struct ScalarHelper : public ScalarHelperBase {};
+
+template 
+struct ScalarHelper::value>::type>
+ : public ScalarHelperBase {
+ // ScalarHelper specialization for integers
+
+ static hash_t ComputeHash(const Scalar& value) {
+ // Faster hash computation for integers.
+
+ // Two of xxhash's prime multipliers (which are chosen for their
+ // bit dispersion properties)
+ static constexpr uint64_t multipliers[] = {11400714785074694791ULL,
+ 14029467366897019727ULL};
+
+ // Multiplying by the prime number mixes the low bits into the high bits,
+ // then byte-swapping (which is a single CPU instruction) allows the
+ // combined high and low bits to participate in the initial hash table index.
+ auto h = static_cast(value);
+ return BitUtil::ByteSwap(multipliers[AlgNum] * h);
+ }
+};
+
+template 
+struct ScalarHelper::value>::type>
+ : public ScalarHelperBase {
+ // ScalarHelper specialization for reals
+
+ static bool CompareScalars(Scalar u, Scalar v) {
+ if (std::isnan(u)) {
+ // XXX should we do a bit-precise comparison?
+ return std::isnan(v);
+ }
+ return u == v;
+ }
+};
+
+template 
+hash_t ComputeStringHash(const void* data, int64_t length) {
+ if (ARROW_PREDICT_TRUE(length <= 16)) {
+ // Specialize for small hash strings, as they are quite common as
+ // hash table keys.
+ auto p = reinterpret_cast(data);
+ auto n = static_cast(length);
+ if (n <= 8) {
+ if (n <= 3) {
+ if (n == 0) {
+ return 1U;
+ }
+ uint32_t x = (n << 24) ^ (p[0] << 16) ^ (p[n / 2] << 8) ^ p[n - 1];
+ return ScalarHelper::ComputeHash(x);
+ }
+ // 4 <= length <= 8
+ // We can read the string as two overlapping 32-bit ints, apply
+ // different hash functions to each of them in parallel, then XOR
+ // the results
+ uint32_t x, y;
+ hash_t hx, hy;
+ // XXX those are unaligned accesses. Should we have a facility for that?
+ x = *reinterpret_cast(p + n - 4);
+ y = *reinterpret_cast(p);
+ hx = ScalarHelper::ComputeHash(x);
+ hy = ScalarHelper::ComputeHash(y);
+ return n ^ hx ^ hy;
+ }
+ // 9 <= length <= 16
+ // Apply the same principle as above
+ uint64_t x, y;
+ hash_t hx, hy;
+ x = *reinterpret_cast(p + n - 8);
+ y = *reinterpret_cast(p);
+ hx = ScalarHelper::ComputeHash(x);
+ hy = ScalarHelper::ComputeHash(y);
+ return n ^ hx ^ hy;
+ }
+
+ if (HashUtil::have_hardware_crc32) {
+ // DoubleCrcHash is faster than Murmur2.
+ auto h = HashUtil::DoubleCrcHash(data, static_cast(length), AlgNum);
+ return ScalarHelper::ComputeHash(h);
+ } else {
+ // Fall back on 64-bit Murmur2 for longer strings.
+ // It has decent speed for medium-sized strings. There may be faster
+ // hashes on long strings such as xxHash, but that may not matter much
+ // for the typical length distribution of hash keys.
+ return HashUtil::MurmurHash2_64(data, static_cast(length), AlgNum);
+ }
+}
+
+// XXX add a HashEq struct with both hash and compare functions?
+
+// ----------------------------------------------------------------------
+// An open-addressing insert-only hash table (no deletes)
+
+template 
+class HashTable {
+ public:
+ struct Entry {
+ hash_t h;
+ Payload payload;
+ };
+
+ explicit HashTable(uint64_t capacity) {
+ // Presize for at least 8 elements
+ capacity = std::max(capacity, static_cast(8U));
+ size_ = BitUtil::NextPower2(capacity * 4U);
+ size_mask_ = size_ - 1;
+ n_filled_ = 0;
+ // This will zero out hash entries, marking them empty
+ entries_.resize(size_);
+ }
+
+ // Lookup with non-linear probing
+ // cmp_func should have signature bool(const Payload*). 
+ // Return a (Entry*, found) pair.
+ template 
+ std::pair Lookup(hash_t h, CmpFunc&& cmp_func) {
+ auto p = Lookup(h, entries_.data(), size_mask_,
+ std::forward(cmp_func));
+ return {&entries_[p.first], p.second};
+ }
+
+ template 
+ std::pair Lookup(hash_t h, CmpFunc&& cmp_func) const {
+ auto p = Lookup(h, entries_.data(), size_mask_,
+ std::forward(cmp_func));
+ return {&entries_[p.first], p.second};
+ }
+
+ void Insert(Entry* entry, hash_t h, const Payload& payload) {
+ assert(entry->h == 0);
+ entry->h = FixHash(h);
+ entry->payload = payload;
+ ++n_filled_;
+ if (NeedUpsizing()) {
+ // Resizing is expensive, avoid doing it too often
+ Upsize(size_ * 4);
+ }
+ }
+
+ uint64_t size() const { return n_filled_; }
+
+ // Visit all non-empty entries in the table
+ // The visit_func should have signature void(const Entry*)
+ template 
+ void VisitEntries(VisitFunc&& visit_func) const {
+ for (const auto& entry : entries_) {
+ if (entry.h != 0U) {
+ visit_func(&entry);
+ }
+ }
+ }
+
+ protected:
+ // NoCompare is for when the value is known not to exist in the table
+ enum CompareKind { DoCompare, NoCompare };
+
+ // The workhorse lookup function
+ template 
+ std::pair Lookup(hash_t h, const Entry* entries, uint64_t size_mask,
+ CmpFunc&& cmp_func) const {
+ static constexpr uint8_t perturb_shift = 5;
+
+ uint64_t index, perturb;
+ const Entry* entry;
+
+ h = FixHash(h);
+ index = h & size_mask;
+ perturb = (h >> perturb_shift) + 1U;
+
+ while (true) {
+ entry = &entries[index];
+ if (CompareEntry(h, entry, std::forward(cmp_func))) {
+ // Found
+ return {index, true};
+ }
+ if (entry->h == 0U) {
+ // Empty slot
+ return {index, false};
+ }
+
+ // Perturbation logic inspired by CPython's set / dict object.
+ // The goal is that all 64 bits of the unmasked hash value eventually
+ // participate in the probing sequence, to minimize clustering.
+ index = (index + perturb) & size_mask;
+ perturb = (perturb >> perturb_shift) + 1U;
+ }
+ }
+
+ template 
+ bool CompareEntry(hash_t h, const Entry* entry, CmpFunc&& cmp_func) const {
+ if (CKind == NoCompare) {
+ return false;
+ } else {
+ return entry->h == h && cmp_func(&entry->payload);
+ }
+ }
+
+ bool NeedUpsizing() const {
+ // Keep the load factor <= 1/2
+ return n_filled_ * 2U >= size_;
+ }
+
+ void Upsize(uint64_t new_size) {
+ assert(new_size > size_);
+ uint64_t new_mask = new_size - 1;
+ assert((new_size & new_mask) == 0); // it's a power of two
+
+ std::vector new_entries(new_size);
+ for (auto& entry : entries_) {
+ hash_t h = entry.h;
+ if (h != 0) {
+ // Dummy compare function (will not be called)
+ auto cmp_func = [](const Payload*) { return false; };
+ // Non-empty slot, move into new
+ auto p = Lookup(h, new_entries.data(), new_mask, cmp_func);
+ assert(!p.second); // shouldn't have found a matching entry
+ Entry* new_entry = &new_entries[p.first];
+ new_entry->h = h;
+ new_entry->payload = entry.payload;
+ }
+ }
+ std::swap(entries_, new_entries);
+ size_ = new_size;
+ size_mask_ = new_mask;
+ }
+
+ hash_t FixHash(hash_t h) const {
+ // 0 is used to indicate empty entries
+ return (h == 0U) ? 42U : h;
+ }
+
+ uint64_t size_;
+ uint64_t size_mask_;
+ uint64_t n_filled_;
+ std::vector entries_;
+};
+
+// XXX typedef memo_index_t int32_t ?
+
+// ----------------------------------------------------------------------
+// A memoization table for memory-cheap scalar values.
+
+// The memoization table remembers each key and allows looking up
+// its insertion index. 
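+//
+// A hypothetical usage sketch (mirroring the unit tests in
+// hashing-test.cc above):
+//
+//   ScalarMemoTable<int64_t> table(0);
+//   int32_t a = table.GetOrInsert(1234);    // -> 0 (first distinct value)
+//   int32_t b = table.GetOrInsert(5678);    // -> 1
+//   assert(table.GetOrInsert(1234) == a);   // repeated values keep their index
+//   assert(table.Get(9999) == -1);          // lookup without insertion
+//   std::vector<int64_t> dict(table.size());
+//   table.CopyValues(dict.data());          // dict == {1234, 5678}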
+ +template class HashTableTemplateType = HashTable> +class ScalarMemoTable { + public: + explicit ScalarMemoTable(int64_t entries = 0) + : hash_table_(static_cast(entries)) {} + + int32_t Get(const Scalar value) const { + auto cmp_func = [value](const Payload* payload) -> bool { + return ScalarHelper::CompareScalars(payload->value, value); + }; + hash_t h = ComputeHash(value); + auto p = hash_table_.Lookup(h, cmp_func); + if (p.second) { + return p.first->payload.memo_index; + } else { + return -1; + } + } + + template + int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) { + auto cmp_func = [value](const Payload* payload) -> bool { + return ScalarHelper::CompareScalars(value, payload->value); + }; + hash_t h = ComputeHash(value); + auto p = hash_table_.Lookup(h, cmp_func); + int32_t memo_index; + if (p.second) { + memo_index = p.first->payload.memo_index; + on_found(memo_index); + } else { + memo_index = size(); + hash_table_.Insert(p.first, h, {value, memo_index}); + on_not_found(memo_index); + } + return memo_index; + } + + int32_t GetOrInsert(const Scalar value) { + return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {}); + } + + // The number of entries in the memo table + // (which is also 1 + the largest memo index) + int32_t size() const { return static_cast(hash_table_.size()); } + + // Copy values starting from index `start` into `out_data` + void CopyValues(int32_t start, Scalar* out_data) const { + hash_table_.VisitEntries([=](const HashTableEntry* entry) { + int32_t index = entry->payload.memo_index - start; + if (index >= 0) { + out_data[index] = entry->payload.value; + } + }); + } + + void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); } + + protected: + struct Payload { + Scalar value; + int32_t memo_index; + }; + using HashTableType = HashTableTemplateType; + using HashTableEntry = typename HashTableType::Entry; + HashTableType hash_table_; + + hash_t ComputeHash(const Scalar& value) const { + return ScalarHelper::ComputeHash(value); + } +}; + +// ---------------------------------------------------------------------- +// A memoization table for small scalar values, using direct indexing + +template +struct SmallScalarTraits {}; + +template <> +struct SmallScalarTraits { + static constexpr int32_t cardinality = 2; + + static uint32_t AsIndex(bool value) { return value ? 
1 : 0; }
+};
+
+template 
+struct SmallScalarTraits::value>::type> {
+ using Unsigned = typename std::make_unsigned::type;
+
+ static constexpr int32_t cardinality = 1U + std::numeric_limits::max();
+
+ static uint32_t AsIndex(Scalar value) { return static_cast(value); }
+};
+
+template  class HashTableTemplateType = HashTable>
+class SmallScalarMemoTable {
+ public:
+ explicit SmallScalarMemoTable(int64_t entries = 0) {
+ std::fill(value_to_index_, value_to_index_ + cardinality, -1);
+ index_to_value_.reserve(cardinality);
+ }
+
+ int32_t Get(const Scalar value) const {
+ auto value_index = AsIndex(value);
+ return value_to_index_[value_index];
+ }
+
+ template 
+ int32_t GetOrInsert(const Scalar value, Func1&& on_found, Func2&& on_not_found) {
+ auto value_index = AsIndex(value);
+ auto memo_index = value_to_index_[value_index];
+ if (memo_index < 0) {
+ memo_index = static_cast(index_to_value_.size());
+ index_to_value_.push_back(value);
+ value_to_index_[value_index] = memo_index;
+ assert(memo_index < cardinality);
+ on_not_found(memo_index);
+ } else {
+ on_found(memo_index);
+ }
+ return memo_index;
+ }
+
+ int32_t GetOrInsert(const Scalar value) {
+ return GetOrInsert(value, [](int32_t i) {}, [](int32_t i) {});
+ }
+
+ // The number of entries in the memo table
+ // (which is also 1 + the largest memo index)
+ int32_t size() const { return static_cast(index_to_value_.size()); }
+
+ // Copy values starting from index `start` into `out_data`
+ void CopyValues(int32_t start, Scalar* out_data) const {
+ memcpy(out_data, &index_to_value_[start], (size() - start) * sizeof(Scalar));
+ }
+
+ void CopyValues(Scalar* out_data) const { CopyValues(0, out_data); }
+
+ const std::vector& values() const { return index_to_value_; }
+
+ protected:
+ static constexpr auto cardinality = SmallScalarTraits::cardinality;
+ static_assert(cardinality <= 256, "cardinality too large for direct-addressed table");
+
+ uint32_t AsIndex(Scalar value) const {
+ return SmallScalarTraits::AsIndex(value);
+ }
+
+ int32_t value_to_index_[cardinality];
+ std::vector index_to_value_;
+};
+
+// ----------------------------------------------------------------------
+// A memoization table for variable-sized binary data. 
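+//
+// A hypothetical usage sketch (mirroring the BinaryMemoTable unit tests
+// above):
+//
+//   BinaryMemoTable table(0);
+//   int32_t a = table.GetOrInsert(std::string("foo"));  // -> 0
+//   int32_t b = table.GetOrInsert(std::string("bar"));  // -> 1
+//   assert(table.Get(std::string("foo")) == a);
+//   std::vector<int32_t> offsets(table.size() + 1);
+//   table.CopyOffsets(offsets.data());                  // {0, 3, 6}
+//   std::string values(table.values_size(), 'X');
+//   table.CopyValues(reinterpret_cast<uint8_t*>(&values[0]));  // "foobar"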
+ +class BinaryMemoTable { + public: + explicit BinaryMemoTable(int64_t entries = 0, int64_t values_size = -1) + : hash_table_(static_cast(entries)) { + offsets_.reserve(entries + 1); + offsets_.push_back(0); + if (values_size == -1) { + values_.reserve(entries * 4); // A conservative heuristic + } else { + values_.reserve(values_size); + } + } + + int32_t Get(const void* data, int32_t length) const { + hash_t h = ComputeStringHash<0>(data, length); + auto p = Lookup(h, data, length); + if (p.second) { + return p.first->payload.memo_index; + } else { + return -1; + } + } + + int32_t Get(const std::string& value) const { + return Get(value.data(), static_cast(value.length())); + } + + int32_t Get(const util::string_view& value) const { + return Get(value.data(), static_cast(value.length())); + } + + template + int32_t GetOrInsert(const void* data, int32_t length, Func1&& on_found, + Func2&& on_not_found) { + hash_t h = ComputeStringHash<0>(data, length); + auto p = Lookup(h, data, length); + int32_t memo_index; + if (p.second) { + memo_index = p.first->payload.memo_index; + on_found(memo_index); + } else { + memo_index = size(); + // Insert offset + auto offset = static_cast(values_.size()); + assert(offsets_.size() == static_cast(memo_index + 1)); + assert(offsets_[memo_index] == offset); + offsets_.push_back(offset + length); + // Insert string value + values_.append(static_cast(data), length); + // Insert hash entry + hash_table_.Insert(const_cast(p.first), h, {memo_index}); + + on_not_found(memo_index); + } + return memo_index; + } + + template + int32_t GetOrInsert(const util::string_view& value, Func1&& on_found, + Func2&& on_not_found) { + return GetOrInsert(value.data(), static_cast(value.length()), + std::forward(on_found), std::forward(on_not_found)); + } + + int32_t GetOrInsert(const void* data, int32_t length) { + return GetOrInsert(data, length, [](int32_t i) {}, [](int32_t i) {}); + } + + int32_t GetOrInsert(const util::string_view& value) { + return GetOrInsert(value.data(), static_cast(value.length())); + } + + int32_t GetOrInsert(const std::string& value) { + return GetOrInsert(value.data(), static_cast(value.length())); + } + + // The number of entries in the memo table + // (which is also 1 + the largest memo index) + int32_t size() const { return static_cast(hash_table_.size()); } + + int32_t values_size() const { return static_cast(values_.size()); } + + const uint8_t* values_data() const { + return reinterpret_cast(values_.data()); + } + + // Copy (n + 1) offsets starting from index `start` into `out_data` + template + void CopyOffsets(int32_t start, Offset* out_data) const { + auto delta = offsets_[start]; + for (uint32_t i = start; i < offsets_.size(); ++i) { + auto adjusted_offset = offsets_[i] - delta; + auto cast_offset = static_cast(adjusted_offset); + assert(static_cast(cast_offset) == adjusted_offset); // avoid truncation + *out_data++ = cast_offset; + } + } + + template + void CopyOffsets(Offset* out_data) const { + CopyOffsets(0, out_data); + } + + // Copy values starting from index `start` into `out_data` + void CopyValues(int32_t start, uint8_t* out_data) const { + CopyValues(start, -1, out_data); + } + + // Same as above, but check output size in debug mode + void CopyValues(int32_t start, int64_t out_size, uint8_t* out_data) const { + int32_t offset = offsets_[start]; + auto length = values_.size() - static_cast(offset); + if (out_size != -1) { + assert(static_cast(length) == out_size); + } + memcpy(out_data, values_.data() + offset, length); + } + + void 
CopyValues(uint8_t* out_data) const { CopyValues(0, -1, out_data); } + + void CopyValues(int64_t out_size, uint8_t* out_data) const { + CopyValues(0, out_size, out_data); + } + + protected: + struct Payload { + int32_t memo_index; + }; + using HashTableType = HashTable; + using HashTableEntry = typename HashTable::Entry; + HashTableType hash_table_; + std::vector offsets_; + std::string values_; + + std::pair Lookup(hash_t h, const void* data, + int32_t length) const { + auto cmp_func = [=](const Payload* payload) { + int32_t start, stop; + start = offsets_[payload->memo_index]; + stop = offsets_[payload->memo_index + 1]; + return length == stop - start && memcmp(data, values_.data() + start, length) == 0; + }; + return hash_table_.Lookup(h, cmp_func); + } +}; + +template +struct HashTraits {}; + +template <> +struct HashTraits { + using MemoTableType = SmallScalarMemoTable; +}; + +template +struct HashTraits> { + using c_type = typename T::c_type; + using MemoTableType = SmallScalarMemoTable; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + std::shared_ptr dict_buffer; + auto dict_length = static_cast(memo_table.size()) - start_offset; + // This makes a copy, but we assume a dictionary array is usually small + // compared to the size of the dictionary-using array. + // (also, copying the dictionary values is cheap compared to the cost + // of building the memo table) + RETURN_NOT_OK( + AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); + memo_table.CopyValues(static_cast(start_offset), + reinterpret_cast(dict_buffer->mutable_data())); + *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); + return Status::OK(); + } +}; + +template +struct HashTraits< + T, typename std::enable_if::value && !is_8bit_int::value>::type> { + using c_type = typename T::c_type; + using MemoTableType = ScalarMemoTable; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + std::shared_ptr dict_buffer; + auto dict_length = static_cast(memo_table.size()) - start_offset; + // This makes a copy, but we assume a dictionary array is usually small + // compared to the size of the dictionary-using array. 
+ // (also, copying the dictionary values is cheap compared to the cost + // of building the memo table) + RETURN_NOT_OK( + AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); + memo_table.CopyValues(static_cast(start_offset), + reinterpret_cast(dict_buffer->mutable_data())); + *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); + return Status::OK(); + } +}; + +template +struct HashTraits> { + using MemoTableType = BinaryMemoTable; +}; + +template +struct HashTraits> { + using MemoTableType = BinaryMemoTable; +}; + +template +struct DictionaryTraits {}; + +template <> +struct DictionaryTraits { + using T = BooleanType; + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + BooleanBuilder builder(pool); + const auto& bool_values = memo_table.values(); + auto it = bool_values.begin() + start_offset; + for (; it != bool_values.end(); ++it) { + RETURN_NOT_OK(builder.Append(*it)); + } + return builder.FinishInternal(out); + } +}; + +template +struct DictionaryTraits> { + using c_type = typename T::c_type; + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + std::shared_ptr dict_buffer; + auto dict_length = static_cast(memo_table.size()) - start_offset; + // This makes a copy, but we assume a dictionary array is usually small + // compared to the size of the dictionary-using array. + // (also, copying the dictionary values is cheap compared to the cost + // of building the memo table) + RETURN_NOT_OK( + AllocateBuffer(pool, TypeTraits::bytes_required(dict_length), &dict_buffer)); + memo_table.CopyValues(static_cast(start_offset), + reinterpret_cast(dict_buffer->mutable_data())); + *out = ArrayData::Make(type, dict_length, {nullptr, dict_buffer}, 0 /* null_count */); + return Status::OK(); + } +}; + +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + std::shared_ptr dict_offsets; + std::shared_ptr dict_data; + + // Create the offsets buffer + auto dict_length = static_cast(memo_table.size() - start_offset); + RETURN_NOT_OK(AllocateBuffer( + pool, TypeTraits::bytes_required(dict_length + 1), &dict_offsets)); + auto raw_offsets = reinterpret_cast(dict_offsets->mutable_data()); + memo_table.CopyOffsets(static_cast(start_offset), raw_offsets); + + // Create the data buffer + DCHECK_EQ(raw_offsets[0], 0); + RETURN_NOT_OK(AllocateBuffer(pool, raw_offsets[dict_length], &dict_data)); + memo_table.CopyValues(static_cast(start_offset), dict_data->size(), + dict_data->mutable_data()); + + *out = ArrayData::Make(type, dict_length, {nullptr, dict_offsets, dict_data}, + 0 /* null_count */); + return Status::OK(); + } +}; + +template +struct DictionaryTraits> { + using MemoTableType = typename HashTraits::MemoTableType; + + static Status GetDictionaryArrayData(MemoryPool* pool, + const std::shared_ptr& type, + const MemoTableType& memo_table, + int64_t start_offset, + std::shared_ptr* out) { + const T& concrete_type = internal::checked_cast(*type); + std::shared_ptr 
dict_data;
+
+ // Create the data buffer
+ auto dict_length = static_cast(memo_table.size() - start_offset);
+ auto data_length = dict_length * concrete_type.byte_width();
+ RETURN_NOT_OK(AllocateBuffer(pool, data_length, &dict_data));
+ memo_table.CopyValues(static_cast(start_offset), data_length,
+ dict_data->mutable_data());
+
+ *out = ArrayData::Make(type, dict_length, {nullptr, dict_data}, 0 /* null_count */);
+ return Status::OK();
+ }
+};
+
+} // namespace internal
+} // namespace arrow
+
+#endif // ARROW_UTIL_HASHING_H
diff --git a/cpp/src/arrow/util/sse-util.h b/cpp/src/arrow/util/sse-util.h
index 50e38d7032dce..0ff1ff3ae3575 100644
--- a/cpp/src/arrow/util/sse-util.h
+++ b/cpp/src/arrow/util/sse-util.h
@@ -21,10 +21,33 @@
 #ifndef ARROW_UTIL_SSE_UTIL_H
 #define ARROW_UTIL_SSE_UTIL_H
 
-#ifdef ARROW_USE_SSE
+#undef ARROW_HAVE_SSE2
+#undef ARROW_HAVE_SSE4_2
+
+// MSVC x86-64
+
+#if (defined(_M_AMD64) || defined(_M_X64))
+#define ARROW_HAVE_SSE2 1
+#define ARROW_HAVE_SSE4_2 1
+#include 
+#endif
+
+// gcc/clang (possibly others)
+
+#if defined(__SSE2__)
+#define ARROW_HAVE_SSE2 1
 #include 
 #endif
 
+#if defined(__SSE4_2__)
+#define ARROW_HAVE_SSE4_2 1
+#include 
+#endif
+
+#if defined(ARROW_USE_SSE) && !defined(ARROW_HAVE_SSE2)
+#error "ARROW_USE_SSE enabled but no intrinsics available"
+#endif
+
 namespace arrow {
 
 /// This class contains constants useful for text processing with SSE4.2 intrinsics.
@@ -58,153 +81,53 @@ static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = {
 };
 } // namespace SSEUtil
 
-#ifdef ARROW_USE_SSE
+#ifdef ARROW_HAVE_SSE4_2
 
 /// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen
 /// IR load time) that the processor supports SSE 4.2 before calling these. These are
 /// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros.
-#ifndef IR_COMPILE
-/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler
-/// flag. Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime
-/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2
-/// (IMPALA-1399/1646). The compiler intrinsics cannot be used without -msse4.2, so we
-/// define our own implementations of the intrinsics instead.
-
-/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an
-/// immediate. So, those need to be always inlined in order to always propagate the
-/// mode constant into the inline asm.
-#define SSE_ALWAYS_INLINE inline __attribute__((__always_inline__))
 
 template 
 static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
-#ifdef __clang__
- /// Use asm reg rather than Yz output constraint to workaround LLVM bug 13199 -
- /// clang doesn't support Y-prefixed asm constraints. 
- register volatile __m128i result asm("xmm0"); - __asm__ volatile("pcmpestrm %5, %2, %1" - : "=x"(result) - : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) - : "cc"); -#else - __m128i result; - __asm__ volatile("pcmpestrm %5, %2, %1" - : "=Yz"(result) - : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) - : "cc"); -#endif - return result; + return _mm_cmpestrm(str1, len1, str2, len2, MODE); } template static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - int result; - __asm__("pcmpestri %5, %2, %1" - : "=c"(result) - : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) - : "cc"); - return result; + return _mm_cmpestri(str1, len1, str2, len2, MODE); } static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) { - __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v)); - return crc; + return _mm_crc32_u8(crc, v); } static inline uint32_t SSE4_crc32_u16(uint32_t crc, uint16_t v) { - __asm__("crc32w %1, %0" : "+r"(crc) : "rm"(v)); - return crc; + return _mm_crc32_u16(crc, v); } static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) { - __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v)); - return crc; + return _mm_crc32_u32(crc, v); } static inline uint32_t SSE4_crc32_u64(uint32_t crc, uint64_t v) { - uint64_t result = crc; - __asm__("crc32q %1, %0" : "+r"(result) : "rm"(v)); - return static_cast(result); -} - -static inline int64_t POPCNT_popcnt_u64(uint64_t a) { - int64_t result; - __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc"); - return result; -} - -#undef SSE_ALWAYS_INLINE - -#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2. -/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not -/// support it. However, the cross-compiled IR is compiled twice: with and without -/// -msse4.2. When -msse4.2 is enabled in the cross-compile, we can just use the -/// compiler intrinsics. - -#include - -template -static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestrm(str1, len1, str2, len2, MODE); + return static_cast(_mm_crc32_u64(crc, v)); } -template -static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) { - return _mm_cmpestri(str1, len1, str2, len2, MODE); -} - -#define SSE4_crc32_u8 _mm_crc32_u8 -#define SSE4_crc32_u16 _mm_crc32_u16 -#define SSE4_crc32_u32 _mm_crc32_u32 -#define SSE4_crc32_u64 _mm_crc32_u64 -#define POPCNT_popcnt_u64 _mm_popcnt_u64 +#else // without SSE 4.2. -#else // IR_COMPILE without SSE 4.2. -/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use -/// SSE 4.2 instructions. Otherwise, the IR loading will fail on CPUs that don't -/// support SSE 4.2. However, because the caller isn't allowed to call these routines -/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case. 
-
-template <int MODE>
+// __m128i may not be defined, so deduce it with a template parameter
+template <int MODE, typename __m128i>
 static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
   DCHECK(false) << "CPU doesn't support SSE 4.2";
   return (__m128i){0};  // NOLINT
 }
 
-template <int MODE>
+template <int MODE, typename __m128i>
 static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
   DCHECK(false) << "CPU doesn't support SSE 4.2";
   return 0;
 }
 
-static inline uint32_t SSE4_crc32_u8(uint32_t, uint8_t) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u16(uint32_t, uint16_t) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u32(uint32_t, uint32_t) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline uint32_t SSE4_crc32_u64(uint32_t, uint64_t) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-static inline int64_t POPCNT_popcnt_u64(uint64_t) {
-  DCHECK(false) << "CPU doesn't support SSE 4.2";
-  return 0;
-}
-
-#endif  // IR_COMPILE
-
-#else
-
 static inline uint32_t SSE4_crc32_u8(uint32_t, uint8_t) {
   DCHECK(false) << "SSE support is not enabled";
   return 0;
@@ -225,12 +148,7 @@ static inline uint32_t SSE4_crc32_u64(uint32_t, uint64_t) {
   return 0;
 }
 
-static inline int64_t POPCNT_popcnt_u64(uint64_t) {
-  DCHECK(false) << "SSE support is not enabled";
-  return 0;
-}
-
-#endif  // ARROW_USE_SSE
+#endif  // ARROW_HAVE_SSE4_2
 
 }  // namespace arrow
 
diff --git a/cpp/src/plasma/thirdparty/xxhash.cc b/cpp/src/arrow/util/xxhash/xxhash.c
similarity index 73%
rename from cpp/src/plasma/thirdparty/xxhash.cc
rename to cpp/src/arrow/util/xxhash/xxhash.c
index f74880b0de71d..da06ea72bff7a 100644
--- a/cpp/src/plasma/thirdparty/xxhash.cc
+++ b/cpp/src/arrow/util/xxhash/xxhash.c
@@ -50,20 +50,26 @@
  * Prefer these methods in priority order (0 > 1 > 2)
  */
 #ifndef XXH_FORCE_MEMORY_ACCESS   /* can be defined externally, on command line for example */
-#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
+#  if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
+                          || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
+                          || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) )
 #    define XXH_FORCE_MEMORY_ACCESS 2
-#  elif defined(__INTEL_COMPILER) || \
-  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) ))
+#  elif (defined(__INTEL_COMPILER) && !defined(_WIN32)) || \
+  (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+                       || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) \
+                       || defined(__ARM_ARCH_7S__) ))
 #    define XXH_FORCE_MEMORY_ACCESS 1
 #  endif
 #endif
 
 /*!XXH_ACCEPT_NULL_INPUT_POINTER :
- * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer.
- * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input.
- * By default, this option is disabled. To enable it, uncomment below define :
+ * If input pointer is NULL, xxHash default behavior is to dereference it, triggering a segfault.
+ * When this macro is enabled, xxHash actively checks input for null pointer.
+ * If it is, the result for null input pointers is the same as for a null-length input.
  */
-/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */
+#ifndef XXH_ACCEPT_NULL_INPUT_POINTER   /* can be defined externally */
+#  define XXH_ACCEPT_NULL_INPUT_POINTER 0
+#endif
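Because the macro now defaults to 0, the streaming entry points reject NULL input instead of hashing it as empty; compiling with -DXXH_ACCEPT_NULL_INPUT_POINTER=1 flips that to XXH_OK. A small sketch of the default behavior (assuming the vendored header is on the include path):

#include <cassert>
#include <cstddef>

#include "arrow/util/xxhash/xxhash.h"

inline void NullInputIsRejectedByDefault(XXH32_state_t* state) {
  // With XXH_ACCEPT_NULL_INPUT_POINTER == 0 (the new default), the update
  // below returns XXH_ERROR; with the macro set to 1 it would return XXH_OK
  // and behave like a zero-length input.
  assert(XXH32_update(state, NULL, 0) == XXH_ERROR);
}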
 
 /*!XXH_FORCE_NATIVE_FORMAT :
  * By default, xxHash library provides endian-independent Hash values, based on little-endian convention.
@@ -80,8 +86,9 @@
 /*!XXH_FORCE_ALIGN_CHECK :
  * This is a minor performance trick, only useful with lots of very small keys.
  * It means : check for aligned/unaligned input.
- * The check costs one initial branch per hash; set to 0 when the input data
- * is guaranteed to be aligned.
+ * The check costs one initial branch per hash;
+ * set it to 0 when the input is guaranteed to be aligned,
+ * or when alignment doesn't matter for performance.
  */
 #ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
 #  if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
@@ -104,6 +111,8 @@ static void XXH_free  (void* p)  { free(p); }
 #include <string.h>
 static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
 
+#include <assert.h>   /* assert */
+
 #define XXH_STATIC_LINKING_ONLY
 #include "xxhash.h"
@@ -131,17 +140,17 @@ static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcp
 *  Basic Types
 ***************************************/
 #ifndef MEM_MODULE
-# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)   /* C99 */) )
+# if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
 #   include <stdint.h>
     typedef uint8_t  BYTE;
     typedef uint16_t U16;
     typedef uint32_t U32;
-    typedef  int32_t S32;
 # else
     typedef unsigned char      BYTE;
     typedef unsigned short     U16;
     typedef unsigned int       U32;
-    typedef   signed int       S32;
 # endif
 #endif
@@ -208,8 +217,12 @@ typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
 
 /* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
 #ifndef XXH_CPU_LITTLE_ENDIAN
-    static const int g_one = 1;
-#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
+static int XXH_isLittleEndian(void)
+{
+    const union { U32 u; BYTE c[4]; } one = { 1 };   /* don't use static : performance detrimental */
+    return one.c[0];
+}
+#   define XXH_CPU_LITTLE_ENDIAN   XXH_isLittleEndian()
 #endif
@@ -240,12 +253,12 @@ static U32 XXH_readBE32(const void* ptr)
 /* *************************************
 *  Macros
 ***************************************/
-#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
+#define XXH_STATIC_ASSERT(c)  { enum { XXH_sa = 1/(int)(!!(c)) }; }  /* use after variable declarations */
 
 XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
 
 /* *******************************************************************
-*  32-bits hash functions
+*  32-bit hash functions
 *********************************************************************/
 static const U32 PRIME32_1 = 2654435761U;
 static const U32 PRIME32_2 = 2246822519U;
@@ -261,14 +274,89 @@ static U32 XXH32_round(U32 seed, U32 input)
     return seed;
 }
 
-FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+/* mix all bits */
+static U32 XXH32_avalanche(U32 h32)
+{
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+    return(h32);
+}
+
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+static U32
+XXH32_finalize(U32 h32, const void* ptr, size_t len,
+               XXH_endianess endian, XXH_alignment align)
+
+{
+    const BYTE* p = (const BYTE*)ptr;
+
+#define PROCESS1               \
+    h32 += (*p) * PRIME32_5;   \
+    p++;                       \
+    h32 = XXH_rotl32(h32, 11) * PRIME32_1 ;
+
+#define PROCESS4                         \
+    h32 += XXH_get32bits(p) * PRIME32_3; \
+    p+=4;                                \
+    h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+
+    switch(len&15)  /* or switch(bEnd - p) */
+    {
+      case 12:      PROCESS4;
+                    /* fallthrough */
+      case 8:       PROCESS4;
+                    /* fallthrough */
+      case 4:       PROCESS4;
+                    return XXH32_avalanche(h32);
+
+      case 13:      PROCESS4;
+                    /* fallthrough */
+      case 9:       PROCESS4;
+                    /* fallthrough */
+      case 5:       PROCESS4;
+                    PROCESS1;
+                    return XXH32_avalanche(h32);
+
+      case 14:      PROCESS4;
+                    /* fallthrough */
+      case 10:      PROCESS4;
+                    /* fallthrough */
+      case 6:       PROCESS4;
+                    PROCESS1;
+                    PROCESS1;
+                    return XXH32_avalanche(h32);
+
+      case 15:      PROCESS4;
+                    /* fallthrough */
+      case 11:      PROCESS4;
+                    /* fallthrough */
+      case 7:       PROCESS4;
+                    /* fallthrough */
+      case 3:       PROCESS1;
+                    /* fallthrough */
+      case 2:       PROCESS1;
+                    /* fallthrough */
+      case 1:       PROCESS1;
+                    /* fallthrough */
+      case 0:       return XXH32_avalanche(h32);
+    }
+    assert(0);
+    return h32;   /* reaching this point is deemed impossible */
+}
+
+
+FORCE_INLINE U32
+XXH32_endian_align(const void* input, size_t len, U32 seed,
+                   XXH_endianess endian, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
     U32 h32;
 
-#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
     if (p==NULL) {
         len=0;
         bEnd=p=(const BYTE*)(size_t)16;
@@ -276,7 +364,7 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH
 #endif
 
     if (len>=16) {
-        const BYTE* const limit = bEnd - 16;
+        const BYTE* const limit = bEnd - 15;
         U32 v1 = seed + PRIME32_1 + PRIME32_2;
         U32 v2 = seed + PRIME32_2;
         U32 v3 = seed + 0;
@@ -287,34 +375,17 @@ FORCE_INLINE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH
             v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
             v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
             v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
-        } while (p<=limit);
+        } while (p < limit);
 
-        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+        h32 = XXH_rotl32(v1, 1)  + XXH_rotl32(v2, 7)
+            + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
     } else {
         h32  = seed + PRIME32_5;
     }
 
-    h32 += (U32) len;
-
-    while (p+4<=bEnd) {
-        h32 += XXH_get32bits(p) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h32 += (*p) * PRIME32_5;
-        h32  = XXH_rotl32(h32, 11) * PRIME32_1 ;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
+    h32 += (U32)len;
 
-    return h32;
+    return XXH32_finalize(h32, p, len&15, endian, align);
 }
@@ -366,23 +437,28 @@ XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* dstState, const XXH32_state_t
 XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed)
 {
     XXH32_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
-    memset(&state, 0, sizeof(state)-4);   /* do not write into reserved, for future removal */
+    memset(&state, 0, sizeof(state));
     state.v1 = seed + PRIME32_1 + PRIME32_2;
     state.v2 = seed + PRIME32_2;
     state.v3 = seed + 0;
     state.v4 = seed - PRIME32_1;
-    memcpy(statePtr, &state, sizeof(state));
+    /* do not write into reserved, planned to be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
     return XXH_OK;
 }
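For reference, the streaming API whose internals are being reshuffled here is used like this (a minimal sketch; XXH_STATIC_LINKING_ONLY is defined so XXH32_state_t is a complete type and can live on the stack):

#include <cassert>
#include <cstddef>

#define XXH_STATIC_LINKING_ONLY
#include "arrow/util/xxhash/xxhash.h"

// Hashing a buffer in two chunks must agree with the one-shot XXH32().
inline void StreamingMatchesOneShot(const char* buf, size_t n1, size_t n2) {
  XXH32_state_t state;
  XXH32_reset(&state, 0);
  XXH32_update(&state, buf, n1);
  XXH32_update(&state, buf + n1, n2);
  assert(XXH32_digest(&state) == XXH32(buf, n1 + n2, 0));
}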
-FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
+FORCE_INLINE
+XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* const bEnd = p + len;
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (input==NULL) return XXH_ERROR;
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
 #endif
 
     state->total_len_32 += (unsigned)len;
@@ -400,7 +476,7 @@ FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void
             state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++;
             state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++;
             state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++;
-            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++;
+            state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian));
         }
         p += 16-state->memsize;
         state->memsize = 0;
@@ -434,6 +510,7 @@ FORCE_INLINE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void
     return XXH_OK;
 }
 
+
 XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len)
 {
     XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
@@ -445,40 +522,23 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void*
 }
 
-
-FORCE_INLINE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
+FORCE_INLINE U32
+XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian)
 {
-    const BYTE * p = (const BYTE*)state->mem32;
-    const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize;
     U32 h32;
 
     if (state->large_len) {
-        h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+        h32 = XXH_rotl32(state->v1, 1)
+            + XXH_rotl32(state->v2, 7)
+            + XXH_rotl32(state->v3, 12)
+            + XXH_rotl32(state->v4, 18);
     } else {
         h32 = state->v3 /* == seed */ + PRIME32_5;
     }
 
     h32 += state->total_len_32;
 
-    while (p+4<=bEnd) {
-        h32 += XXH_readLE32(p, endian) * PRIME32_3;
-        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h32 += (*p) * PRIME32_5;
-        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
-        p++;
-    }
-
-    h32 ^= h32 >> 15;
-    h32 *= PRIME32_2;
-    h32 ^= h32 >> 13;
-    h32 *= PRIME32_3;
-    h32 ^= h32 >> 16;
-
-    return h32;
+    return XXH32_finalize(h32, state->mem32, state->memsize, endian, XXH_aligned);
 }
@@ -498,7 +558,7 @@ XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
 /*! Default XXH result types are basic unsigned 32 and 64 bits.
 *   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
 *   These functions allow transformation of hash result into and from its canonical format.
-*   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*   This way, hash values can be written into a file or buffer, remaining comparable across different systems.
*/ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash) @@ -517,18 +577,21 @@ XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src #ifndef XXH_NO_LONG_LONG /* ******************************************************************* -* 64-bits hash functions +* 64-bit hash functions *********************************************************************/ /*====== Memory access ======*/ #ifndef MEM_MODULE # define MEM_MODULE -# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# if !defined (__VMS) \ + && (defined (__cplusplus) \ + || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) # include typedef uint64_t U64; # else - typedef unsigned long long U64; /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */ + /* if compiler doesn't support unsigned long long, replace by another 64-bit type */ + typedef unsigned long long U64; # endif #endif @@ -621,14 +684,138 @@ static U64 XXH64_mergeRound(U64 acc, U64 val) return acc; } -FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align) +static U64 XXH64_avalanche(U64 h64) +{ + h64 ^= h64 >> 33; + h64 *= PRIME64_2; + h64 ^= h64 >> 29; + h64 *= PRIME64_3; + h64 ^= h64 >> 32; + return h64; +} + + +#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align) + +static U64 +XXH64_finalize(U64 h64, const void* ptr, size_t len, + XXH_endianess endian, XXH_alignment align) +{ + const BYTE* p = (const BYTE*)ptr; + +#define PROCESS1_64 \ + h64 ^= (*p) * PRIME64_5; \ + p++; \ + h64 = XXH_rotl64(h64, 11) * PRIME64_1; + +#define PROCESS4_64 \ + h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1; \ + p+=4; \ + h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3; + +#define PROCESS8_64 { \ + U64 const k1 = XXH64_round(0, XXH_get64bits(p)); \ + p+=8; \ + h64 ^= k1; \ + h64 = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4; \ +} + + switch(len&31) { + case 24: PROCESS8_64; + /* fallthrough */ + case 16: PROCESS8_64; + /* fallthrough */ + case 8: PROCESS8_64; + return XXH64_avalanche(h64); + + case 28: PROCESS8_64; + /* fallthrough */ + case 20: PROCESS8_64; + /* fallthrough */ + case 12: PROCESS8_64; + /* fallthrough */ + case 4: PROCESS4_64; + return XXH64_avalanche(h64); + + case 25: PROCESS8_64; + /* fallthrough */ + case 17: PROCESS8_64; + /* fallthrough */ + case 9: PROCESS8_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 29: PROCESS8_64; + /* fallthrough */ + case 21: PROCESS8_64; + /* fallthrough */ + case 13: PROCESS8_64; + /* fallthrough */ + case 5: PROCESS4_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 26: PROCESS8_64; + /* fallthrough */ + case 18: PROCESS8_64; + /* fallthrough */ + case 10: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 30: PROCESS8_64; + /* fallthrough */ + case 22: PROCESS8_64; + /* fallthrough */ + case 14: PROCESS8_64; + /* fallthrough */ + case 6: PROCESS4_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 27: PROCESS8_64; + /* fallthrough */ + case 19: PROCESS8_64; + /* fallthrough */ + case 11: PROCESS8_64; + PROCESS1_64; + PROCESS1_64; + PROCESS1_64; + return XXH64_avalanche(h64); + + case 31: PROCESS8_64; + /* fallthrough */ + case 23: PROCESS8_64; + /* fallthrough */ + case 15: PROCESS8_64; + /* fallthrough */ + case 7: PROCESS4_64; + /* fallthrough */ + 
case 3:       PROCESS1_64;
+                    /* fallthrough */
+      case 2:       PROCESS1_64;
+                    /* fallthrough */
+      case 1:       PROCESS1_64;
+                    /* fallthrough */
+      case 0:       return XXH64_avalanche(h64);
+    }
+
+    /* impossible to reach */
+    assert(0);
+    return 0;  /* unreachable, but some compilers complain without it */
+}
+
+FORCE_INLINE U64
+XXH64_endian_align(const void* input, size_t len, U64 seed,
+                   XXH_endianess endian, XXH_alignment align)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* bEnd = p + len;
     U64 h64;
 
-#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
     if (p==NULL) {
         len=0;
         bEnd=p=(const BYTE*)(size_t)32;
@@ -661,32 +848,7 @@ FORCE_INLINE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH
 
     h64 += (U64) len;
 
-    while (p+8<=bEnd) {
-        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
-        h64 ^= k1;
-        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
-
-    if (p+4<=bEnd) {
-        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
-        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h64 ^= (*p) * PRIME64_5;
-        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
-
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
-
-    return h64;
+    return XXH64_finalize(h64, p, len, endian, align);
 }
@@ -736,22 +898,27 @@ XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* dstState, const XXH64_state_t
 XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed)
 {
     XXH64_state_t state;   /* using a local state to memcpy() in order to avoid strict-aliasing warnings */
-    memset(&state, 0, sizeof(state)-8);   /* do not write into reserved, for future removal */
+    memset(&state, 0, sizeof(state));
     state.v1 = seed + PRIME64_1 + PRIME64_2;
     state.v2 = seed + PRIME64_2;
     state.v3 = seed + 0;
     state.v4 = seed - PRIME64_1;
-    memcpy(statePtr, &state, sizeof(state));
+    /* do not write into reserved, planned to be removed in a future version */
+    memcpy(statePtr, &state, sizeof(state) - sizeof(state.reserved));
     return XXH_OK;
 }
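The canonical-representation helpers described a few hunks back pair naturally with these digest functions; a short sketch of the intended pattern (the file-writing wrapper is mine):

#include <cstddef>
#include <cstdio>

#include "arrow/util/xxhash/xxhash.h"

// Store the hash big-endian ("canonical") so files written on little- and
// big-endian machines stay byte-for-byte comparable.
inline void WriteCanonicalHash(std::FILE* f, const void* data, size_t len) {
  XXH64_hash_t h = XXH64(data, len, 0);
  XXH64_canonical_t canonical;
  XXH64_canonicalFromHash(&canonical, h);
  std::fwrite(canonical.digest, sizeof(canonical.digest), 1, f);
}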
-FORCE_INLINE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+FORCE_INLINE
+XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
 {
     const BYTE* p = (const BYTE*)input;
     const BYTE* const bEnd = p + len;
 
-#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
-    if (input==NULL) return XXH_ERROR;
+    if (input==NULL)
+#if defined(XXH_ACCEPT_NULL_INPUT_POINTER) && (XXH_ACCEPT_NULL_INPUT_POINTER>=1)
+        return XXH_OK;
+#else
+        return XXH_ERROR;
 #endif
 
     state->total_len += len;
@@ -812,8 +979,6 @@ XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void*
 
 FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
 {
-    const BYTE * p = (const BYTE*)state->mem64;
-    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
     U64 h64;
 
     if (state->total_len >= 32) {
@@ -828,37 +993,12 @@ FORCE_INLINE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess
         h64 = XXH64_mergeRound(h64, v3);
         h64 = XXH64_mergeRound(h64, v4);
     } else {
-        h64  = state->v3 + PRIME64_5;
+        h64  = state->v3 /*seed*/ + PRIME64_5;
     }
 
     h64 += (U64) state->total_len;
 
-    while (p+8<=bEnd) {
-        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
-        h64 ^= k1;
-        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
-        p+=8;
-    }
-
-    if (p+4<=bEnd) {
-        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
-        h64  = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
-        p+=4;
-    }
-
-    while (p<bEnd) {
-        h64 ^= (*p) * PRIME64_5;
-        h64  = XXH_rotl64(h64, 11) * PRIME64_1;
-        p++;
-    }
-
-    h64 ^= h64 >> 33;
-    h64 *= PRIME64_2;
-    h64 ^= h64 >> 29;
-    h64 *= PRIME64_3;
-    h64 ^= h64 >> 32;
-
-    return h64;
+    return XXH64_finalize(h64, state->mem64, (size_t)state->total_len, endian, XXH_aligned);
 }
 
 
 XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
diff --git a/cpp/src/plasma/thirdparty/xxhash.h b/cpp/src/arrow/util/xxhash/xxhash.h
similarity index 67%
rename from cpp/src/plasma/thirdparty/xxhash.h
rename to cpp/src/arrow/util/xxhash/xxhash.h
index 9d831e03b35f6..8c2d5fac1e746 100644
--- a/cpp/src/plasma/thirdparty/xxhash.h
+++ b/cpp/src/arrow/util/xxhash/xxhash.h
@@ -1,3 +1,5 @@
+// Vendored from git tag v0.6.5
+
 /*
    xxHash - Extremely Fast Hash algorithm
    Header File
@@ -57,8 +59,8 @@ Q.Score is a measure of quality of the hash function.
 It depends on successfully passing SMHasher test set.
 10 is a perfect score.
 
-A 64-bits version, named XXH64, is available since r35.
-It offers much better speed, but for 64-bits applications only.
+A 64-bit version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bit applications only.
 Name     Speed on 64 bits    Speed on 32 bits
 XXH64       13.8 GB/s            1.9 GB/s
 XXH32        6.8 GB/s            6.0 GB/s
@@ -80,18 +82,19 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 
 /* ****************************
-* API modifier
-******************************/
-/** XXH_PRIVATE_API
-* This is useful to include xxhash functions in `static` mode
-* in order to inline them, and remove their symbol from the public list.
-* Methodology :
-* #define XXH_PRIVATE_API
-* #include "xxhash.h"
-* `xxhash.c` is automatically included.
-* It's not useful to compile and link it as a separate module.
-*/
-#ifdef XXH_PRIVATE_API
+ * API modifier
+ ******************************/
+/** XXH_INLINE_ALL (and XXH_PRIVATE_API)
+ * This is useful to include xxhash functions in `static` mode
+ * in order to inline them, and remove their symbol from the public list.
+ * Inlining can offer dramatic performance improvement on small keys.
+ * Methodology :
+ *     #define XXH_INLINE_ALL
+ *     #include "xxhash.h"
+ * `xxhash.c` is automatically included.
+ * It's not useful to compile and link it as a separate module.
+ */
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
 #  ifndef XXH_STATIC_LINKING_ONLY
 #    define XXH_STATIC_LINKING_ONLY
 #  endif
@@ -102,23 +105,24 @@ typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
 #  elif defined(_MSC_VER)
 #    define XXH_PUBLIC_API static __inline
 #  else
-#    define XXH_PUBLIC_API static /* this version may generate warnings for unused static functions; disable the relevant warning */
+     /* this version may generate warnings for unused static functions */
+#    define XXH_PUBLIC_API static
 #  endif
 #else
 #  define XXH_PUBLIC_API   /* do nothing */
-#endif /* XXH_PRIVATE_API */
-
-/*!XXH_NAMESPACE, aka Namespace Emulation :
-
-If you want to include _and expose_ xxHash functions from within your own library,
-but also want to avoid symbol collisions with other libraries which may also include xxHash,
-
-you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
-with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values).
-
-Note that no change is required within the calling program as long as it includes `xxhash.h` :
-regular symbol name will be automatically translated by this header.
-*/
+#endif /* XXH_INLINE_ALL || XXH_PRIVATE_API */
+
+/*!
XXH_NAMESPACE, aka Namespace Emulation : + * + * If you want to include _and expose_ xxHash functions from within your own library, + * but also want to avoid symbol collisions with other libraries which may also include xxHash, + * + * you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library + * with the value of XXH_NAMESPACE (therefore, avoid NULL and numeric values). + * + * Note that no change is required within the calling program as long as it includes `xxhash.h` : + * regular symbol name will be automatically translated by this header. + */ #ifdef XXH_NAMESPACE # define XXH_CAT(A,B) A##B # define XXH_NAME2(A,B) XXH_CAT(A,B) @@ -149,18 +153,18 @@ regular symbol name will be automatically translated by this header. ***************************************/ #define XXH_VERSION_MAJOR 0 #define XXH_VERSION_MINOR 6 -#define XXH_VERSION_RELEASE 2 +#define XXH_VERSION_RELEASE 5 #define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) XXH_PUBLIC_API unsigned XXH_versionNumber (void); /*-********************************************************************** -* 32-bits hash +* 32-bit hash ************************************************************************/ typedef unsigned int XXH32_hash_t; /*! XXH32() : - Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". + Calculate the 32-bit hash of sequence "length" bytes stored at memory address "input". The memory between input & input+length must be valid (allocated and read-accessible). "seed" can be used to alter the result predictably. Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s */ @@ -177,26 +181,25 @@ XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); /* -These functions generate the xxHash of an input provided in multiple segments. -Note that, for small input, they are slower than single-call functions, due to state management. -For small input, prefer `XXH32()` and `XXH64()` . - -XXH state must first be allocated, using XXH*_createState() . - -Start a new hash by initializing state with a seed, using XXH*_reset(). - -Then, feed the hash state by calling XXH*_update() as many times as necessary. -Obviously, input must be allocated and read accessible. -The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. - -Finally, a hash value can be produced anytime, by using XXH*_digest(). -This function returns the nn-bits hash as an int or long long. - -It's still possible to continue inserting input into the hash state after a digest, -and generate some new hashes later on, by calling again XXH*_digest(). - -When done, free XXH state space if it was allocated dynamically. -*/ + * Streaming functions generate the xxHash of an input provided in multiple segments. + * Note that, for small input, they are slower than single-call functions, due to state management. + * For small inputs, prefer `XXH32()` and `XXH64()`, which are better optimized. + * + * XXH state must first be allocated, using XXH*_createState() . + * + * Start a new hash by initializing state with a seed, using XXH*_reset(). + * + * Then, feed the hash state by calling XXH*_update() as many times as necessary. + * The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + * + * Finally, a hash value can be produced anytime, by using XXH*_digest(). 
+ * This function returns the nn-bits hash as an int or long long. + * + * It's still possible to continue inserting input into the hash state after a digest, + * and generate some new hashes later on, by calling again XXH*_digest(). + * + * When done, free XXH state space if it was allocated dynamically. + */ /*====== Canonical representation ======*/ @@ -205,22 +208,22 @@ XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); /* Default result type for XXH functions are primitive unsigned 32 and 64 bits. -* The canonical representation uses human-readable write convention, aka big-endian (large digits first). -* These functions allow transformation of hash result into and from its canonical format. -* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. -*/ + * The canonical representation uses human-readable write convention, aka big-endian (large digits first). + * These functions allow transformation of hash result into and from its canonical format. + * This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. + */ #ifndef XXH_NO_LONG_LONG /*-********************************************************************** -* 64-bits hash +* 64-bit hash ************************************************************************/ typedef unsigned long long XXH64_hash_t; /*! XXH64() : - Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". + Calculate the 64-bit hash of sequence of length "len" stored at memory address "input". "seed" can be used to alter the result predictably. - This function runs faster on 64-bits systems, but slower on 32-bits systems (see benchmark). + This function runs faster on 64-bit systems, but slower on 32-bit systems (see benchmark). */ XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); @@ -241,18 +244,49 @@ XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src #endif /* XXH_NO_LONG_LONG */ + #ifdef XXH_STATIC_LINKING_ONLY /* ================================================================================================ - This section contains definitions which are not guaranteed to remain stable. + This section contains declarations which are not guaranteed to remain stable. They may change in future versions, becoming incompatible with a different version of the library. - They shall only be used with static linking. - Never use these definitions in association with dynamic linking ! + These declarations should only be used with static linking. + Never use them in association with dynamic linking ! =================================================================================================== */ -/* These definitions are only meant to make possible - static allocation of XXH state, on stack or in a struct for example. - Never use members directly. */ +/* These definitions are only present to allow + * static allocation of XXH state, on stack or in a struct for example. + * Never **ever** use members directly. 
+ */
+
+#if !defined (__VMS) \
+  && (defined (__cplusplus) \
+  || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#   include <stdint.h>
+
+struct XXH32_state_s {
+   uint32_t total_len_32;
+   uint32_t large_len;
+   uint32_t v1;
+   uint32_t v2;
+   uint32_t v3;
+   uint32_t v4;
+   uint32_t mem32[4];
+   uint32_t memsize;
+   uint32_t reserved;   /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH32_state_t */
+
+struct XXH64_state_s {
+   uint64_t total_len;
+   uint64_t v1;
+   uint64_t v2;
+   uint64_t v3;
+   uint64_t v4;
+   uint64_t mem64[4];
+   uint32_t memsize;
+   uint32_t reserved[2];  /* never read nor write, might be removed in a future version */
+};   /* typedef'd to XXH64_state_t */
+
+# else
 
 struct XXH32_state_s {
    unsigned total_len_32;
@@ -261,25 +295,28 @@ struct XXH32_state_s {
    unsigned v2;
    unsigned v3;
    unsigned v4;
-   unsigned mem32[4];   /* buffer defined as U32 for alignment */
+   unsigned mem32[4];
    unsigned memsize;
-   unsigned reserved;   /* never read nor write, will be removed in a future version */
+   unsigned reserved;   /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH32_state_t */
 
-#ifndef XXH_NO_LONG_LONG   /* remove 64-bits support */
+# ifndef XXH_NO_LONG_LONG   /* remove 64-bit support */
 struct XXH64_state_s {
    unsigned long long total_len;
    unsigned long long v1;
    unsigned long long v2;
    unsigned long long v3;
    unsigned long long v4;
-   unsigned long long mem64[4];   /* buffer defined as U64 for alignment */
+   unsigned long long mem64[4];
    unsigned memsize;
-   unsigned reserved[2];          /* never read nor write, will be removed in a future version */
+   unsigned reserved[2];  /* never read nor write, might be removed in a future version */
 };   /* typedef'd to XXH64_state_t */
-#endif
+# endif
+
+# endif
+
 
-#ifdef XXH_PRIVATE_API
+#if defined(XXH_INLINE_ALL) || defined(XXH_PRIVATE_API)
 #  include "xxhash.c"    /* include xxhash function bodies as `static`, for inlining */
 #endif
diff --git a/cpp/src/arrow/visitor_inline.h b/cpp/src/arrow/visitor_inline.h
index 8bc01bfd1093d..b6fc1f1ff2bfb 100644
--- a/cpp/src/arrow/visitor_inline.h
+++ b/cpp/src/arrow/visitor_inline.h
@@ -24,7 +24,9 @@
 #include "arrow/status.h"
 #include "arrow/tensor.h"
 #include "arrow/type.h"
+#include "arrow/util/bit-util.h"
 #include "arrow/util/checked_cast.h"
+#include "arrow/util/string_view.h"
 
 namespace arrow {
 
@@ -110,6 +112,146 @@ inline Status VisitArrayInline(const Array& array, VISITOR* visitor) {
   return Status::NotImplemented("Type not implemented");
 }
 
+// Visit an array's data values, in order, without overhead.
+//
+// The Visit function's `visitor` argument should define two public methods:
+// - Status VisitNull()
+// - Status VisitValue(<scalar>)
+//
+// The scalar value's type depends on the array data type:
+// - the type's `c_type`, if any
+// - for boolean arrays, a `bool`
+// - for binary, string and fixed-size binary arrays, a `util::string_view`
+
+template <typename T, typename Enable = void>
+struct ArrayDataVisitor {};
+
+template <>
+struct ArrayDataVisitor<BooleanType> {
+  template <typename Visitor>
+  static Status Visit(const ArrayData& arr, Visitor* visitor) {
+    if (arr.null_count != 0) {
+      internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length);
+      internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length);
+      for (int64_t i = 0; i < arr.length; ++i) {
+        const bool is_null = valid_reader.IsNotSet();
+        if (is_null) {
+          ARROW_RETURN_NOT_OK(visitor->VisitNull());
+        } else {
+          ARROW_RETURN_NOT_OK(visitor->VisitValue(value_reader.IsSet()));
+        }
+        valid_reader.Next();
+        value_reader.Next();
+      }
+    } else {
+      internal::BitmapReader value_reader(arr.buffers[1]->data(), arr.offset, arr.length);
+      for (int64_t i = 0; i < arr.length; ++i) {
+        ARROW_RETURN_NOT_OK(visitor->VisitValue(value_reader.IsSet()));
+        value_reader.Next();
+      }
+    }
+    return Status::OK();
+  }
+};
+
+template <typename T>
+struct ArrayDataVisitor<T, enable_if_has_c_type<T>> {
+  template <typename Visitor>
+  static Status Visit(const ArrayData& arr, Visitor* visitor) {
+    using c_type = typename T::c_type;
+    const c_type* data = arr.GetValues<c_type>(1);
+
+    if (arr.null_count != 0) {
+      internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length);
+      for (int64_t i = 0; i < arr.length; ++i) {
+        const bool is_null = valid_reader.IsNotSet();
+        if (is_null) {
+          ARROW_RETURN_NOT_OK(visitor->VisitNull());
+        } else {
+          ARROW_RETURN_NOT_OK(visitor->VisitValue(data[i]));
+        }
+        valid_reader.Next();
+      }
+    } else {
+      for (int64_t i = 0; i < arr.length; ++i) {
+        ARROW_RETURN_NOT_OK(visitor->VisitValue(data[i]));
+      }
+    }
+    return Status::OK();
+  }
+};
+
+template <typename T>
+struct ArrayDataVisitor<T, enable_if_binary<T>> {
+  template <typename Visitor>
+  static Status Visit(const ArrayData& arr, Visitor* visitor) {
+    constexpr uint8_t empty_value = 0;
+
+    const int32_t* offsets = arr.GetValues<int32_t>(1);
+    const uint8_t* data;
+    if (!arr.buffers[2]) {
+      data = &empty_value;
+    } else {
+      data = arr.GetValues<uint8_t>(2);
+    }
+
+    if (arr.null_count != 0) {
+      internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length);
+      for (int64_t i = 0; i < arr.length; ++i) {
+        const bool is_null = valid_reader.IsNotSet();
+        valid_reader.Next();
+        if (is_null) {
+          ARROW_RETURN_NOT_OK(visitor->VisitNull());
+        } else {
+          auto value = util::string_view(reinterpret_cast<const char*>(data + offsets[i]),
+                                         offsets[i + 1] - offsets[i]);
+          ARROW_RETURN_NOT_OK(visitor->VisitValue(value));
+        }
+      }
+    } else {
+      for (int64_t i = 0; i < arr.length; ++i) {
+        auto value = util::string_view(reinterpret_cast<const char*>(data + offsets[i]),
+                                       offsets[i + 1] - offsets[i]);
+        ARROW_RETURN_NOT_OK(visitor->VisitValue(value));
+      }
+    }
+    return Status::OK();
+  }
+};
+
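To make the visitor contract concrete, here is a minimal visitor satisfying the two-method interface documented above (my own example; the struct and function names are hypothetical):

#include <cstdint>

#include "arrow/array.h"
#include "arrow/status.h"
#include "arrow/type.h"
#include "arrow/visitor_inline.h"

namespace arrow {

struct SumVisitor {
  int64_t sum = 0;
  int64_t nulls = 0;

  Status VisitNull() {
    ++nulls;
    return Status::OK();
  }
  // Int64Type's c_type is int64_t, so this is the scalar type passed in.
  Status VisitValue(int64_t value) {
    sum += value;
    return Status::OK();
  }
};

inline Status SumInt64(const ArrayData& data, SumVisitor* visitor) {
  // Dispatches to the c_type specialization above, null-bitmap aware.
  return ArrayDataVisitor<Int64Type>::Visit(data, visitor);
}

}  // namespace arrow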
+template <typename T>
+struct ArrayDataVisitor<T, enable_if_fixed_size_binary<T>> {
+  template <typename Visitor>
+  static Status Visit(const ArrayData& arr, Visitor* visitor) {
+    const auto& fw_type = internal::checked_cast<const FixedSizeBinaryType&>(*arr.type);
+
+    const int32_t byte_width = fw_type.byte_width();
+    const uint8_t* data = arr.GetValues<uint8_t>(1);
+
+    if (arr.null_count != 0) {
+      internal::BitmapReader valid_reader(arr.buffers[0]->data(), arr.offset, arr.length);
+      for (int64_t i = 0; i < arr.length; ++i) {
+        const bool is_null = valid_reader.IsNotSet();
+        valid_reader.Next();
+        if (is_null) {
+          ARROW_RETURN_NOT_OK(visitor->VisitNull());
+        } else {
+          auto value = util::string_view(reinterpret_cast<const char*>(data), byte_width);
+          ARROW_RETURN_NOT_OK(visitor->VisitValue(value));
+        }
+        data += byte_width;
+      }
+    } else {
+      for (int64_t i = 0; i < arr.length; ++i) {
+        auto value = util::string_view(reinterpret_cast<const char*>(data), byte_width);
+        ARROW_RETURN_NOT_OK(visitor->VisitValue(value));
+        data += byte_width;
+      }
+    }
+    return Status::OK();
+  }
+};
+
 }  // namespace arrow
 
 #endif  // ARROW_VISITOR_INLINE_H
diff --git a/cpp/src/plasma/CMakeLists.txt b/cpp/src/plasma/CMakeLists.txt
index adf27de01d62c..f9ed4e3d4e3f5 100644
--- a/cpp/src/plasma/CMakeLists.txt
+++ b/cpp/src/plasma/CMakeLists.txt
@@ -78,8 +78,7 @@ set(PLASMA_SRCS
   malloc.cc
   plasma.cc
   protocol.cc
-  thirdparty/ae/ae.c
-  thirdparty/xxhash.cc)
+  thirdparty/ae/ae.c)
 
 set(PLASMA_LINK_LIBS arrow_shared)
 set(PLASMA_STATIC_LINK_LIBS arrow_static)
@@ -115,12 +114,6 @@ if ("${COMPILER_FAMILY}" STREQUAL "clang")
     -Wno-null-pointer-arithmetic \
     -Wno-shorten-64-to-32 \
     -Wno-unused-macros")
-
-  set_property(SOURCE thirdparty/xxhash.cc
-               APPEND_STRING
-               PROPERTY COMPILE_FLAGS
-               "-Wno-unused-macros \
--Wno-unreachable-code")
 endif()
 
 if ("${COMPILER_FAMILY}" STREQUAL "gcc")
diff --git a/cpp/src/plasma/client.cc b/cpp/src/plasma/client.cc
index d37b033f8fce1..0c96be060e1c1 100644
--- a/cpp/src/plasma/client.cc
+++ b/cpp/src/plasma/client.cc
@@ -62,8 +62,9 @@ using arrow::gpu::CudaContext;
 using arrow::gpu::CudaDeviceManager;
 #endif
 
-#define XXH_STATIC_LINKING_ONLY
-#include "thirdparty/xxhash.h"
+#define XXH_INLINE_ALL 1
+#define XXH_NAMESPACE plasma_client_
+#include "arrow/util/xxhash/xxhash.h"
 
 #define XXH64_DEFAULT_SEED 0
diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt
index 88608b35fac6a..13918d55fca87 100644
--- a/dev/release/rat_exclude_files.txt
+++ b/dev/release/rat_exclude_files.txt
@@ -20,6 +20,8 @@ cpp/src/arrow/util/variant/recursive_wrapper.h
 cpp/src/arrow/util/variant/variant_cast.h
 cpp/src/arrow/util/variant/variant_io.h
 cpp/src/arrow/util/variant/variant_visitor.h
+cpp/src/arrow/util/xxhash/xxhash.c
+cpp/src/arrow/util/xxhash/xxhash.h
 cpp/build-support/asan_symbolize.py
 cpp/build-support/cpplint.py
 cpp/build-support/clang_format_exclusions.txt
@@ -41,8 +43,6 @@ cpp/src/plasma/thirdparty/ae/ae_select.c
 cpp/src/plasma/thirdparty/ae/config.h
 cpp/src/plasma/thirdparty/ae/zmalloc.h
 cpp/src/plasma/thirdparty/dlmalloc.c
-cpp/src/plasma/thirdparty/xxhash.cc
-cpp/src/plasma/thirdparty/xxhash.h
 dev/release/rat_exclude_files.txt
 dev/tasks/linux-packages/debian.ubuntu-trusty/compat
 dev/tasks/linux-packages/debian.ubuntu-trusty/control
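A rough sketch of what the two new defines in client.cc accomplish (the helper below is hypothetical; the renaming follows the XXH_NAMESPACE machinery shown in the vendored header):

#include <cstddef>
#include <cstdint>

#define XXH_INLINE_ALL 1
#define XXH_NAMESPACE plasma_client_
#include "arrow/util/xxhash/xxhash.h"

// XXH_NAMESPACE #defines XXH64 to plasma_client_XXH64 via XXH_NAME2, and
// XXH_INLINE_ALL makes the function bodies static to this translation unit,
// so another copy of xxHash linked into the process cannot collide.
inline uint64_t HashObjectBuffer(const uint8_t* data, size_t size) {
  return XXH64(data, size, 0);
}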