From 19eec4d102a88899b12ad2f0259de8cb1092a846 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 1 Feb 2016 16:23:54 -0800 Subject: [PATCH] PARQUET-438: Update RLE encoding tools and add unit tests from Impala This also changes the compiler options to use only SSE3. We can explore benchmarking to optionally enable SSE4 on supported systems for performance gains in the future. @nongli would it be OK if I remove the unused delta encodings (they depend on code that is not in Impala anymore, or never was)? Are these encodings used extensively, or is it primarily PLAIN / DICTIONARY encoding? Author: Wes McKinney Closes #31 from wesm/PARQUET-438 and squashes the following commits: df9da64 [Wes McKinney] Add back (commented-out) ZigZag encode/decode impls 65c49bc [Wes McKinney] Fix comment 73c1dc1 [Wes McKinney] Fix cpplint errors post PARQUET-496 553ca0e [Wes McKinney] Fix cpplint errors 9528c83 [Wes McKinney] Updates RLE encoding routines and dependencies from upstream changes in Apache Impala (incubating). Add rle-test and bit-util-test modules unchanged from Impala. Change-Id: I6dd1ba027b316de945c56f70b390cf44dcf09f63 --- cpp/src/parquet/encodings/encodings.h | 7 +- cpp/src/parquet/util/CMakeLists.txt | 6 + cpp/src/parquet/util/bit-stream-utils.h | 108 +++-- .../parquet/util/bit-stream-utils.inline.h | 32 +- cpp/src/parquet/util/bit-util-test.cc | 164 +++++++ cpp/src/parquet/util/bit-util.h | 170 ++++++-- cpp/src/parquet/util/cpu-info.cc | 169 ++++++++ cpp/src/parquet/util/cpu-info.h | 108 +++++ cpp/src/parquet/util/rle-encoding.h | 288 +++++++------ cpp/src/parquet/util/rle-test.cc | 400 ++++++++++++++++++ cpp/src/parquet/util/sse-util.h | 191 +++++++++ 11 files changed, 1417 insertions(+), 226 deletions(-) create mode 100644 cpp/src/parquet/util/bit-util-test.cc create mode 100644 cpp/src/parquet/util/cpu-info.cc create mode 100644 cpp/src/parquet/util/cpu-info.h create mode 100644 cpp/src/parquet/util/rle-test.cc create mode 100644 cpp/src/parquet/util/sse-util.h diff --git a/cpp/src/parquet/encodings/encodings.h b/cpp/src/parquet/encodings/encodings.h index 50b6291adfbfb..b30146a2c5919 100644 --- a/cpp/src/parquet/encodings/encodings.h +++ b/cpp/src/parquet/encodings/encodings.h @@ -72,8 +72,9 @@ class Decoder { #include "parquet/encodings/plain-encoding.h" #include "parquet/encodings/dictionary-encoding.h" -#include "parquet/encodings/delta-bit-pack-encoding.h" -#include "parquet/encodings/delta-length-byte-array-encoding.h" -#include "parquet/encodings/delta-byte-array-encoding.h" +// The encoding tools changed and these are missing the ZigZag functions +// #include "parquet/encodings/delta-bit-pack-encoding.h" +// #include "parquet/encodings/delta-length-byte-array-encoding.h" +// #include "parquet/encodings/delta-byte-array-encoding.h" #endif // PARQUET_ENCODINGS_ENCODINGS_H diff --git a/cpp/src/parquet/util/CMakeLists.txt b/cpp/src/parquet/util/CMakeLists.txt index 8c15de1d1afde..1c86112ddec19 100644 --- a/cpp/src/parquet/util/CMakeLists.txt +++ b/cpp/src/parquet/util/CMakeLists.txt @@ -20,6 +20,8 @@ install(FILES bit-stream-utils.h bit-stream-utils.inline.h bit-util.h + cpu-info.h + sse-info.h compiler-util.h logging.h rle-encoding.h @@ -29,6 +31,7 @@ install(FILES add_library(parquet_util STATIC input_stream.cc + cpu-info.cc ) add_library(parquet_test_main @@ -47,3 +50,6 @@ else() pthread ) endif() + +ADD_PARQUET_TEST(bit-util-test) +ADD_PARQUET_TEST(rle-test) diff --git a/cpp/src/parquet/util/bit-stream-utils.h b/cpp/src/parquet/util/bit-stream-utils.h index
97ba71b833eb1..a02839dc3b438 100644 --- a/cpp/src/parquet/util/bit-stream-utils.h +++ b/cpp/src/parquet/util/bit-stream-utils.h @@ -15,26 +15,28 @@ // specific language governing permissions and limitations // under the License. +// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_BIT_STREAM_UTILS_H #define PARQUET_UTIL_BIT_STREAM_UTILS_H -#include #include #include +#include #include "parquet/util/compiler-util.h" -#include "parquet/util/bit-util.h" #include "parquet/util/logging.h" +#include "parquet/util/bit-util.h" namespace parquet_cpp { -// Utility class to write bit/byte streams. This class can write data to either be -// bit packed or byte aligned (and a single stream that has a mix of both). -// This class does not allocate memory. +/// Utility class to write bit/byte streams. This class can write data to either be +/// bit packed or byte aligned (and a single stream that has a mix of both). +/// This class does not allocate memory. class BitWriter { public: - // buffer: buffer to write bits to. Buffer should be preallocated with - // 'buffer_len' bytes. + /// buffer: buffer to write bits to. Buffer should be preallocated with + /// 'buffer_len' bytes. BitWriter(uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len) { @@ -47,56 +49,56 @@ class BitWriter { bit_offset_ = 0; } - // The number of current bytes written, including the current byte (i.e. may include a - // fraction of a byte). Includes buffered values. + /// The number of current bytes written, including the current byte (i.e. may include a + /// fraction of a byte). Includes buffered values. int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } uint8_t* buffer() const { return buffer_; } int buffer_len() const { return max_bytes_; } - // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit - // packed. Returns false if there was not enough space. num_bits must be <= 32. + /// Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit + /// packed. Returns false if there was not enough space. num_bits must be <= 32. bool PutValue(uint64_t v, int num_bits); - // Writes v to the next aligned byte using num_bytes. If T is larger than num_bytes, the - // extra high-order bytes will be ignored. Returns false if there was not enough space. + /// Writes v to the next aligned byte using num_bytes. If T is larger than + /// num_bytes, the extra high-order bytes will be ignored. Returns false if + /// there was not enough space. template bool PutAligned(T v, int num_bytes); - // Write a Vlq encoded int to the buffer. Returns false if there was not enough - // room. The value is written byte aligned. - // For more details on vlq: - // en.wikipedia.org/wiki/Variable-length_quantity - bool PutVlqInt(uint32_t v); - bool PutZigZagVlqInt(int32_t v); + /// Write a Vlq encoded int to the buffer. Returns false if there was not enough + /// room. The value is written byte aligned. + /// For more details on vlq: + /// en.wikipedia.org/wiki/Variable-length_quantity + bool PutVlqInt(int32_t v); - // Get a pointer to the next aligned byte and advance the underlying buffer - // by num_bytes. - // Returns NULL if there was not enough space. + /// Get a pointer to the next aligned byte and advance the underlying buffer + /// by num_bytes. + /// Returns NULL if there was not enough space. uint8_t* GetNextBytePtr(int num_bytes = 1); - // Flushes all buffered values to the buffer. Call this when done writing to the buffer. 
- // If 'align' is true, buffered_values_ is reset and any future writes will be written - // to the next byte boundary. + /// Flushes all buffered values to the buffer. Call this when done writing to + /// the buffer. If 'align' is true, buffered_values_ is reset and any future + /// writes will be written to the next byte boundary. void Flush(bool align = false); private: uint8_t* buffer_; int max_bytes_; - // Bit-packed values are initially written to this variable before being memcpy'd to - // buffer_. This is faster than writing values byte by byte directly to buffer_. + /// Bit-packed values are initially written to this variable before being memcpy'd to + /// buffer_. This is faster than writing values byte by byte directly to buffer_. uint64_t buffered_values_; int byte_offset_; // Offset in buffer_ int bit_offset_; // Offset in buffered_values_ }; -// Utility class to read bit/byte stream. This class can read bits or bytes -// that are either byte aligned or not. It also has utilities to read multiple -// bytes in one read (e.g. encoded int). +/// Utility class to read bit/byte stream. This class can read bits or bytes +/// that are either byte aligned or not. It also has utilities to read multiple +/// bytes in one read (e.g. encoded int). class BitReader { public: - // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. + /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. BitReader(const uint8_t* buffer, int buffer_len) : buffer_(buffer), max_bytes_(buffer_len), @@ -108,36 +110,48 @@ class BitReader { BitReader() : buffer_(NULL), max_bytes_(0) {} - // Gets the next value from the buffer. Returns true if 'v' could be read or false if - // there are not enough bytes left. num_bits must be <= 32. + void Reset(const uint8_t* buffer, int buffer_len) { + buffer_ = buffer; + max_bytes_ = buffer_len; + byte_offset_ = 0; + bit_offset_ = 0; + } + + /// Gets the next value from the buffer. Returns true if 'v' could be read or false if + /// there are not enough bytes left. num_bits must be <= 32. template bool GetValue(int num_bits, T* v); - // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a - // little-endian native type and big enough to store 'num_bytes'. The value is assumed - // to be byte-aligned so the stream will be advanced to the start of the next byte - // before 'v' is read. Returns false if there are not enough bytes left. + /// Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T + /// needs to be a little-endian native type and big enough to store + /// 'num_bytes'. The value is assumed to be byte-aligned so the stream will + /// be advanced to the start of the next byte before 'v' is read. Returns + /// false if there are not enough bytes left. template bool GetAligned(int num_bytes, T* v); - // Reads a vlq encoded int from the stream. The encoded int must start at the - // beginning of a byte. Return false if there were not enough bytes in the buffer. - bool GetVlqInt(uint64_t* v); - bool GetZigZagVlqInt(int64_t* v); + /// Reads a vlq encoded int from the stream. The encoded int must start at + /// the beginning of a byte. Return false if there were not enough bytes in + /// the buffer. + bool GetVlqInt(int32_t* v); - // Returns the number of bytes left in the stream, not including the current byte (i.e., - // there may be an additional fraction of a byte). 
+ /// Returns the number of bytes left in the stream, not including the current + /// byte (i.e., there may be an additional fraction of a byte). int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } - // Maximum byte length of a vlq encoded int + /// Maximum byte length of a vlq encoded int static const int MAX_VLQ_BYTE_LEN = 5; + // TODO(nongli): implementations to be fixed given changes in Impala + // bool GetZigZagVlqInt(int64_t* v); + // bool PutZigZagVlqInt(int32_t v); + private: const uint8_t* buffer_; int max_bytes_; - // Bytes are memcpy'd from buffer_ and values are read from this variable. This is - // faster than reading values byte by byte directly from buffer_. + /// Bytes are memcpy'd from buffer_ and values are read from this variable. This is + /// faster than reading values byte by byte directly from buffer_. uint64_t buffered_values_; int byte_offset_; // Offset in buffer_ @@ -146,4 +160,4 @@ class BitReader { } // namespace parquet_cpp -#endif +#endif // PARQUET_UTIL_BIT_STREAM_UTILS_H diff --git a/cpp/src/parquet/util/bit-stream-utils.inline.h b/cpp/src/parquet/util/bit-stream-utils.inline.h index 6171b1fb04f6a..77e2d48817110 100644 --- a/cpp/src/parquet/util/bit-stream-utils.inline.h +++ b/cpp/src/parquet/util/bit-stream-utils.inline.h @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H #define PARQUET_UTIL_BIT_STREAM_UTILS_INLINE_H @@ -73,7 +75,7 @@ inline bool BitWriter::PutAligned(T val, int num_bytes) { return true; } -inline bool BitWriter::PutVlqInt(uint32_t v) { +inline bool BitWriter::PutVlqInt(int32_t v) { bool result = true; while ((v & 0xFFFFFF80) != 0L) { result &= PutAligned((v & 0x7F) | 0x80, 1); @@ -83,13 +85,9 @@ inline bool BitWriter::PutVlqInt(uint32_t v) { return result; } -inline bool BitWriter::PutZigZagVlqInt(int32_t v) { - uint32_t u = (v << 1) ^ (v >> 31); - return PutVlqInt(u); -} - template inline bool BitReader::GetValue(int num_bits, T* v) { + DCHECK(buffer_ != NULL); // TODO: revisit this limit if necessary DCHECK_LE(num_bits, 32); DCHECK_LE(num_bits, sizeof(T) * 8); @@ -140,7 +138,7 @@ inline bool BitReader::GetAligned(int num_bytes, T* v) { return true; } -inline bool BitReader::GetVlqInt(uint64_t* v) { +inline bool BitReader::GetVlqInt(int32_t* v) { *v = 0; int shift = 0; int num_bytes = 0; @@ -154,12 +152,20 @@ inline bool BitReader::GetVlqInt(uint64_t* v) { return true; } -inline bool BitReader::GetZigZagVlqInt(int64_t* v) { - uint64_t u; - if (!GetVlqInt(&u)) return false; - *reinterpret_cast(v) = (u >> 1) ^ -(u & 1); - return true; -} +// TODO(nongli): review/test these implementations given divergence in Impala +// functions + +// inline bool BitWriter::PutZigZagVlqInt(int32_t v) { +// uint32_t u = (v << 1) ^ (v >> 31); +// return PutVlqInt(u); +// } + +// inline bool BitReader::GetZigZagVlqInt(int64_t* v) { +// uint64_t u; +// if (!GetVlqInt(&u)) return false; +// *reinterpret_cast(v) = (u >> 1) ^ -(u & 1); +// return true; +// } } // namespace parquet_cpp diff --git a/cpp/src/parquet/util/bit-util-test.cc b/cpp/src/parquet/util/bit-util-test.cc new file mode 100644 index 0000000000000..78efe1a85536e --- /dev/null +++ b/cpp/src/parquet/util/bit-util-test.cc @@ -0,0 +1,164 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29 + +#include +#include +#include +#include + +#include +#include + +#include "parquet/util/bit-util.h" +#include "parquet/util/cpu-info.h" + +namespace parquet_cpp { + +TEST(BitUtil, Ceil) { + EXPECT_EQ(BitUtil::Ceil(0, 1), 0); + EXPECT_EQ(BitUtil::Ceil(1, 1), 1); + EXPECT_EQ(BitUtil::Ceil(1, 2), 1); + EXPECT_EQ(BitUtil::Ceil(1, 8), 1); + EXPECT_EQ(BitUtil::Ceil(7, 8), 1); + EXPECT_EQ(BitUtil::Ceil(8, 8), 1); + EXPECT_EQ(BitUtil::Ceil(9, 8), 2); + EXPECT_EQ(BitUtil::Ceil(9, 9), 1); + EXPECT_EQ(BitUtil::Ceil(10000000000, 10), 1000000000); + EXPECT_EQ(BitUtil::Ceil(10, 10000000000), 1); + EXPECT_EQ(BitUtil::Ceil(100000000000, 10000000000), 10); +} + +TEST(BitUtil, RoundUp) { + EXPECT_EQ(BitUtil::RoundUp(0, 1), 0); + EXPECT_EQ(BitUtil::RoundUp(1, 1), 1); + EXPECT_EQ(BitUtil::RoundUp(1, 2), 2); + EXPECT_EQ(BitUtil::RoundUp(6, 2), 6); + EXPECT_EQ(BitUtil::RoundUp(7, 3), 9); + EXPECT_EQ(BitUtil::RoundUp(9, 9), 9); + EXPECT_EQ(BitUtil::RoundUp(10000000001, 10), 10000000010); + EXPECT_EQ(BitUtil::RoundUp(10, 10000000000), 10000000000); + EXPECT_EQ(BitUtil::RoundUp(100000000000, 10000000000), 100000000000); +} + +TEST(BitUtil, RoundDown) { + EXPECT_EQ(BitUtil::RoundDown(0, 1), 0); + EXPECT_EQ(BitUtil::RoundDown(1, 1), 1); + EXPECT_EQ(BitUtil::RoundDown(1, 2), 0); + EXPECT_EQ(BitUtil::RoundDown(6, 2), 6); + EXPECT_EQ(BitUtil::RoundDown(7, 3), 6); + EXPECT_EQ(BitUtil::RoundDown(9, 9), 9); + EXPECT_EQ(BitUtil::RoundDown(10000000001, 10), 10000000000); + EXPECT_EQ(BitUtil::RoundDown(10, 10000000000), 0); + EXPECT_EQ(BitUtil::RoundDown(100000000000, 10000000000), 100000000000); +} + +TEST(BitUtil, Popcount) { + EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4); + EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(0 1 0 1 0 1 0 1)), 4); + EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6); + EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 0 1 0 1)), 6); + EXPECT_EQ(BitUtil::Popcount(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8); + EXPECT_EQ(BitUtil::PopcountNoHw(BOOST_BINARY(1 1 1 1 1 1 1 1)), 8); + EXPECT_EQ(BitUtil::Popcount(0), 0); + EXPECT_EQ(BitUtil::PopcountNoHw(0), 0); +} + +TEST(BitUtil, TrailingBits) { + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 0), 0); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 1), 1); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 64), + BOOST_BINARY(1 1 1 1 1 1 1 1)); + EXPECT_EQ(BitUtil::TrailingBits(BOOST_BINARY(1 1 1 1 1 1 1 1), 100), + BOOST_BINARY(1 1 1 1 1 1 1 1)); + EXPECT_EQ(BitUtil::TrailingBits(0, 1), 0); + EXPECT_EQ(BitUtil::TrailingBits(0, 64), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 0), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 63), 0); + EXPECT_EQ(BitUtil::TrailingBits(1LL << 63, 64), 1LL << 63); +} + +TEST(BitUtil, ByteSwap) { + 
EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x11223344)), 0x44332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap( + static_cast(0x1122334455667788)), 0x8877665544332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap( + static_cast(0x1122334455667788)), 0x8877665544332211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); + + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0)), 0); + EXPECT_EQ(BitUtil::ByteSwap(static_cast(0x1122)), 0x2211); +} + +TEST(BitUtil, Log2) { + EXPECT_EQ(BitUtil::Log2(1), 0); + EXPECT_EQ(BitUtil::Log2(2), 1); + EXPECT_EQ(BitUtil::Log2(3), 2); + EXPECT_EQ(BitUtil::Log2(4), 2); + EXPECT_EQ(BitUtil::Log2(5), 3); + EXPECT_EQ(BitUtil::Log2(INT_MAX), 31); + EXPECT_EQ(BitUtil::Log2(UINT_MAX), 32); + EXPECT_EQ(BitUtil::Log2(ULLONG_MAX), 64); +} + +TEST(BitUtil, RoundUpToPowerOf2) { + EXPECT_EQ(BitUtil::RoundUpToPowerOf2(7, 8), 8); + EXPECT_EQ(BitUtil::RoundUpToPowerOf2(8, 8), 8); + EXPECT_EQ(BitUtil::RoundUpToPowerOf2(9, 8), 16); +} + +TEST(BitUtil, RoundDownToPowerOf2) { + EXPECT_EQ(BitUtil::RoundDownToPowerOf2(7, 8), 0); + EXPECT_EQ(BitUtil::RoundDownToPowerOf2(8, 8), 8); + EXPECT_EQ(BitUtil::RoundDownToPowerOf2(9, 8), 8); +} + +TEST(BitUtil, RoundUpDown) { + EXPECT_EQ(BitUtil::RoundUpNumBytes(7), 1); + EXPECT_EQ(BitUtil::RoundUpNumBytes(8), 1); + EXPECT_EQ(BitUtil::RoundUpNumBytes(9), 2); + EXPECT_EQ(BitUtil::RoundDownNumBytes(7), 0); + EXPECT_EQ(BitUtil::RoundDownNumBytes(8), 1); + EXPECT_EQ(BitUtil::RoundDownNumBytes(9), 1); + + EXPECT_EQ(BitUtil::RoundUpNumi32(31), 1); + EXPECT_EQ(BitUtil::RoundUpNumi32(32), 1); + EXPECT_EQ(BitUtil::RoundUpNumi32(33), 2); + EXPECT_EQ(BitUtil::RoundDownNumi32(31), 0); + EXPECT_EQ(BitUtil::RoundDownNumi32(32), 1); + EXPECT_EQ(BitUtil::RoundDownNumi32(33), 1); + + EXPECT_EQ(BitUtil::RoundUpNumi64(63), 1); + EXPECT_EQ(BitUtil::RoundUpNumi64(64), 1); + EXPECT_EQ(BitUtil::RoundUpNumi64(65), 2); + EXPECT_EQ(BitUtil::RoundDownNumi64(63), 0); + EXPECT_EQ(BitUtil::RoundDownNumi64(64), 1); + EXPECT_EQ(BitUtil::RoundDownNumi64(65), 1); +} + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/util/bit-util.h b/cpp/src/parquet/util/bit-util.h index 593d1c2cfe447..4db585a0ccc67 100644 --- a/cpp/src/parquet/util/bit-util.h +++ b/cpp/src/parquet/util/bit-util.h @@ -15,47 +15,132 @@ // specific language governing permissions and limitations // under the License. +// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_BIT_UTIL_H #define PARQUET_UTIL_BIT_UTIL_H #if defined(__APPLE__) - #include +#include #else - #include +#include #endif +#include + #include "parquet/util/compiler-util.h" -#include "parquet/util/logging.h" +#include "parquet/util/cpu-info.h" +#include "parquet/util/sse-util.h" namespace parquet_cpp { -// Utility class to do standard bit tricks -// TODO: is this in boost or something else like that? +using boost::make_unsigned; + +/// Utility class to do standard bit tricks +/// TODO: is this in boost or something else like that? 
class BitUtil { public: - // Returns the ceil of value/divisor - static inline int Ceil(int value, int divisor) { + /// Returns the ceil of value/divisor + static inline int64_t Ceil(int64_t value, int64_t divisor) { return value / divisor + (value % divisor != 0); } - // Returns 'value' rounded up to the nearest multiple of 'factor' - static inline int RoundUp(int value, int factor) { + /// Returns 'value' rounded up to the nearest multiple of 'factor' + static inline int64_t RoundUp(int64_t value, int64_t factor) { return (value + (factor - 1)) / factor * factor; } - // Returns 'value' rounded down to the nearest multiple of 'factor' - static inline int RoundDown(int value, int factor) { + /// Returns 'value' rounded down to the nearest multiple of 'factor' + static inline int64_t RoundDown(int64_t value, int64_t factor) { return (value / factor) * factor; } - // Returns the number of set bits in x - static inline int Popcount(uint64_t x) { + /// Returns the smallest power of two that contains v. Taken from + /// http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 + /// TODO: Pick a better name, as it is not clear what happens when the input is + /// already a power of two. + static inline int64_t NextPowerOfTwo(int64_t v) { + --v; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + ++v; + return v; + } + + /// Returns 'value' rounded up to the nearest multiple of 'factor' when factor is + /// a power of two + static inline int RoundUpToPowerOf2(int value, int factor) { + DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); + return (value + (factor - 1)) & ~(factor - 1); + } + + static inline int RoundDownToPowerOf2(int value, int factor) { + DCHECK((factor > 0) && ((factor & (factor - 1)) == 0)); + return value & ~(factor - 1); + } + + /// Specialized round up and down functions for frequently used factors, + /// like 8 (bits->bytes), 32 (bits->i32), and 64 (bits->i64). + /// Returns the rounded up number of bytes that fit the number of bits. + static inline uint32_t RoundUpNumBytes(uint32_t bits) { + return (bits + 7) >> 3; + } + + /// Returns the rounded down number of bytes that fit the number of bits. + static inline uint32_t RoundDownNumBytes(uint32_t bits) { + return bits >> 3; + } + + /// Returns the rounded up to 32 multiple. Used for conversions of bits to i32. + static inline uint32_t RoundUpNumi32(uint32_t bits) { + return (bits + 31) >> 5; + } + + /// Returns the rounded up 32 multiple. + static inline uint32_t RoundDownNumi32(uint32_t bits) { + return bits >> 5; + } + + /// Returns the rounded up to 64 multiple. Used for conversions of bits to i64. + static inline uint32_t RoundUpNumi64(uint32_t bits) { + return (bits + 63) >> 6; + } + + /// Returns the rounded down to 64 multiple. + static inline uint32_t RoundDownNumi64(uint32_t bits) { + return bits >> 6; + } + + /// Non hw accelerated pop count. + /// TODO: we don't use this in any perf sensitive code paths currently. There + /// might be a much faster way to implement this. + static inline int PopcountNoHw(uint64_t x) { int count = 0; for (; x != 0; ++count) x &= x-1; return count; } - // Returns the 'num_bits' least-significant bits of 'v'. 
+ /// Returns the number of set bits in x + static inline int Popcount(uint64_t x) { + if (LIKELY(CpuInfo::IsSupported(CpuInfo::POPCNT))) { + return POPCNT_popcnt_u64(x); + } else { + return PopcountNoHw(x); + } + } + + // Compute correct population count for various-width signed integers + template + static inline int PopcountSigned(T v) { + // Converting to same-width unsigned then extending preserves the bit pattern. + return BitUtil::Popcount(static_cast::type>(v)); + } + + /// Returns the 'num_bits' least-significant bits of 'v'. static inline uint64_t TrailingBits(uint64_t v, int num_bits) { if (UNLIKELY(num_bits == 0)) return 0; if (UNLIKELY(num_bits >= 64)) return v; @@ -63,11 +148,12 @@ class BitUtil { return (v << n) >> n; } - // Returns ceil(log2(x)). - // TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up - // in a hot path. + /// Returns ceil(log2(x)). + /// TODO: this could be faster if we use __builtin_clz. Fix this if this ever shows up + /// in a hot path. static inline int Log2(uint64_t x) { - if (x == 0) return 0; + DCHECK_GT(x, 0); + if (x == 1) return 0; // Compute result = ceil(log2(x)) // = floor(log2(x - 1)) + 1, for x > 1 // by finding the position of the most significant bit (1-indexed) of x - 1 @@ -78,15 +164,7 @@ class BitUtil { return result; } - // Returns the minimum number of bits needed to represent the value of 'x' - static inline int NumRequiredBits(uint64_t x) { - for (int i = 63; i >= 0; --i) { - if (x & 1L << i) return i + 1; - } - return 0; - } - - // Swaps the byte order (i.e. endianess) + /// Swaps the byte order (i.e. endianess) static inline int64_t ByteSwap(int64_t value) { return __builtin_bswap64(value); } @@ -106,7 +184,7 @@ class BitUtil { return static_cast(ByteSwap(static_cast(value))); } - // Write the swapped bytes into dst. Src and st cannot overlap. + /// Write the swapped bytes into dst. Src and st cannot overlap. static inline void ByteSwap(void* dst, const void* src, int len) { switch (len) { case 1: @@ -134,8 +212,8 @@ class BitUtil { } } - // Converts to big endian format (if not already in big endian) from the - // machine's native endian format. + /// Converts to big endian format (if not already in big endian) from the + /// machine's native endian format. #if __BYTE_ORDER == __LITTLE_ENDIAN static inline int64_t ToBigEndian(int64_t value) { return ByteSwap(value); } static inline uint64_t ToBigEndian(uint64_t value) { return ByteSwap(value); } @@ -152,7 +230,7 @@ class BitUtil { static inline uint16_t ToBigEndian(uint16_t val) { return val; } #endif - // Converts from big endian format to the machine's native endian format. + /// Converts from big endian format to the machine's native endian format. 
#if __BYTE_ORDER == __LITTLE_ENDIAN static inline int64_t FromBigEndian(int64_t value) { return ByteSwap(value); } static inline uint64_t FromBigEndian(uint64_t value) { return ByteSwap(value); } @@ -168,6 +246,36 @@ class BitUtil { static inline int16_t FromBigEndian(int16_t val) { return val; } static inline uint16_t FromBigEndian(uint16_t val) { return val; } #endif + + // Logical right shift for signed integer types + // This is needed because the C >> operator does arithmetic right shift + // Negative shift amounts lead to undefined behavior + template + static T ShiftRightLogical(T v, int shift) { + // Conversion to unsigned ensures most significant bits always filled with 0's + return static_cast::type>(v) >> shift; + } + + // Get an specific bit of a numeric type + template + static inline int8_t GetBit(T v, int bitpos) { + T masked = v & (static_cast(0x1) << bitpos); + return static_cast(ShiftRightLogical(masked, bitpos)); + } + + // Set a specific bit to 1 + // Behavior when bitpos is negative is undefined + template + static T SetBit(T v, int bitpos) { + return v | (static_cast(0x1) << bitpos); + } + + // Set a specific bit to 0 + // Behavior when bitpos is negative is undefined + template + static T UnsetBit(T v, int bitpos) { + return v & ~(static_cast(0x1) << bitpos); + } }; } // namespace parquet_cpp diff --git a/cpp/src/parquet/util/cpu-info.cc b/cpp/src/parquet/util/cpu-info.cc new file mode 100644 index 0000000000000..610fb623ed042 --- /dev/null +++ b/cpp/src/parquet/util/cpu-info.cc @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. Pared down to a minimal set of +// functions needed for parquet-cpp + +#include "parquet/util/cpu-info.h" + +#ifdef __APPLE__ +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "parquet/exception.h" + +using boost::algorithm::contains; +using boost::algorithm::trim; +using std::max; +using std::string; + +namespace parquet_cpp { + +bool CpuInfo::initialized_ = false; +int64_t CpuInfo::hardware_flags_ = 0; +int64_t CpuInfo::original_hardware_flags_; +int64_t CpuInfo::cache_sizes_[L3_CACHE + 1]; +int64_t CpuInfo::cycles_per_ms_; +int CpuInfo::num_cores_ = 1; +string CpuInfo::model_name_ = "unknown"; // NOLINT + +static struct { + string name; + int64_t flag; +} flag_mappings[] = { + { "ssse3", CpuInfo::SSSE3 }, + { "sse4_1", CpuInfo::SSE4_1 }, + { "sse4_2", CpuInfo::SSE4_2 }, + { "popcnt", CpuInfo::POPCNT }, +}; +static const int64_t num_flags = sizeof(flag_mappings) / sizeof(flag_mappings[0]); + +// Helper function to parse for hardware flags. +// values contains a list of space-seperated flags. 
check to see if the flags we +// care about are present. +// Returns a bitmap of flags. +int64_t ParseCPUFlags(const string& values) { + int64_t flags = 0; + for (int i = 0; i < num_flags; ++i) { + if (contains(values, flag_mappings[i].name)) { + flags |= flag_mappings[i].flag; + } + } + return flags; +} + +void CpuInfo::Init() { + string line; + string name; + string value; + + float max_mhz = 0; + int num_cores = 0; + + memset(&cache_sizes_, 0, sizeof(cache_sizes_)); + + // Read from /proc/cpuinfo + std::ifstream cpuinfo("/proc/cpuinfo", std::ios::in); + while (cpuinfo) { + getline(cpuinfo, line); + size_t colon = line.find(':'); + if (colon != string::npos) { + name = line.substr(0, colon - 1); + value = line.substr(colon + 1, string::npos); + trim(name); + trim(value); + if (name.compare("flags") == 0) { + hardware_flags_ |= ParseCPUFlags(value); + } else if (name.compare("cpu MHz") == 0) { + // Every core will report a different speed. We'll take the max, assuming + // that when impala is running, the core will not be in a lower power state. + // TODO: is there a more robust way to do this, such as + // Window's QueryPerformanceFrequency() + float mhz = atof(value.c_str()); + max_mhz = max(mhz, max_mhz); + } else if (name.compare("processor") == 0) { + ++num_cores; + } else if (name.compare("model name") == 0) { + model_name_ = value; + } + } + } + if (cpuinfo.is_open()) cpuinfo.close(); + +#ifdef __APPLE__ + // On Mac OS X use sysctl() to get the cache sizes + size_t len = 0; + sysctlbyname("hw.cachesize", NULL, &len, NULL, 0); + uint64_t* data = static_cast(malloc(len)); + sysctlbyname("hw.cachesize", data, &len, NULL, 0); + DCHECK(len / sizeof(uint64_t) >= 3); + for (size_t i = 0; i < 3; ++i) { + cache_sizes_[i] = data[i]; + } +#else + // Call sysconf to query for the cache sizes + cache_sizes_[0] = sysconf(_SC_LEVEL1_DCACHE_SIZE); + cache_sizes_[1] = sysconf(_SC_LEVEL2_CACHE_SIZE); + cache_sizes_[2] = sysconf(_SC_LEVEL3_CACHE_SIZE); +#endif + + if (max_mhz != 0) { + cycles_per_ms_ = max_mhz * 1000; + } else { + cycles_per_ms_ = 1000000; + } + original_hardware_flags_ = hardware_flags_; + + if (num_cores > 0) { + num_cores_ = num_cores; + } else { + num_cores_ = 1; + } + + initialized_ = true; +} + +void CpuInfo::VerifyCpuRequirements() { + if (!CpuInfo::IsSupported(CpuInfo::SSSE3)) { + throw ParquetException("CPU does not support the Supplemental SSE3 instruction set"); + } +} + +void CpuInfo::EnableFeature(int64_t flag, bool enable) { + DCHECK(initialized_); + if (!enable) { + hardware_flags_ &= ~flag; + } else { + // Can't turn something on that can't be supported + DCHECK((original_hardware_flags_ & flag) != 0); + hardware_flags_ |= flag; + } +} + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/util/cpu-info.h b/cpp/src/parquet/util/cpu-info.h new file mode 100644 index 0000000000000..9026cde6630ec --- /dev/null +++ b/cpp/src/parquet/util/cpu-info.h @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. Pared down to a minimal set of +// functions needed for parquet-cpp + +#ifndef PARQUET_UTIL_CPU_INFO_H +#define PARQUET_UTIL_CPU_INFO_H + +#include +#include + +#include "parquet/util/logging.h" + +namespace parquet_cpp { + +/// CpuInfo is an interface to query for cpu information at runtime. The caller can +/// ask for the sizes of the caches and what hardware features are supported. +/// On Linux, this information is pulled from a couple of sys files (/proc/cpuinfo and +/// /sys/devices) +class CpuInfo { + public: + static const int64_t SSSE3 = (1 << 1); + static const int64_t SSE4_1 = (1 << 2); + static const int64_t SSE4_2 = (1 << 3); + static const int64_t POPCNT = (1 << 4); + + /// Cache enums for L1 (data), L2 and L3 + enum CacheLevel { + L1_CACHE = 0, + L2_CACHE = 1, + L3_CACHE = 2, + }; + + /// Initialize CpuInfo. + static void Init(); + + /// Determine if the CPU meets the minimum CPU requirements and if not, issue an error + /// and terminate. + static void VerifyCpuRequirements(); + + /// Returns all the flags for this cpu + static int64_t hardware_flags() { + DCHECK(initialized_); + return hardware_flags_; + } + + /// Returns whether of not the cpu supports this flag + inline static bool IsSupported(int64_t flag) { + DCHECK(initialized_); + return (hardware_flags_ & flag) != 0; + } + + /// Toggle a hardware feature on and off. It is not valid to turn on a feature + /// that the underlying hardware cannot support. This is useful for testing. + static void EnableFeature(int64_t flag, bool enable); + + /// Returns the size of the cache in KB at this cache level + static int64_t CacheSize(CacheLevel level) { + DCHECK(initialized_); + return cache_sizes_[level]; + } + + /// Returns the number of cpu cycles per millisecond + static int64_t cycles_per_ms() { + DCHECK(initialized_); + return cycles_per_ms_; + } + + /// Returns the number of cores (including hyper-threaded) on this machine. + static int num_cores() { + DCHECK(initialized_); + return num_cores_; + } + + /// Returns the model name of the cpu (e.g. Intel i7-2600) + static std::string model_name() { + DCHECK(initialized_); + return model_name_; + } + + private: + static bool initialized_; + static int64_t hardware_flags_; + static int64_t original_hardware_flags_; + static int64_t cache_sizes_[L3_CACHE + 1]; + static int64_t cycles_per_ms_; + static int num_cores_; + static std::string model_name_; // NOLINT +}; + +} // namespace parquet_cpp + +#endif // PARQUET_UTIL_CPU_INFO_H diff --git a/cpp/src/parquet/util/rle-encoding.h b/cpp/src/parquet/util/rle-encoding.h index e65368e7fc7a2..22b2c2fcaf0c6 100644 --- a/cpp/src/parquet/util/rle-encoding.h +++ b/cpp/src/parquet/util/rle-encoding.h @@ -15,74 +15,75 @@ // specific language governing permissions and limitations // under the License. 
+// From Apache Impala as of 2016-01-29 + #ifndef PARQUET_UTIL_RLE_ENCODING_H #define PARQUET_UTIL_RLE_ENCODING_H -#include #include +#include #include "parquet/util/compiler-util.h" #include "parquet/util/bit-stream-utils.inline.h" #include "parquet/util/bit-util.h" -#include "parquet/util/logging.h" namespace parquet_cpp { -// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs -// are sufficiently long, RLE is used, otherwise, the values are just bit-packed -// (literal encoding). -// For both types of runs, there is a byte-aligned indicator which encodes the length -// of the run and the type of the run. -// This encoding has the benefit that when there aren't any long enough runs, values -// are always decoded at fixed (can be precomputed) bit offsets OR both the value and -// the run length are byte aligned. This allows for very efficient decoding -// implementations. -// The encoding is: -// encoded-block := run* -// run := literal-run | repeated-run -// literal-run := literal-indicator < literal bytes > -// repeated-run := repeated-indicator < repeated value. padded to byte boundary > -// literal-indicator := varint_encode( number_of_groups << 1 | 1) -// repeated-indicator := varint_encode( number_of_repetitions << 1 ) +/// Utility classes to do run length encoding (RLE) for fixed bit width values. If runs +/// are sufficiently long, RLE is used, otherwise, the values are just bit-packed +/// (literal encoding). +/// For both types of runs, there is a byte-aligned indicator which encodes the length +/// of the run and the type of the run. +/// This encoding has the benefit that when there aren't any long enough runs, values +/// are always decoded at fixed (can be precomputed) bit offsets OR both the value and +/// the run length are byte aligned. This allows for very efficient decoding +/// implementations. +/// The encoding is: +/// encoded-block := run* +/// run := literal-run | repeated-run +/// literal-run := literal-indicator < literal bytes > +/// repeated-run := repeated-indicator < repeated value. padded to byte boundary > +/// literal-indicator := varint_encode( number_of_groups << 1 | 1) +/// repeated-indicator := varint_encode( number_of_repetitions << 1 ) // -// Each run is preceded by a varint. The varint's least significant bit is -// used to indicate whether the run is a literal run or a repeated run. The rest -// of the varint is used to determine the length of the run (eg how many times the -// value repeats). +/// Each run is preceded by a varint. The varint's least significant bit is +/// used to indicate whether the run is a literal run or a repeated run. The rest +/// of the varint is used to determine the length of the run (eg how many times the +/// value repeats). // -// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode -// in groups of 8), so that no matter the bit-width of the value, the sequence will end -// on a byte boundary without padding. -// Given that we know it is a multiple of 8, we store the number of 8-groups rather than -// the actual number of encoded ints. (This means that the total number of encoded values -// can not be determined from the encoded data, since the number of values in the last -// group may not be a multiple of 8). For the last group of literal runs, we pad -// the group to 8 with zeros. This allows for 8 at a time decoding on the read side -// without the need for additional checks. 
+/// In the case of literal runs, the run length is always a multiple of 8 (i.e. encode +/// in groups of 8), so that no matter the bit-width of the value, the sequence will end +/// on a byte boundary without padding. +/// Given that we know it is a multiple of 8, we store the number of 8-groups rather than +/// the actual number of encoded ints. (This means that the total number of encoded values +/// can not be determined from the encoded data, since the number of values in the last +/// group may not be a multiple of 8). For the last group of literal runs, we pad +/// the group to 8 with zeros. This allows for 8 at a time decoding on the read side +/// without the need for additional checks. // -// There is a break-even point when it is more storage efficient to do run length -// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes -// for both the repeated encoding or the literal encoding. This value can always -// be computed based on the bit-width. -// TODO: think about how to use this for strings. The bit packing isn't quite the same. +/// There is a break-even point when it is more storage efficient to do run length +/// encoding. For 1 bit-width values, that point is 8 values. They require 2 bytes +/// for both the repeated encoding or the literal encoding. This value can always +/// be computed based on the bit-width. +/// TODO: think about how to use this for strings. The bit packing isn't quite the same. // -// Examples with bit-width 1 (eg encoding booleans): -// ---------------------------------------- -// 100 1s followed by 100 0s: -// <1, padded to 1 byte>   <0, padded to 1 byte> -// - (total 4 bytes) +/// Examples with bit-width 1 (eg encoding booleans): +/// ---------------------------------------- +/// 100 1s followed by 100 0s: +/// <1, padded to 1 byte>   <0, padded to 1 byte> +/// - (total 4 bytes) // -// alternating 1s and 0s (200 total): -// 200 ints = 25 groups of 8 -// <25 bytes of values, bitpacked> -// (total 26 bytes, 1 byte overhead) +/// alternating 1s and 0s (200 total): +/// 200 ints = 25 groups of 8 +/// <25 bytes of values, bitpacked> +/// (total 26 bytes, 1 byte overhead) // -// Decoder class for RLE encoded data. +/// Decoder class for RLE encoded data. class RleDecoder { public: - // Create a decoder object. buffer/buffer_len is the decoded data. - // bit_width is the width of each value (before encoding). + /// Create a decoder object. buffer/buffer_len is the decoded data. + /// bit_width is the width of each value (before encoding). RleDecoder(const uint8_t* buffer, int buffer_len, int bit_width) : bit_reader_(buffer, buffer_len), bit_width_(bit_width), @@ -93,170 +94,172 @@ class RleDecoder { DCHECK_LE(bit_width_, 64); } - RleDecoder() {} + RleDecoder() : bit_width_(-1) {} + + void Reset(const uint8_t* buffer, int buffer_len, int bit_width) { + DCHECK_GE(bit_width, 0); + DCHECK_LE(bit_width, 64); + bit_reader_.Reset(buffer, buffer_len); + bit_width_ = bit_width; + current_value_ = 0; + repeat_count_ = 0; + literal_count_ = 0; + } - // Gets the next value. Returns false if there are no more. + /// Gets the next value. Returns false if there are no more. template bool Get(T* val); - private: + protected: BitReader bit_reader_; + /// Number of bits needed to encode the value. Must be between 0 and 64. int bit_width_; uint64_t current_value_; uint32_t repeat_count_; uint32_t literal_count_; + + private: + /// Fills literal_count_ and repeat_count_ with next values. Returns false if there + /// are no more. 
+ template + bool NextCounts(); }; -// Class to incrementally build the rle data. This class does not allocate any memory. -// The encoding has two modes: encoding repeated runs and literal runs. -// If the run is sufficiently short, it is more efficient to encode as a literal run. -// This class does so by buffering 8 values at a time. If they are not all the same -// they are added to the literal run. If they are the same, they are added to the -// repeated run. When we switch modes, the previous run is flushed out. +/// Class to incrementally build the rle data. This class does not allocate any memory. +/// The encoding has two modes: encoding repeated runs and literal runs. +/// If the run is sufficiently short, it is more efficient to encode as a literal run. +/// This class does so by buffering 8 values at a time. If they are not all the same +/// they are added to the literal run. If they are the same, they are added to the +/// repeated run. When we switch modes, the previous run is flushed out. class RleEncoder { public: - // buffer/buffer_len: preallocated output buffer. - // bit_width: max number of bits for value. - // TODO: consider adding a min_repeated_run_length so the caller can control - // when values should be encoded as repeated runs. Currently this is derived - // based on the bit_width, which can determine a storage optimal choice. - // TODO: allow 0 bit_width (and have dict encoder use it) + /// buffer/buffer_len: preallocated output buffer. + /// bit_width: max number of bits for value. + /// TODO: consider adding a min_repeated_run_length so the caller can control + /// when values should be encoded as repeated runs. Currently this is derived + /// based on the bit_width, which can determine a storage optimal choice. + /// TODO: allow 0 bit_width (and have dict encoder use it) RleEncoder(uint8_t* buffer, int buffer_len, int bit_width) : bit_width_(bit_width), bit_writer_(buffer, buffer_len) { - DCHECK_GE(bit_width_, 1); + DCHECK_GE(bit_width_, 0); DCHECK_LE(bit_width_, 64); max_run_byte_size_ = MinBufferSize(bit_width); DCHECK_GE(buffer_len, max_run_byte_size_) << "Input buffer not big enough."; Clear(); } - // Returns the minimum buffer size needed to use the encoder for 'bit_width' - // This is the maximum length of a single run for 'bit_width'. - // It is not valid to pass a buffer less than this length. + /// Returns the minimum buffer size needed to use the encoder for 'bit_width' + /// This is the maximum length of a single run for 'bit_width'. + /// It is not valid to pass a buffer less than this length. static int MinBufferSize(int bit_width) { - // 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. + /// 1 indicator byte and MAX_VALUES_PER_LITERAL_RUN 'bit_width' values. int max_literal_run_size = 1 + BitUtil::Ceil(MAX_VALUES_PER_LITERAL_RUN * bit_width, 8); - // Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. + /// Up to MAX_VLQ_BYTE_LEN indicator and a single 'bit_width' value. int max_repeated_run_size = BitReader::MAX_VLQ_BYTE_LEN + BitUtil::Ceil(bit_width, 8); return std::max(max_literal_run_size, max_repeated_run_size); } - // Returns the maximum byte size it could take to encode 'num_values'. + /// Returns the maximum byte size it could take to encode 'num_values'. 
static int MaxBufferSize(int bit_width, int num_values) { int bytes_per_run = BitUtil::Ceil(bit_width * MAX_VALUES_PER_LITERAL_RUN, 8.0); int num_runs = BitUtil::Ceil(num_values, MAX_VALUES_PER_LITERAL_RUN); int literal_max_size = num_runs + num_runs * bytes_per_run; - int min_run_size = MinBufferSize(bit_width); - return std::max(min_run_size, literal_max_size) + min_run_size; + return std::max(MinBufferSize(bit_width), literal_max_size); } - // Encode value. Returns true if the value fits in buffer, false otherwise. - // This value must be representable with bit_width_ bits. + /// Encode value. Returns true if the value fits in buffer, false otherwise. + /// This value must be representable with bit_width_ bits. bool Put(uint64_t value); - // Flushes any pending values to the underlying buffer. - // Returns the total number of bytes written + /// Flushes any pending values to the underlying buffer. + /// Returns the total number of bytes written int Flush(); - // Resets all the state in the encoder. + /// Resets all the state in the encoder. void Clear(); - // Returns pointer to underlying buffer + /// Returns pointer to underlying buffer uint8_t* buffer() { return bit_writer_.buffer(); } int32_t len() { return bit_writer_.bytes_written(); } private: - // Flushes any buffered values. If this is part of a repeated run, this is largely - // a no-op. - // If it is part of a literal run, this will call FlushLiteralRun, which writes - // out the buffered literal values. - // If 'done' is true, the current run would be written even if it would normally - // have been buffered more. This should only be called at the end, when the - // encoder has received all values even if it would normally continue to be - // buffered. + /// Flushes any buffered values. If this is part of a repeated run, this is largely + /// a no-op. + /// If it is part of a literal run, this will call FlushLiteralRun, which writes + /// out the buffered literal values. + /// If 'done' is true, the current run would be written even if it would normally + /// have been buffered more. This should only be called at the end, when the + /// encoder has received all values even if it would normally continue to be + /// buffered. void FlushBufferedValues(bool done); - // Flushes literal values to the underlying buffer. If update_indicator_byte, - // then the current literal run is complete and the indicator byte is updated. + /// Flushes literal values to the underlying buffer. If update_indicator_byte, + /// then the current literal run is complete and the indicator byte is updated. void FlushLiteralRun(bool update_indicator_byte); - // Flushes a repeated run to the underlying buffer. + /// Flushes a repeated run to the underlying buffer. void FlushRepeatedRun(); - // Checks and sets buffer_full_. This must be called after flushing a run to - // make sure there are enough bytes remaining to encode the next run. + /// Checks and sets buffer_full_. This must be called after flushing a run to + /// make sure there are enough bytes remaining to encode the next run. void CheckBufferFull(); - // The maximum number of values in a single literal run - // (number of groups encodable by a 1-byte indicator * 8) + /// The maximum number of values in a single literal run + /// (number of groups encodable by a 1-byte indicator * 8) static const int MAX_VALUES_PER_LITERAL_RUN = (1 << 6) * 8; - // Number of bits needed to encode the value. + /// Number of bits needed to encode the value. Must be between 0 and 64. 
const int bit_width_; - // Underlying buffer. + /// Underlying buffer. BitWriter bit_writer_; - // If true, the buffer is full and subsequent Put()'s will fail. + /// If true, the buffer is full and subsequent Put()'s will fail. bool buffer_full_; - // The maximum byte size a single run can take. + /// The maximum byte size a single run can take. int max_run_byte_size_; - // We need to buffer at most 8 values for literals. This happens when the - // bit_width is 1 (so 8 values fit in one byte). - // TODO: generalize this to other bit widths + /// We need to buffer at most 8 values for literals. This happens when the + /// bit_width is 1 (so 8 values fit in one byte). + /// TODO: generalize this to other bit widths int64_t buffered_values_[8]; - // Number of values in buffered_values_ + /// Number of values in buffered_values_ int num_buffered_values_; - // The current (also last) value that was written and the count of how - // many times in a row that value has been seen. This is maintained even - // if we are in a literal run. If the repeat_count_ get high enough, we switch - // to encoding repeated runs. + /// The current (also last) value that was written and the count of how + /// many times in a row that value has been seen. This is maintained even + /// if we are in a literal run. If the repeat_count_ get high enough, we switch + /// to encoding repeated runs. int64_t current_value_; int repeat_count_; - // Number of literals in the current run. This does not include the literals - // that might be in buffered_values_. Only after we've got a group big enough - // can we decide if they should part of the literal_count_ or repeat_count_ + /// Number of literals in the current run. This does not include the literals + /// that might be in buffered_values_. Only after we've got a group big enough + /// can we decide if they should part of the literal_count_ or repeat_count_ int literal_count_; - // Pointer to a byte in the underlying buffer that stores the indicator byte. - // This is reserved as soon as we need a literal run but the value is written - // when the literal run is complete. + /// Pointer to a byte in the underlying buffer that stores the indicator byte. + /// This is reserved as soon as we need a literal run but the value is written + /// when the literal run is complete. uint8_t* literal_indicator_byte_; }; template inline bool RleDecoder::Get(T* val) { + DCHECK_GE(bit_width_, 0); if (UNLIKELY(literal_count_ == 0 && repeat_count_ == 0)) { - // Read the next run's indicator int, it could be a literal or repeated run - // The int is encoded as a vlq-encoded value. - uint64_t indicator_value = 0; - bool result = bit_reader_.GetVlqInt(&indicator_value); - if (!result) return false; - - // lsb indicates if it is a literal run or repeated run - bool is_literal = indicator_value & 1; - if (is_literal) { - literal_count_ = (indicator_value >> 1) * 8; - } else { - repeat_count_ = indicator_value >> 1; - bool result = bit_reader_.GetAligned( - BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); - DCHECK(result); - } + if (!NextCounts()) return false; } if (LIKELY(repeat_count_ > 0)) { *val = current_value_; --repeat_count_; } else { - DCHECK(literal_count_ > 0); + DCHECK_GT(literal_count_, 0); bool result = bit_reader_.GetValue(bit_width_, val); DCHECK(result); --literal_count_; @@ -265,8 +268,29 @@ inline bool RleDecoder::Get(T* val) { return true; } -// This function buffers input values 8 at a time. 
After seeing all 8 values, -// it decides whether they should be encoded as a literal or repeated run. +template +bool RleDecoder::NextCounts() { + // Read the next run's indicator int, it could be a literal or repeated run. + // The int is encoded as a vlq-encoded value. + int32_t indicator_value = 0; + bool result = bit_reader_.GetVlqInt(&indicator_value); + if (!result) return false; + + // lsb indicates if it is a literal run or repeated run + bool is_literal = indicator_value & 1; + if (is_literal) { + literal_count_ = (indicator_value >> 1) * 8; + } else { + repeat_count_ = indicator_value >> 1; + bool result = bit_reader_.GetAligned( + BitUtil::Ceil(bit_width_, 8), reinterpret_cast(¤t_value_)); + DCHECK(result); + } + return true; +} + +/// This function buffers input values 8 at a time. After seeing all 8 values, +/// it decides whether they should be encoded as a literal or repeated run. inline bool RleEncoder::Put(uint64_t value) { DCHECK(bit_width_ == 64 || value < (1LL << bit_width_)); if (UNLIKELY(buffer_full_)) return false; @@ -341,8 +365,8 @@ inline void RleEncoder::FlushRepeatedRun() { CheckBufferFull(); } -// Flush the values that have been buffered. At this point we decide whether -// we need to switch between the run types or continue the current one. +/// Flush the values that have been buffered. At this point we decide whether +/// we need to switch between the run types or continue the current one. inline void RleEncoder::FlushBufferedValues(bool done) { if (repeat_count_ >= 8) { // Clear the buffered values. They are part of the repeated run now and we diff --git a/cpp/src/parquet/util/rle-test.cc b/cpp/src/parquet/util/rle-test.cc new file mode 100644 index 0000000000000..b2628e981d6e1 --- /dev/null +++ b/cpp/src/parquet/util/rle-test.cc @@ -0,0 +1,400 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// From Apache Impala as of 2016-01-29 + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "parquet/util/rle-encoding.h" +#include "parquet/util/bit-stream-utils.inline.h" + +using std::vector; + +namespace parquet_cpp { + +const int MAX_WIDTH = 32; + +TEST(BitArray, TestBool) { + const int len = 8; + uint8_t buffer[len]; + + BitWriter writer(buffer, len); + + // Write alternating 0's and 1's + for (int i = 0; i < 8; ++i) { + bool result = writer.PutValue(i % 2, 1); + EXPECT_TRUE(result); + } + writer.Flush(); + EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); + + // Write 00110011 + for (int i = 0; i < 8; ++i) { + bool result = false; + switch (i) { + case 0: + case 1: + case 4: + case 5: + result = writer.PutValue(false, 1); + break; + default: + result = writer.PutValue(true, 1); + break; + } + EXPECT_TRUE(result); + } + writer.Flush(); + + // Validate the exact bit value + EXPECT_EQ((int)buffer[0], BOOST_BINARY(1 0 1 0 1 0 1 0)); + EXPECT_EQ((int)buffer[1], BOOST_BINARY(1 1 0 0 1 1 0 0)); + + // Use the reader and validate + BitReader reader(buffer, len); + for (int i = 0; i < 8; ++i) { + bool val = false; + bool result = reader.GetValue(1, &val); + EXPECT_TRUE(result); + EXPECT_EQ(val, i % 2); + } + + for (int i = 0; i < 8; ++i) { + bool val = false; + bool result = reader.GetValue(1, &val); + EXPECT_TRUE(result); + switch (i) { + case 0: + case 1: + case 4: + case 5: + EXPECT_EQ(val, false); + break; + default: + EXPECT_EQ(val, true); + break; + } + } +} + +// Writes 'num_vals' values with width 'bit_width' and reads them back. +void TestBitArrayValues(int bit_width, int num_vals) { + const int len = BitUtil::Ceil(bit_width * num_vals, 8); + const uint64_t mod = bit_width == 64? 1 : 1LL << bit_width; + + uint8_t buffer[len]; + BitWriter writer(buffer, len); + for (int i = 0; i < num_vals; ++i) { + bool result = writer.PutValue(i % mod, bit_width); + EXPECT_TRUE(result); + } + writer.Flush(); + EXPECT_EQ(writer.bytes_written(), len); + + BitReader reader(buffer, len); + for (int i = 0; i < num_vals; ++i) { + int64_t val; + bool result = reader.GetValue(bit_width, &val); + EXPECT_TRUE(result); + EXPECT_EQ(val, i % mod); + } + EXPECT_EQ(reader.bytes_left(), 0); +} + +TEST(BitArray, TestValues) { + for (int width = 0; width <= MAX_WIDTH; ++width) { + TestBitArrayValues(width, 1); + TestBitArrayValues(width, 2); + // Don't write too many values + TestBitArrayValues(width, (width < 12) ? (1 << width) : 4096); + TestBitArrayValues(width, 1024); + } +} + +// Test some mixed values +TEST(BitArray, TestMixed) { + const int len = 1024; + uint8_t buffer[len]; + bool parity = true; + + BitWriter writer(buffer, len); + for (int i = 0; i < len; ++i) { + bool result; + if (i % 2 == 0) { + result = writer.PutValue(parity, 1); + parity = !parity; + } else { + result = writer.PutValue(i, 10); + } + EXPECT_TRUE(result); + } + writer.Flush(); + + parity = true; + BitReader reader(buffer, len); + for (int i = 0; i < len; ++i) { + bool result; + if (i % 2 == 0) { + bool val; + result = reader.GetValue(1, &val); + EXPECT_EQ(val, parity); + parity = !parity; + } else { + int val; + result = reader.GetValue(10, &val); + EXPECT_EQ(val, i); + } + EXPECT_TRUE(result); + } +} + +// Validates encoding of values by encoding and decoding them. If +// expected_encoding != NULL, also validates that the encoded buffer is +// exactly 'expected_encoding'. +// if expected_len is not -1, it will validate the encoded size is correct. 
+void ValidateRle(const vector<int>& values, int bit_width,
+    uint8_t* expected_encoding, int expected_len) {
+  const int len = 64 * 1024;
+  uint8_t buffer[len];
+  EXPECT_LE(expected_len, len);
+
+  RleEncoder encoder(buffer, len, bit_width);
+  for (int i = 0; i < values.size(); ++i) {
+    bool result = encoder.Put(values[i]);
+    EXPECT_TRUE(result);
+  }
+  int encoded_len = encoder.Flush();
+
+  if (expected_len != -1) {
+    EXPECT_EQ(encoded_len, expected_len);
+  }
+  if (expected_encoding != NULL) {
+    EXPECT_TRUE(memcmp(buffer, expected_encoding, expected_len) == 0);
+  }
+
+  // Verify read
+  RleDecoder decoder(buffer, len, bit_width);
+  for (int i = 0; i < values.size(); ++i) {
+    uint64_t val;
+    bool result = decoder.Get(&val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(values[i], val);
+  }
+}
+
+TEST(Rle, SpecificSequences) {
+  const int len = 1024;
+  uint8_t expected_buffer[len];
+  vector<int> values;
+
+  // Test 50 0's followed by 50 1's
+  values.resize(100);
+  for (int i = 0; i < 50; ++i) {
+    values[i] = 0;
+  }
+  for (int i = 50; i < 100; ++i) {
+    values[i] = 1;
+  }
+
+  // expected_buffer valid for bit width <= 1 byte
+  expected_buffer[0] = (50 << 1);
+  expected_buffer[1] = 0;
+  expected_buffer[2] = (50 << 1);
+  expected_buffer[3] = 1;
+  for (int width = 1; width <= 8; ++width) {
+    ValidateRle(values, width, expected_buffer, 4);
+  }
+
+  for (int width = 9; width <= MAX_WIDTH; ++width) {
+    ValidateRle(values, width, NULL, 2 * (1 + BitUtil::Ceil(width, 8)));
+  }
+
+  // Test 100 0's and 1's alternating
+  for (int i = 0; i < 100; ++i) {
+    values[i] = i % 2;
+  }
+  int num_groups = BitUtil::Ceil(100, 8);
+  expected_buffer[0] = (num_groups << 1) | 1;
+  for (int i = 1; i <= 100/8; ++i) {
+    expected_buffer[i] = BOOST_BINARY(1 0 1 0 1 0 1 0);
+  }
+  // Values for the last 4 0's and 1's. The upper 4 bits should be padded to 0.
+  expected_buffer[100/8 + 1] = BOOST_BINARY(0 0 0 0 1 0 1 0);
+
+  // num_groups and expected_buffer only valid for bit width = 1
+  ValidateRle(values, 1, expected_buffer, 1 + num_groups);
+  for (int width = 2; width <= MAX_WIDTH; ++width) {
+    int num_values = BitUtil::Ceil(100, 8) * 8;
+    ValidateRle(values, width, NULL, 1 + BitUtil::Ceil(width * num_values, 8));
+  }
+}
+
+// ValidateRle on 'num_vals' values with width 'bit_width'. If 'value' != -1, that value
+// is used; otherwise alternating values are used.
+void TestRleValues(int bit_width, int num_vals, int value = -1) {
+  const uint64_t mod = (bit_width == 64) ? 1 : 1LL << bit_width;
+  vector<int> values;
+  for (int v = 0; v < num_vals; ++v) {
+    values.push_back((value != -1) ? value : (v % mod));
+  }
+  ValidateRle(values, bit_width, NULL, -1);
+}
+
+TEST(Rle, TestValues) {
+  for (int width = 1; width <= MAX_WIDTH; ++width) {
+    TestRleValues(width, 1);
+    TestRleValues(width, 1024);
+    TestRleValues(width, 1024, 0);
+    TestRleValues(width, 1024, 1);
+  }
+}
+
+TEST(Rle, BitWidthZeroRepeated) {
+  uint8_t buffer[1];
+  const int num_values = 15;
+  buffer[0] = num_values << 1; // repeated indicator byte
+  RleDecoder decoder(buffer, sizeof(buffer), 0);
+  uint8_t val;
+  for (int i = 0; i < num_values; ++i) {
+    bool result = decoder.Get(&val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, 0); // can only encode 0s with bit width 0
+  }
+  EXPECT_FALSE(decoder.Get(&val));
+}
+
+TEST(Rle, BitWidthZeroLiteral) {
+  uint8_t buffer[1];
+  const int num_groups = 4;
+  buffer[0] = num_groups << 1 | 1; // literal indicator byte
+  RleDecoder decoder = RleDecoder(buffer, sizeof(buffer), 0);
+  const int num_values = num_groups * 8;
+  uint8_t val;
+  for (int i = 0; i < num_values; ++i) {
+    bool result = decoder.Get(&val);
+    EXPECT_TRUE(result);
+    EXPECT_EQ(val, 0); // can only encode 0s with bit width 0
+  }
+  EXPECT_FALSE(decoder.Get(&val));
+}
+
+// Test that writes out a repeated group and then a literal
+// group, flushing before finishing.
+TEST(BitRle, Flush) {
+  vector<int> values;
+  for (int i = 0; i < 16; ++i) values.push_back(1);
+  values.push_back(0);
+  ValidateRle(values, 1, NULL, -1);
+  values.push_back(1);
+  ValidateRle(values, 1, NULL, -1);
+  values.push_back(1);
+  ValidateRle(values, 1, NULL, -1);
+  values.push_back(1);
+  ValidateRle(values, 1, NULL, -1);
+}
+
+// Test some random sequences.
+TEST(BitRle, Random) {
+  int iters = 0;
+  while (iters < 1000) {
+    srand(iters++);
+    if (iters % 10000 == 0) LOG(ERROR) << "Seed: " << iters;
+    vector<int> values;
+    bool parity = 0;
+    for (int i = 0; i < 1000; ++i) {
+      int group_size = rand() % 20 + 1; // NOLINT
+      if (group_size > 16) {
+        group_size = 1;
+      }
+      for (int i = 0; i < group_size; ++i) {
+        values.push_back(parity);
+      }
+      parity = !parity;
+    }
+    ValidateRle(values, (iters % MAX_WIDTH) + 1, NULL, -1);
+  }
+}
+
+// Test a sequence of 1 0, 2 1's, 3 0's, etc.
+// e.g. 011000111100000
+TEST(BitRle, RepeatedPattern) {
+  vector<int> values;
+  const int min_run = 1;
+  const int max_run = 32;
+
+  for (int i = min_run; i <= max_run; ++i) {
+    int v = i % 2;
+    for (int j = 0; j < i; ++j) {
+      values.push_back(v);
+    }
+  }
+
+  // And go back down again
+  for (int i = max_run; i >= min_run; --i) {
+    int v = i % 2;
+    for (int j = 0; j < i; ++j) {
+      values.push_back(v);
+    }
+  }
+
+  ValidateRle(values, 1, NULL, -1);
+}
+
+TEST(BitRle, Overflow) {
+  for (int bit_width = 1; bit_width < 32; bit_width += 3) {
+    const int len = RleEncoder::MinBufferSize(bit_width);
+    uint8_t buffer[len];
+    int num_added = 0;
+    bool parity = true;
+
+    RleEncoder encoder(buffer, len, bit_width);
+    // Insert alternating true/false until there is no space left
+    while (true) {
+      bool result = encoder.Put(parity);
+      parity = !parity;
+      if (!result) break;
+      ++num_added;
+    }
+
+    int bytes_written = encoder.Flush();
+    EXPECT_LE(bytes_written, len);
+    EXPECT_GT(num_added, 0);
+
+    RleDecoder decoder(buffer, bytes_written, bit_width);
+    parity = true;
+    uint32_t v;
+    for (int i = 0; i < num_added; ++i) {
+      bool result = decoder.Get(&v);
+      EXPECT_TRUE(result);
+      EXPECT_EQ(v, parity);
+      parity = !parity;
+    }
+    // Make sure we get false when reading past the end a couple times.
+ EXPECT_FALSE(decoder.Get(&v)); + EXPECT_FALSE(decoder.Get(&v)); + } +} + +} // namespace parquet_cpp diff --git a/cpp/src/parquet/util/sse-util.h b/cpp/src/parquet/util/sse-util.h new file mode 100644 index 0000000000000..588c30a07f238 --- /dev/null +++ b/cpp/src/parquet/util/sse-util.h @@ -0,0 +1,191 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// From Apache Impala as of 2016-01-29. Pared down to a minimal set of +// functions needed for parquet-cpp + +#ifndef PARQUET_UTIL_SSE_UTIL_H +#define PARQUET_UTIL_SSE_UTIL_H + +#include + +namespace parquet_cpp { + +/// This class contains constants useful for text processing with SSE4.2 intrinsics. +namespace SSEUtil { + /// Number of characters that fit in 64/128 bit register. SSE provides instructions + /// for loading 64 or 128 bits into a register at a time. + static const int CHARS_PER_64_BIT_REGISTER = 8; + static const int CHARS_PER_128_BIT_REGISTER = 16; + + /// SSE4.2 adds instructions for text processing. The instructions have a control + /// byte that determines some of functionality of the instruction. (Equivalent to + /// GCC's _SIDD_CMP_EQUAL_ANY, etc). + static const int PCMPSTR_EQUAL_ANY = 0x00; // strchr + static const int PCMPSTR_EQUAL_EACH = 0x08; // strcmp + static const int PCMPSTR_UBYTE_OPS = 0x00; // unsigned char (8-bits, rather than 16) + static const int PCMPSTR_NEG_POLARITY = 0x10; // see Intel SDM chapter 4.1.4. + + /// In this mode, SSE text processing functions will return a mask of all the + /// characters that matched. + static const int STRCHR_MODE = PCMPSTR_EQUAL_ANY | PCMPSTR_UBYTE_OPS; + + /// In this mode, SSE text processing functions will return the number of + /// bytes that match consecutively from the beginning. + static const int STRCMP_MODE = PCMPSTR_EQUAL_EACH | PCMPSTR_UBYTE_OPS | + PCMPSTR_NEG_POLARITY; + + /// Precomputed mask values up to 16 bits. + static const int SSE_BITMASK[CHARS_PER_128_BIT_REGISTER] = { + 1 << 0, + 1 << 1, + 1 << 2, + 1 << 3, + 1 << 4, + 1 << 5, + 1 << 6, + 1 << 7, + 1 << 8, + 1 << 9, + 1 << 10, + 1 << 11, + 1 << 12, + 1 << 13, + 1 << 14, + 1 << 15, + }; +} // namespace SSEUtil + +/// Define the SSE 4.2 intrinsics. The caller must first verify at runtime (or codegen +/// IR load time) that the processor supports SSE 4.2 before calling these. These are +/// defined outside the namespace because the IR w/ SSE 4.2 case needs to use macros. +#ifndef IR_COMPILE +/// When compiling to native code (i.e. not IR), we cannot use the -msse4.2 compiler +/// flag. Otherwise, the compiler will emit SSE 4.2 instructions outside of the runtime +/// SSE 4.2 checks and Impala will crash on CPUs that don't support SSE 4.2 +/// (IMPALA-1399/1646). 
The compiler intrinsics cannot be used without -msse4.2, so we
+/// define our own implementations of the intrinsics instead.
+
+#if defined(__SSE4_1__) || defined(__POPCNT__)
+/// Impala native code should not be compiled with -msse4.1 or higher until the minimum
+/// CPU requirement is raised to at least the targeted instruction set.
+#error "Do not compile with -msse4.1 or higher."
+#endif
+
+/// The PCMPxSTRy instructions require that the control byte 'mode' be encoded as an
+/// immediate. So, those need to be always inlined in order to always propagate the
+/// mode constant into the inline asm.
+#define SSE_ALWAYS_INLINE inline __attribute__ ((__always_inline__))
+
+template<int MODE>
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+  /// Use asm reg rather than Yz output constraint to work around LLVM bug 13199 -
+  /// clang doesn't support Y-prefixed asm constraints.
+  register volatile __m128i result asm("xmm0");
+  __asm__ volatile ("pcmpestrm %5, %2, %1"
+      : "=x"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
+  return result;
+}
+
+template<int MODE>
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  int result;
+  __asm__("pcmpestri %5, %2, %1"
+      : "=c"(result) : "x"(str1), "xm"(str2), "a"(len1), "d"(len2), "i"(MODE) : "cc");
+  return result;
+}
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
+  __asm__("crc32b %1, %0" : "+r"(crc) : "rm"(v));
+  return crc;
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
+  __asm__("crc32l %1, %0" : "+r"(crc) : "rm"(v));
+  return crc;
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
+  int64_t result;
+  __asm__("popcntq %1, %0" : "=r"(result) : "mr"(a) : "cc");
+  return result;
+}
+
+#undef SSE_ALWAYS_INLINE
+
+#elif defined(__SSE4_2__) // IR_COMPILE for SSE 4.2.
+/// When cross-compiling to IR, we cannot use inline asm because LLVM JIT does not
+/// support it. However, the cross-compiled IR is compiled twice: with and without
+/// -msse4.2. When -msse4.2 is enabled in the cross-compile, we can just use the
+/// compiler intrinsics.
+
+#include
+
+template<int MODE>
+static inline __m128i SSE4_cmpestrm(
+    __m128i str1, int len1, __m128i str2, int len2) {
+  return _mm_cmpestrm(str1, len1, str2, len2, MODE);
+}
+
+template<int MODE>
+static inline int SSE4_cmpestri(
+    __m128i str1, int len1, __m128i str2, int len2) {
+  return _mm_cmpestri(str1, len1, str2, len2, MODE);
+}
+
+#define SSE4_crc32_u8 _mm_crc32_u8
+#define SSE4_crc32_u32 _mm_crc32_u32
+#define POPCNT_popcnt_u64 _mm_popcnt_u64
+
+#else // IR_COMPILE without SSE 4.2.
+/// When cross-compiling to IR without SSE 4.2 support (i.e. no -msse4.2), we cannot use
+/// SSE 4.2 instructions. Otherwise, the IR loading will fail on CPUs that don't
+/// support SSE 4.2. However, because the caller isn't allowed to call these routines
+/// on CPUs that lack SSE 4.2 anyway, we can implement stubs for this case.
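Before the stubs that follow, it is worth showing how callers are expected to guard these routines at runtime. This patch also adds cpu-info.h/cpu-info.cc, and a caller-side sketch might look like the code below. The CpuInfo::IsSupported(CpuInfo::SSE4_2) call and the need to run CpuInfo::Init() beforehand are assumptions about that header, and the scalar fallback is illustrative only (it is not bit-compatible with CRC32).

#include <cstdint>

#include "parquet/util/cpu-info.h"  // added by this patch; exact API assumed
#include "parquet/util/sse-util.h"

// Hypothetical caller: use hardware CRC32 only when the CPU supports SSE 4.2.
inline uint32_t HashWord(uint32_t seed, uint32_t word) {
  // Assumes CpuInfo::Init() has already been called at startup.
  if (parquet_cpp::CpuInfo::IsSupported(parquet_cpp::CpuInfo::SSE4_2)) {
    return parquet_cpp::SSE4_crc32_u32(seed, word);
  }
  // Portable fallback, for illustration only (different hash than the SSE path).
  uint32_t h = seed ^ word;
  h ^= h >> 16;
  h *= 0x85ebca6bU;
  h ^= h >> 13;
  return h;
}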
+
+template<int MODE>
+static inline __m128i SSE4_cmpestrm(__m128i str1, int len1, __m128i str2, int len2) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return (__m128i) { 0 }; // NOLINT
+}
+
+template<int MODE>
+static inline int SSE4_cmpestri(__m128i str1, int len1, __m128i str2, int len2) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u8(uint32_t crc, uint8_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline uint32_t SSE4_crc32_u32(uint32_t crc, uint32_t v) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+static inline int64_t POPCNT_popcnt_u64(uint64_t a) {
+  DCHECK(false) << "CPU doesn't support SSE 4.2";
+  return 0;
+}
+
+#endif
+
+} // namespace parquet_cpp
+
+#endif // PARQUET_UTIL_SSE_UTIL_H
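Finally, for anyone exercising the new RLE utilities outside the unit tests, here is a minimal round-trip sketch that uses only the API surface shown in rle-test.cc (the constructor signatures, Put, Flush, and Get). The 4 KB scratch buffer is a simplification of the 64 KB buffer ValidateRle uses, and it assumes the parquet_cpp headers are on the include path; error handling via assert is for illustration only.

#include <cassert>
#include <cstdint>

#include "parquet/util/rle-encoding.h"

using parquet_cpp::RleDecoder;
using parquet_cpp::RleEncoder;

int main() {
  // Generous scratch buffer, mirroring what ValidateRle in rle-test.cc does.
  const int kBufferLen = 4096;
  uint8_t buffer[kBufferLen];
  const int bit_width = 3;

  // Encode 100 values that fit in bit_width bits.
  RleEncoder encoder(buffer, kBufferLen, bit_width);
  for (int i = 0; i < 100; ++i) {
    bool ok = encoder.Put(i % 8);
    assert(ok);
  }
  int encoded_len = encoder.Flush();

  // Decode them back and check the round trip.
  RleDecoder decoder(buffer, encoded_len, bit_width);
  for (int i = 0; i < 100; ++i) {
    uint64_t value = 0;
    bool ok = decoder.Get(&value);
    assert(ok && value == static_cast<uint64_t>(i % 8));
  }
  return 0;
}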