diff --git a/CMakeLists.txt b/CMakeLists.txt
index 93731f8b3..bcba35bbc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,7 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
   endif()
   add_compile_options(-fimplicit-none)
   add_compile_options(-ffree-line-length-132)
+  add_compile_options(-fno-range-check)
   add_compile_options(-Wall)
   add_compile_options(-Wextra)
   add_compile_options(-Wimplicit-procedure)
diff --git a/Makefile.manual b/Makefile.manual
index b7af735b7..54dc3b89f 100644
--- a/Makefile.manual
+++ b/Makefile.manual
@@ -1,7 +1,7 @@
 # Fortran stdlib Makefile
 
 FC ?= gfortran
-FFLAGS ?= -Wall -Wextra -Wimplicit-interface -fPIC -g -fcheck=all
+FFLAGS ?= -Wall -Wextra -Wimplicit-interface -fPIC -g -fcheck=all -fno-range-check
 FYPPFLAGS ?=
 
 export FC
diff --git a/doc/specs/index.md b/doc/specs/index.md
index a3b0a5def..0d1afbffa 100644
--- a/doc/specs/index.md
+++ b/doc/specs/index.md
@@ -14,6 +14,8 @@ This is and index/directory of the specifications (specs) for each new module/fe
  - [ascii](./stdlib_ascii.html) - Procedures for handling ASCII characters
  - [bitsets](./stdlib_bitsets.html) - Bitset data types and procedures
  - [error](./stdlib_error.html) - Catching and handling errors
+ - [hash\_functions](./stdlib_hash_functions.html) - Hashing integer
+   vectors or character strings
  - [IO](./stdlib_io.html) - Input/output helper & convenience
  - [kinds](./stdlib_kinds.html) - Kind parameters
  - [linalg](./stdlib_linalg.html) - Linear Algebra
diff --git a/doc/specs/stdlib_hash_functions.md b/doc/specs/stdlib_hash_functions.md
new file mode 100644
index 000000000..f84f60623
--- /dev/null
+++ b/doc/specs/stdlib_hash_functions.md
@@ -0,0 +1,1697 @@
+---
+title: Hash codes
+---
+
+# The `stdlib_32_bit_hash_functions` and `stdlib_64_bit_hash_functions` modules
+
+[TOC]
+
+## Overview of hash functions
+
+The comparison of lexical entities or other objects for equality
+can be computationally expensive.
+This cost is often reduced by computing a near unique integer value,
+termed a hash code, from the structure of the object, termed a key,
+using a procedure, termed a hash function.
+Equality of hash codes is a necessary, but not sufficient, condition
+for the original objects to be equal. 
+As integer comparisons are very efficient, performing an initial
+comparison of hash codes and then performing a detailed comparison
+only if the hash codes are equal can improve performance.
+The hash codes, in turn, can be mapped to a smaller set of integers,
+that can be used as an index, termed a hash index, to a rank one
+array, often termed a hash table.
+This mapping will be known as a scalar hash.
+The use of a hash table reduces the number of hash codes that need to
+be compared, further improving performance.
+A hash function can also be used to generate a checksum to verify that
+data has not changed.
+The Fortran Standard Library therefore provides procedures to compute
+hash codes and scalar hashes, and derived types implementing hash
+tables.
+This document only discusses the hash codes and scalar hashes in the
+library.
+
+## Licensing
+
+The Fortran Standard Library is distributed under the MIT License.
+However components of the library may be based on code with additional
+licensing restrictions. In particular, the hash codes are often based
+on algorithms with additional restrictions on distribution.
+The algorithms with such restrictions (`Fibonacci Hash`, `Universal
+Multiplicative Hash`,
+`FNV-1 Hash`, `FNV-1A Hash`, `nmhash32`, `nmhash32x`, `waterhash`,
+`pengyhash` and `SpookyHash`) are discussed below.
+
+`FIBONACCI_HASH` is a scalar hash. It is an implementation in Fortran
+2008 and signed two's complement integers of the Fibonacci Hash
+described in D. E. Knuth, "The Art of
+Computer Programming, Second Edition, Volume 3, Sorting and
+Searching", Addison-Wesley, Upper Saddle River, NJ,
+pp. 517-518, 1998. The algorithms in that source are considered public
+domain.
+
+`UNIVERSAL_MULT_HASH` is a scalar hash. It is an implementation in
+Fortran 2008 and signed two's complement integers of the
+universal multiplicative hash algorithm of M. Dietzfelbinger,
+T. Hagerup, J. Katajainen, and M. Penttonen, "A Reliable Randomized
+Algorithm for the Closest-Pair Problem," J. Algorithms, Vol. 25,
+No. 1, Oct. 1997, pp. 19-51. Because of its publication in the Journal
+of Algorithms, the universal multiplicative hash algorithm is public
+domain.
+
+`FNV_1_HASH` and `FNV_1A_HASH` are translations to Fortran 2008 and
+signed two's complement integers of the
+`FNV-1` and `FNV-1a` hash functions of Glenn Fowler, Landon Curt Noll,
+and Phong Vo, that have been released into the public
+domain. Permission has been granted, by Landon Curt Noll, for the use
+of these algorithms in the Fortran Standard Library. A description of
+these functions is available at
+<https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function>.
+These functions have been modified from their normal forms to also
+encode the structure size in the output hash.
+
+Similarly `SPOOKY_HASH` and associated procedures are translations to
+Fortran 2008 and signed two's complement integers of the unsigned 64
+bit version 2 `SpookyHash` functions of Bob
+Jenkins <https://burtleburtle.net/bob/hash/spooky.html> to signed 64
+bit operations. Version 2 was chosen over version 1 as it has better
+performance and fewer bad seeds.
+Bob Jenkins has also put this code in the public
+domain and has given permission to treat this code as public domain in
+the USA, provided the code can be used under other licenses and he is
+given appropriate credit.
+
+`NMHASH32` and `NMHASH32x` are translations to Fortran 2008 and signed
+two's complement integers of the unsigned 32 bit
+hashes of James Z. M. Gao's `nmhash32` and `nmhash32x`, version 0.2,
+<https://github.com/gzm55/hash-garage/blob/a8913138bdb3b7539c202edee30a7f0794bbd835/nmhash.h>.
+James Z. M. Gao has released his code under the BSD 2 Clause
+License. The BSD 2-Clause license is as follows:
+
+    BSD 2-Clause License
+
+    Copyright (c) 2021, James Z.M. Gao
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions are met:
+
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above copyright notice,
+       this list of conditions and the following disclaimer in the documentation
+       and/or other materials provided with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+    POSSIBILITY OF SUCH DAMAGE.
+
+`WATER_HASH` is a translation to Fortran 2008  and signed two's
+complement integers of the `waterhash` algorithm
+of Tommy Ettinger. This algorithm is inspired by the Wy Hash of
+Wang Yi. Tommy Ettinger's original C++ code, `waterhash.h`,
+is available at URL: <https://github.com/tommyettinger/waterhash> under
+the `unlicense`,
+<https://github.com/tommyettinger/waterhash/blob/master/LICENSE>.
+The `unlicense` reads as follows:
+
+    This is free and unencumbered software released into the public domain.
+    Anyone is free to copy, modify, publish, use, compile, sell, or
+    distribute this software, either in source code form or as a compiled
+    binary, for any purpose, commercial or non-commercial, and by any
+    means.
+
+    In jurisdictions that recognize copyright laws, the author or authors
+    of this software dedicate any and all copyright interest in the
+    software to the public domain. We make this dedication for the benefit
+    of the public at large and to the detriment of our heirs and
+    successors. We intend this dedication to be an overt act of
+    relinquishment in perpetuity of all present and future rights to this
+    software under copyright law.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+    IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+    OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+    ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+    OTHER DEALINGS IN THE SOFTWARE.
+
+    For more information, please refer to <http://unlicense.org>
+
+`PENGY_HASH` is a translation to Fortran 2008 and signed two's
+complement arithmetic of the `pengyhash` algorithm of Alberto Fajardo,
+copyright 2020. Alberto Fajardo's original C code, `pengyhash.c`, is
+available at the URL:
+https://github.com/tinypeng/pengyhash/blob/master/pengyhash.c
+under the BSD 2-Clause License:
+https://github.com/tinypeng/pengyhash/blob/master/LICENSE
+
+The BSD 2-Clause license is as follows:
+
+    BSD 2-Clause License
+
+    pengyhash
+    Copyright (c) 2020 Alberto Fajardo
+    All rights reserved.
+
+    Redistribution and use in source and binary forms, with or without
+    modification, are permitted provided that the following conditions
+    are met:
+
+    1. Redistributions of source code must retain the above copyright
+    notice, this list of conditions and the following disclaimer.
+
+    2. Redistributions in binary form must reproduce the above
+    copyright notice, this list of conditions and the following
+    disclaimer in the documentation and/or other materials provided
+    with the distribution.
+
+    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+	CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+	INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+	MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+	DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+	BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+	EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+	TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+	DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+	ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+	TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+	THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+	SUCH DAMAGE.
+
+
+## The hash codes modules
+
+### Overview of the modules
+
+The Standard Library provides two modules implementing hash
+functions and scalar hashes.
+The `stdlib_32_bit_hash_functions` module provides procedures to
+compute 32 bit integer hash codes and a scalar hash. 
+The 32 bit hash codes are useful for tables of up to `2**16` entries,
+and for keys with a few hundred elements.
+The `stdlib_64_bit_hash_functions` module provides hash procedures to
+compute 64 bit integer hash codes and a scalar hash.
+The 64 bit hash codes are useful for tables of up to `2**30` entries,
+and for keys with thousands of elements.
+While one of the codes in `stdlib_64_bit_hash_functions`,
+`SPOOKY_HASH`, can also be used to calculate 128 bit hash codes, none
+of the current codes can be used to calculate 256 bit hash codes.
+Such larger hash codes are useful for larger hash tables and keys, and
+for checksums.
+Such larger keys and tables are little used, if used at all, in
+current
+Fortran codes, but the larger hash codes may be added to the library
+if there is a demand for them.
+
+Hash functions are often divided into two categories:
+"cryptographic" and "non-cryptographic". 
+Cryptographic hash functions produce codes that are infeasible to
+reverse without additional information beyond the identity of
+the hash function used to generate the code and the resulting codes.
+Non-cryptographic codes, in some circumstances, are believed to be
+reversible.
+The modules only implement hash
+functions that are believed to be non-cryptographic, with
+implementations available in the public domain.
+
+There are a number of algorithms available for the computation of
+non-cryptographic 32 and 64 bit hash codes that differ in their
+computational complexity,
+their relative performance on different size keys, and the
+expected uniqueness (randomness) of the resulting hash codes.
+Their relative performance in the analysis of text, in particular,
+can depend on the processor, character set, language, and content.
+The quality of a hash function is often evaluated using
+the SMHasher test suite, originally written by
+[Austin Appleby](https://github.com/aappleby/smhasher), but greatly
+extended by [Reini Urban](https://github.com/rurban/smhasher).
+All except the simplest, `FNV_1` and `FNV_1A`, of the hash functions
+defined in the modules perform well on the tests in Reini Urban's
+version of SMHasher.
+
+There are two problems in implementing hash functions in Fortran.
+First, the static typing of Fortran makes it awkward to define general
+purpose hash functions.
+Instead hash functions are defined for some of the more common objects
+that are sufficiently complicated that a direct comparison is costly
+and common enough that a general procedure is useful:
+character strings and rank one arrays of integers.
+Other objects can, in principle, be hashed by using `transfer` to
+map their contents to an integer array, typically one of  kind `INT8`.
+The other problem is that hash codes are typically defined using
+modular unsigned integer arithmetic.
+As such integers are not part of the current Fortran standard,
+workarounds have to be used.
+These can take two forms.
+In one, the operations are emulated by using an integer of a
+larger size, or, for the larger integers, by dividing the integer into
+two lower and higher order halves, 
+and performing the operations on each half separately using 
+the larger integers.
+In the other, the unsigned integers are replaced directly by
+the corresponding signed integers, with the code logic
+otherwise left unmodified.
+The first should be standard conforming on current processors, but
+is more computationally intensive unless the processors recognize 
+underlying idioms that are rarely used in Fortran codes. The second is
+not standard conforming as bit operations involving the sign are
+undefined,
+but should yield equivalent results with fewer operations on
+processors with two's complement integers that do not trap on over
+or under flow. The codes currently use the second method.
+
+In order to compile the hash function modules, the processors must
+implement much of Fortran 2003, and selected components of Fortran
+2008: submodules, 64 bit integers, and some bit intrinsics.
+The main limitation on valid processors is whether they
+implement the submodules enhancement of Fortran 2008.
+In order to properly run the hash functions, the compilers must
+use two's complement integers, and be able to execute them with
+wraparound semantics and no integer overflow exceptions.
+Current Fortran 2003+ processors solely use two's complement
+integers, and appear to be able to turn off overflow detection,
+so the modules use signed integer arithmetic. For that reason
+trapping on signed arithmetic must be disabled. The command line
+flags to disable overflow detection for processors implementing
+submodules are summarized in the table below.
+Note that FLANG, gfortran, ifort, and NAG all default to
+integer overflow wrapping.
+
+|Processor|Legal flag|Illegal flag|Default|
+|---------|----------|------------|-------|
+| ARM Fortran | NA? | NA? | overflow wrapping? |
+| Cray Fortran | NA? | NA? | overflow wrapping? |
+| FLANG/PGI | -fwrapv | -ftrapv | -fwrapv |
+| gfortran | -fwrapv | -ftrapv | -fwrapv |
+| IBM Fortran | NA? | NA? | overflow wrapping? |
+| ifort| NA? | NA? | overflow wrapping |
+| NAG Fortran | -C=none | -C=intovf | -C=none |
+| NEC Fortran | NA? | NA? | overflow wrapping? |
+| NVIDIA Fortran | NA? | NA? | overflow wrapping? |
+
+All of the modules' hash functions take one or two arguments.
+All of them have as their first argument the object to be hashed,
+termed a *key*.
+Most have a second argument, termed a *seed*, that sets the initial
+value of the hash code changing the hash function behavior.
+In particular, inputs that hash to the same hash index with a given
+seed, will often hash to different indexes with a different seed.
+This difference in behavior makes algorithms that use a seed much
+more resistant to denial of service attacks that use the properties
+of a known hash to increase the number of hash table collisions.
+This additional integer must be kept the same for all hashes
+in a given hash table, but can be changed and the objects rehashed
+if collisions are unusually common.
+The *seed* can be either a scalar or a two element array.
+Some of the hash functions have alternatives that allow incremental
+hashing. 
+
+|Algorithm|Seed|Result|
+|---------|----|------|
+|FNV-1|None|32 or 64 bit integer|
+|FNV-1a|None|32 or 64 bit integer|
+|nmhash32 |32 bit scalar integer|32 bit integer|
+|nmhash32x |32 bit scalar integer|32 bit integer|
+|pengyhash |32 bit scalar integer|64 bit integer|
+|Spooky Hash|64 bit two element vector|64 bit two element vector|
+|waterhash|64 bit scalar integer|32 bit integer|
+
+The hash function modules each provide at least four algorithms for
+hash functions: two optimized for small (< 32 `INT8` integer elements)
+keys, and two or three optimized for large (> 100 `INT8` integer elements)
+keys.
+The core implementation for each algorithm is for keys that are
+vectors of `INT8` integers.
+These core implementations are then used in wrappers for keys
+that are vectors of `INT16`, `INT32` and `INT64` integers, or default
+character strings, in the expectation that inlining will eliminate the
+overhead of transferring the other keys to `INT8` integer vectors.
+
+The `stdlib_32_bit_hash_functions` module provides
+implementations of five hash code algorithms:
+the *FNV_1* and *FNV_1A* variants of Glenn Fowler, 
+Landon Curt Noll, and Kiem-Phong Vo;
+the *nmhash32* and *nmhash32x* of James Z. M. Gao;
+and the *waterhash*  of Tommy Ettinger.
+The detailed implementation of each algorithm is handled in a separate
+submodule: `stdlib_32_bit_fnv_hashes`,
+`stdlib_32_bit_nmhashes`, and `stdlib_32_bit_water_hashes`,
+respectively. The `nmhash32`, `nmhash32x`, and `waterhash` algorithms
+require seeds. The submodules provide separate seed generators
+for each algorithm.
+The module itself
+implements two scalar hash functions, `FIBONACCI_HASH` and
+`UNIVERSAL_MULT_HASH`. 
+It also implements the subroutine, `ODD_RANDOM_INTEGER`, for
+generating seeds for `UNIVERSAL_MULT_HASH`.
+All assume a two's complement sign bit, and no out of
+range checks.
+
+The `stdlib_64_bit_hash_functions` module also provides
+implementations of four hash code algorithms:
+the *FNV_1* and *FNV_1A* variants of Glenn Fowler, 
+Landon Curt Noll, and Kiem-Phong Vo;
+the *pengyhash* of Alberto Fajardo;
+and the *SpookyHash*  of Bob Jenkins.
+The detailed implementation of each algorithm is handled in a separate
+submodule: `stdlib_64_bit_fnv_hashes`,
+`stdlib_64_bit_pengy_hashes`, and `stdlib_64_bit_spooky_hashes`,
+respectively.
+The `pengyhash`, and `Spooky Hash` algorithms
+require seeds. The submodules provide separate seed generators
+for each algorithm.
+The module itself implements two scalar hash functions,
+`FIBONACCI_HASH` and `UNIVERSAL_MULT_HASH`.
+It also implements the subroutine, `ODD_RANDOM_INTEGER`, for
+generating seeds for `UNIVERSAL_MULT_HASH`.
+All assume a two's complement sign bit, and no out of
+range checks.
+
+The `stdlib_32_bit_fnv_hashes` and `stdlib_64_bit_fnv_hashes`
+submodules each provide implementations of the FNV-1 and FNV-1A
+algorithms in the form of two separate overloaded functions: `FNV_1`
+and `FNV_1A`.
+The FNV-1 and FNV-1A algorithms differ in their order of the
+multiplication and exclusive or operations.
+They differ from their normal implementation in that they also
+encode the structure size in  the hash code.
+The 32 and 64 bit algorithms differ in their initial offsets and in
+their multiplicative constants.
+Analysis suggests that `FNV_1A` should be better at randomizing the
+input, but tests with hash tables show negligible difference.
+These algorithms have the reputation of being particularly useful for
+small byte strings, i.e., strings of less than 32 bytes.
+While they do not at all perform well on the SMHasher test suite,
+usage indicates that this has little impact on the
+performance of small hash tables, and the small size of the functions
+allows their quick loading and retention in the instruction cache,
+giving a performance boost where the hashing is intermittent.
+(See the
+[SMHasher discussion](https://github.com/rurban/smhasher/README.md)
+and S. Richter, V. Alvarez, and J. Dittrich,
+["A Seven-Dimensional Analysis of Hashing Methods and its Implications on Query Processing"](https://bigdata.uni-saarland.de/publications/p249-richter.pdf).)
+
+The `stdlib_32_bit_nmhashes` submodule provides implementations
+of James Z.M. Gao's `nmhash32` and `nmhash32x` algorithms,
+version 0.2, 
+in the form of the overloaded functions, `NMHASH32` and `NMHASH32X`.
+The implementations are based on the scalar versions of Gao's
+algorithms and not the vector versions that require access to
+the vector instructions of some processors.
+Both algorithms perform well on the SMHasher tests, and have no known
+bad seeds. The vector versions of both codes perform well on large
+keys, with the `nmhash32x` faster on short keys. To provide randomly
+generated seeds for the two functions the submodule also defines the
+subroutines `NEW_NMHASH32_SEED` and `NEW_NMHASH32X_SEED`. Gao claims
+that `NMHASH32X` is significantly faster than `NMHASH32` on short
+keys, but slower on long keys, but our limited testing so far shows
+`NMHASH32X` to be significantly faster on short keys and slightly
+faster on long keys.
+
+The `stdlib_32_bit_water_hashes` submodule provides implementations
+of Tommy Ettinger's `waterhash` algorithm in the form of the overloaded
+function, `WATER_HASH`. Water Hash has not been tested by Reini Urban,
+but Tommy Ettinger has tested it with Urban's SMHasher and presents
+results that show Water Hash passing all the tests. So far his
+testing hasn't found any bad seeds for the algorithm. To provide
+randomly generated seeds for the hash function the submodule also
+defines the subroutine `NEW_WATER_HASH_SEED`.
+
+The `stdlib_64_bit_pengy_hashes` submodule provides implementations of
+Alberto Fajardo's `pengyhash` in the form of the overloaded function,
+`PENGY_HASH`. Reini Urban's testing shows that PengyHash passes all
+the tests and has no bad seeds.  To provide randomly generated seeds
+for the hash function the submodule also defines the subroutine
+`NEW_PENGY_HASH_SEED`. 
+
+The `stdlib_64_bit_spooky_hashes` submodule provides implementations
+of Bob Jenkins' SpookyHash in the form of the overloaded function,
+`SPOOKY_HASH`. Future implementations may provide the SpookyHash
+incremental hashing procedures.
+SpookyHash is optimized for large objects and should give excellent 
+performance for objects greater than about 96 bytes, but has
+significant overhead for smaller objects.
+The code was designed for Little Endian processors, and will give
+different results on Big Endian processors, but the hash quality on
+those processors is probably just as good.
+SpookyHash version 2 passes all of Reini Urban's SMHasher tests, and
+has one bad seed only when reduced to a 32 bit output.
+Its only potential problem is undefined behavior if the key is
+misaligned.
+
+## The `stdlib_32_bit_hash_codes` module
+
+### Overview of the module
+
+Thirty two bit hash functions are primarily useful for generating hash
+codes for hash tables.
+Checksums generally benefit from having a larger number of bits.
+The `stdlib_32_bit_hash_codes` module defines five public overloaded
+32 bit hash code functions, `FNV_1`, `FNV_1A`, `NMHASH32`, `NMHASH32X`
+and `WATER_HASH`, two scalar hash functions, `FIBONACCI_HASH` and
+`UNIVERSAL_MULT_HASH`, four seed generators, `ODD_RANDOM_INTEGER` for
+`UNIVERSAL_MULT_HASH`, and `NEW_NMHASH32_SEED`, `NEW_NMHASH32X_SEED`,
+and `NEW_WATER_HASH_SEED`, for their respective hash code
+functions. It also defines the integer kind constant, `INT_HASH`, and
+a logical constant, `LITTLE_ENDIAN`, used to deal with one aspect of
+the machine dependence of the hash codes.
+
+### The `INT_HASH` parameter
+
+It is necessary to define the kind of integer used to return the hash
+code.
+As `stdlib_32_bit_hash_codes` deals exclusively with 32 bit hash codes,
+`INT_HASH` is an alias for the integer kind `INT32`.
+
+### The `LITTLE_ENDIAN` parameter
+
+In implementing hash functions it is sometimes necessary to know the
+"endianness" of the processor's integers. To this end the
+`stdlib_32_bit_hash_codes` module defines the logical parameter
+`LITTLE_ENDIAN` that, if true, indicates that the processor has little
+endian integers, and that if false indicates that the integers are big
+endian.
+
+### Specifications of the `stdlib_32_bit_hash_codes` procedures
+
+#### `FIBONACCI_HASH` - maps an integer to a smaller number of bits
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates an `nbits` hash code from a 32 bit integer.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:fibonacci_hash]]( key, nbits )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be a scalar integer expression of kind `INT32`. It is an
+`intent(in)` argument.
+
+`nbits`: Shall be a scalar default integer expression with `0 < nbits <
+32`. It is an `intent(in)` argument.
+
+##### Result
+
+The result is an integer of kind `INT32` with at most the lowest
+`nbits` nonzero.
+
+##### Note
+
+`FIBONACCI_HASH` is an implementation of the Fibonacci Hash of Donald
+E. Knuth. It multiplies the `KEY` by the odd valued approximation to
+`2**32/phi`, where `phi` is the golden ratio 1.618..., and returns the
+`NBITS` upper bits of the product as the lowest bits of the result.
+
+##### Example
+
+```fortran
+    program demo_fibonacci_hash
+      use stdlib_32_bit_hash_codes, only: fibonacci_hash
+      use iso_fortran_env, only: int32 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash, source
+      allocate( array1(0:2**6-1) )
+      array1(:) = 0
+      source = int(Z'1FFFFFF', int32)
+      hash = fibonacci_hash(source, 6)
+      array1(hash) = source
+      print *, hash
+    end program demo_fibonacci_hash
+```
+
+#### `FNV_1_HASH`- calculates a hash code from a key
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit hash code from a rank 1 integer array or a default
+character string.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:fnv_1_hash]]( key )`
+
+##### Class
+
+Pure function
+
+##### Argument
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT32`.
+
+##### Note
+
+`FNV_1_HASH` is an implementation of the original FNV-1 hash code of Glenn
+Fowler, Landon Curt Noll, and Phong Vo.
+It differs from typical implementations in that it also encodes the
+size of the structure in the hash code.
+This code is relatively fast on short keys, and is small enough that it
+will often be retained in the instruction cache if hashing is
+intermittent.
+As a result it should give good performance for typical hash table
+applications.
+This code does not pass any of the SMHasher tests, but the resulting
+degradation in performance due to its larger number of collisions is
+expected to be minor compared to its faster hashing rate.
+
+
+##### Example
+
+```fortran
+    program demo_fnv_1_hash
+      use stdlib_32_bit_hash_codes, only: fnv_1_hash
+      use iso_fortran_env, only: int32 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = fnv_1_hash(array1)
+      print *, hash
+    end program demo_fnv_1_hash
+```
+
+
+#### `FNV_1A_HASH`- calculates a hash code from a key
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit hash code from a rank 1 integer array or a default
+character string.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:fnv_1a_hash]]( key )`
+
+##### Class
+
+Pure function
+
+##### Argument
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT32`.
+
+##### Note
+
+`FNV_1A_HASH` is an implementation of the alternative FNV-1a hash code of
+Glenn Fowler, Landon Curt Noll, and Phong Vo.
+It differs from typical implementations in that it also encodes the
+size of the structure in the hash code.
+This code is relatively fast on short keys, and is small enough that it
+will often be retained in the instruction cache if hashing is
+intermittent.
+As a result it should give good performance for typical hash table
+applications.
+This code does not pass any of the SMHasher tests, but the resulting
+degradation in performance due to its larger number of collisions is
+expected to be minor compared to its faster hashing rate.
+
+##### Example
+
+```fortran
+    program demo_fnv_1a_hash
+      use stdlib_32_bit_hash_codes, only: fnv_1a_hash
+      use iso_fortran_env, only: int32 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = fnv_1a_hash(array1)
+      print *, hash
+    end program demo_fnv_1a_hash
+```
+
+
+#### `NEW_NMHASH32_SEED`- returns a valid input seed for `NMHASH32`
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit "random" integer that is believed to be a valid
+seed for `NMHASH32` and is also different from the input seed.
+
+##### Syntax
+
+`call [[stdlib_32_bit_hash_codes:new_nmhash32_seed]]( seed )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`seed`: shall be a defined integer scalar variable of kind `INT32`.
+It is an `intent(inout)` argument. On input `seed` should be defined,
+and on output it will be different from the input `seed`.
+
+##### Note
+
+Currently there are no known bad seeds for `NMHASH32`, but if any are
+identified the procedure will be revised so that they cannot be
+returned.  This subroutine uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+
+##### Example
+
+See the example for `NMHASH32`.
+
+
+#### `NEW_NMHASH32X_SEED`- returns a valid input seed for `NMHASH32X`
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit "random" integer that is believed to be a valid
+seed for `NMHASH32X` and is also different from the input seed.
+
+##### Syntax
+
+`call [[stdlib_32_bit_hash_codes:new_nmhash32x_seed]]( seed )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`seed`: shall be a defined integer scalar variable of kind `INT32`.
+It is an `intent(inout)` argument. On input `seed` should be defined,
+and on output it will be different from the input `seed`.
+
+##### Note
+
+Currently there are no known bad seeds for `NMHASH32X`, but if any are
+identified the procedure will be revised so that they cannot be
+returned.  This subroutine uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+
+##### Example
+
+See the example for `NMHASH32X`.
+
+
+#### `NEW_WATER_HASH_SEED`- returns a valid input seed for `WATER_HASH`
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 64 bit "random" integer that is believed to be a valid
+seed for `WATER_HASH` and is also different from the input seed.
+
+##### Syntax
+
+`call [[stdlib_32_bit_hash_codes:new_water_hash_seed]]( seed )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`seed`: shall be a defined integer scalar variable of kind `INT64`.
+It is an `intent(inout)` argument. On input `seed` should be defined,
+and on output it will be different from the input `seed`.
+
+##### Note
+
+Currently there are no known bad seeds for `WATER_HASH`, but if any
+are identified the procedure will be revised so that they cannot be
+returned. This subroutine uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+ 
+
+##### Example
+
+See the example for `WATER_HASH`.
+
+
+#### `NMHASH32`- calculates a hash code from a key and a seed
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit hash code from a rank 1 integer array or a default
+character string, and the input `seed`.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:nmhash32]]( key, seed )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+`seed`: shall be an integer scalar expression of kind `INT32`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT32`.
+
+##### Note
+
+`NMHASH32` is an implementation of the `nmhash32` hash code of
+James Z. M. Gao.
+This code has good, but not great, performance on long keys, poorer
+performance on short keys.
+As a result it should give fair performance for typical hash table
+applications.
+This code passes the SMHasher tests, and has no known bad seeds.
+
+##### Example
+
+```fortran
+    program demo_nmhash32
+      use stdlib_32_bit_hash_codes, only: nmhash32, &
+          new_nmhash32_seed
+      use iso_fortran_env, only: int32 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash
+      integer(int32) :: seed = int(Z'11111111', int32)
+      call new_nmhash32_seed(seed)
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = nmhash32(array1, seed)
+      print *, seed, hash
+    end program demo_nmhash32
+```
+
+
+#### `NMHASH32X`- calculates a hash code from a key and a seed
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit hash code from a rank 1 integer array or a default
+character string, and the input `seed`.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:nmhash32x]]( key, seed )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+`seed`: shall be an integer scalar expression of kind `INT32`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT32`.
+
+##### Note
+
+`NMHASH32X` is an implementation of the `nmhash32x` hash code of
+James Z. M. Gao.
+This code has good, but not great, performance on long keys, poorer
+performance on short keys.
+As a result it should give fair performance for typical hash table
+applications.
+This code passes the SMHasher tests, and has no known bad seeds.
+
+##### Example
+
+```fortran
+    program demo_nmhash32x
+      use stdlib_32_bit_hash_codes, only: nmhash32x, &
+          new_nmhash32x_seed
+      use iso_fortran_env, only: int32 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash
+      integer(int32) :: seed = int(Z'11111111', int32)
+      call new_nmhash32x_seed(seed)
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = nmhash32x(array1, seed)
+      print *, seed, hash
+    end program demo_nmhash32x
+```
+
+#### `ODD_RANDOM_INTEGER` - returns an odd integer
+
+##### Status
+
+Experimental
+
+##### Description
+
+Returns a random 32 bit integer distributed uniformly over the odd values.
+
+##### Syntax
+
+`call [[stdlib_32_bit_hash_codes:odd_random_integer]]( harvest )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`harvest`: Shall be a scalar integer variable of kind `INT32`. It is
+an `intent(out)` argument.
+
+##### Note
+
+`ODD_RANDOM_INTEGER` is intended to generate seeds for
+ `UNIVERSAL_MULT_HASH`. `ODD_RANDOM_INTEGER` uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+ 
+##### Example
+
+See `UNIVERSAL_MULT_HASH`.
+
+
+#### `UNIVERSAL_MULT_HASH` - maps an integer to a smaller number of bits
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates an `nbits` hash code from a 32 bit integer.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:universal_mult_hash]]( key, seed, nbits )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be a scalar integer expression of kind `INT32`. It is an
+`intent(in)` argument.
+
+`seed`: Shall be a scalar integer expression of kind `INT32`. It is an
+`intent(in)` argument. It must have an odd value.
+
+`nbits`: Shall be a scalar default integer expression with `0 < nbits <
+32`. It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT32` with at most the lowest
+`nbits` nonzero.
+
+##### Note
+
+`UNIVERSAL_MULT_HASH` is an implementation of the Universal
+Multiplicative Hash of M. Dietzfelbinger, et al.
+It multiplies the `KEY` by `SEED`, and returns the
+`NBITS` upper bits of the product as the lowest bits of the result.
+
+##### Example
+
+```fortran
+    program demo_universal_mult_hash
+      use stdlib_32_bit_hash_codes, only: odd_random_integer, &
+          universal_mult_hash
+      use iso_fortran_env, only: int32 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash, i, seed, source
+      seed = 0
+      allocate( array1(0:2**6-1) )
+      do i = 0, 2**6-1
+          array1(i) = i
+      end do
+      call odd_random_integer( seed )
+      source = int(Z'1FFFFFF', int32)
+      hash = universal_mult_hash(source, seed, 6)
+      array1(hash) = source
+      print *, seed, hash, array1
+    end program demo_universal_mult_hash
+```
+
+#### `WATER_HASH`- calculates a hash code from a key and a seed
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit hash code from a rank 1 integer array or a default
+character string, and the input `seed`.
+
+##### Syntax
+
+`code = [[stdlib_32_bit_hash_codes:water_hash]]( key, seed )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+`seed`: shall be an integer scalar expression of kind `INT64`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT32`.
+
+##### Note
+
+`WATER_HASH` is an implementation of the `waterhash` hash code of
+Tommy Ettinger.
+This code has excellent performance on long keys, and good performance
+on short keys.
+As a result it should give reasonable performance for typical hash
+table applications.
+This code passes the SMHasher tests.
+The `waterhash` is based on the `wyhash` of Wang Yi.
+While `wyhash` has a number of bad seeds, depending on the version,
+so far testing has not found any bad seeds for `waterhash`.
+It can have undefined behavior if the key is not word aligned. 
+
+##### Example
+
+```fortran
+    program demo_water_hash
+      use stdlib_32_bit_hash_codes, only: water_hash, &
+          new_water_hash_seed
+      use iso_fortran_env, only: int32, int64
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int32) :: hash
+      integer(int64) :: seed = int(Z'11111111', int64)
+      call new_water_hash_seed( seed )
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = water_hash(array1, seed)
+      print *, hash, seed
+    end program demo_water_hash
+```
+
+## The `stdlib_64_bit_hash_codes` module
+
+### Overview of the module
+
+Sixty four bit hash functions are generally overkill for hash table
+applications, and are primarily useful for check sums and related
+applications.
+As checksums often have to deal with extremely large files or
+directories, it is often useful to use incremental hashing as well as
+direct hashing, so 64 bit and higher hash algorithms often provide
+multiple implementations. The current module, for simplicity of API,
+doesn't provide any incremental hashes.
+The `stdlib_64_bit_hash_codes` module defines several public
+overloaded 64 bit hash procedures, `FNV_1`, `FNV_1A`,
+`PENGY_HASH`, and `SPOOKY_HASH`, two scalar hash functions,
+`FIBONACCI_HASH` and 
+`UNIVERSAL_MULT_HASH`, a seed generator, `ODD_RANDOM_INTEGER`, for the
+`UNIVERSAL_MULT_HASH`, and two seed generators, `NEW_PENGY_HASH_SEED`
+and `NEW_SPOOKY_HASH_SEED` for their respective hash functions. It
+also defines the integer kind constant, `INT_HASH`, used to specify
+the kind of the hash function results, and a logical constant,
+`LITTLE_ENDIAN`, used to deal with one aspect of the machine
+dependence of the hash codes. 
+Note that while SpookyHash can be used as a sixty four bit hash
+algorithm, its algorithm actually returns two element integer arrays
+of kind `INT64`, so it can also be used as a 128 bit hash.
+
+### The `INT_HASH` parameter
+
+It is necessary to define the kind of integer used to return the hash
+code.
+As `stdlib_64_bit_hash_codes` deals exclusively with 64 bit hash codes,
+`INT_HASH` is an alias for the integer kind `INT64`.
+
+### The `LITTLE_ENDIAN` parameter
+
+In implementing hash functions it is sometimes necessary to know the
+"endianness" of the processor's integers. To this end the
+`stdlib_64_bit_hash_codes` module defines the logical parameter
+`LITTLE_ENDIAN` that if true indicates that the processor has little
+endian integers, and that if false indicates that the integers are big
+endian.
+
+
+### Specifications of the `stdlib_64_bit_hash_codes` procedures
+
+#### `FIBONACCI_HASH` - maps an integer to a smaller number of bits
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates an `nbits` hash code from a 64 bit integer.
+
+##### Syntax
+
+`code = [[stdlib_64_bit_hash_codes:fibonacci_hash]]( key, nbits )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be a scalar integer expression of kind `INT64`. It is an
+`intent(in)` argument.
+
+`nbits`: Shall be a scalar default integer expression with `0 < nbits <
+64`. It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT64` with at most the lowest
+`nbits` nonzero.
+
+##### Note
+
+`FIBONACCI_HASH` is an implementation of the Fibonacci Hash of Donald
+E. Knuth. It multiplies the `KEY` by the odd valued approximation to
+`2**64/phi`, where `phi` is the golden ratio 1.618..., and returns the
+`nbits` upper bits of the product as the lowest bits of the result.
+
+##### Example
+
+```fortran
+    program demo_fibonacci_hash
+      use stdlib_64_bit_hash_codes, only: fibonacci_hash
+      use iso_fortran_env, only: int64 
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int64) :: hash, source
+      allocate( array1(0:2**6-1) )
+      array1(:) = 0
+      source = int(Z'1FFFFFFFF', int64)
+      hash = fibonacci_hash(source, 6)
+      array1(hash) = source
+      print *, hash
+    end program demo_fibonacci_hash
+```
+
+#### `FNV_1`- calculates a hash code from a key
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 64 bit hash code from a rank 1 integer array or a default
+character string.
+
+##### Syntax
+
+`code = [[stdlib_64_bit_hash_codes:fnv_1]]( key )`
+
+##### Class
+
+Pure function
+
+##### Argument
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT64`. 
+
+##### Note
+
+`FNV_1` is an implementation of the original FNV-1 hash code of Glenn
+Fowler, Landon Curt Noll, and Phong Vo.
+It differs from typical implementations in that it also encodes the
+size of the structure in the hash code.
+This code is relatively fast on short keys, and is small enough that it
+will often be retained in the instruction cache if hashing is
+intermittent.
+As a result it should give good performance for typical hash table
+applications, although it is rare for them to need 64 bits.
+This code does not pass any of the SMHasher tests, but the resulting
+degradation in performance due to its larger number of collisions is
+expected to be minor compared to its faster hashing rate.
+
+
+##### Example
+
+```fortran
+    program demo_fnv_1_hash
+      use stdlib_64_bit_hash_codes, only: fnv_1_hash
+      use iso_fortran_env, only: int64
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int64) :: hash
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = fnv_1_hash(array1)
+      print *, hash
+    end program demo_fnv_1_hash
+```
+
+
+#### `FNV_1A`- calculates a hash code from a key
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 64 bit hash code from a rank 1 integer array or a default
+character string.
+
+##### Syntax
+
+`code = [[stdlib_64_bit_hash_codes:fnv_1a]]( key )`
+
+##### Class
+
+Pure function
+
+##### Argument
+
+`key`: Shall be a deferred length default character scalar expression
+or a rank 1 integer array expression of kind `INT8`, `INT16`,
+`INT32`, or `INT64`.
+It is an `intent(in)` argument.
+
+##### Result
+
+The result is a scalar integer of kind `INT64`.
+
+##### Note
+
+`FNV_1A` is an implementation of the alternative FNV-1a hash code of
+Glenn Fowler, Landon Curt Noll, and Phong Vo.
+It differs from typical implementations in that it also encodes the
+size of the structure in the hash code.
+This code is relatively fast on short keys, and is small enough that it
+will often be retained in the instruction cache if hashing is
+intermittent.
+As a result it should give good performance for typical hash table
+applications.
+This code does not pass any of the SMHasher tests, but the resulting
+degradation in performance due to its larger number of collisions is
+expected to be minor compared to its faster hashing rate.
+
+##### Example
+
+```fortran
+    program demo_fnv_1a_hash
+      use stdlib_64_bit_hash_codes, only: fnv_1a_hash
+      use iso_fortran_env, only: int64
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int64) :: hash
+      array1 = [ 5, 4, 3, 1, 10, 4, 9]
+      hash = fnv_1a_hash(array1)
+      print *, hash
+    end program demo_fnv_1a_hash
+```
+
+
+#### `NEW_PENGY_HASH_SEED`- returns a valid input seed for `PENGY_HASH`
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 32 bit "random" integer that is believed to be a valid
+seed for `PENGY_HASH` and is also different from the input seed.
+
+##### Syntax
+
+`call [[stdlib_64_bit_hash_codes:new_pengy_hash_seed]]( seed )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`seed`: shall be a defined integer scalar variable of kind `INT32`.
+It is an `intent(inout)` argument. On input `seed` should be defined,
+and on output it will be different from the input `seed`.
+
+##### Note
+
+Currently there are no known bad seeds for `PENGY_HASH`, but if any are
+identified the procedure will be revised so that they cannot be
+returned.  This subroutine uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+
+##### Example
+
+See the example for `PENGY_HASH`.
+
+
+#### `NEW_SPOOKY_HASH_SEED`- returns a valid input seed for `SPOOKY_HASH`
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates a 64 bit two element vector of "random" integer values that
+is believed to be a valid seed for `SPOOKY_HASH` and is also different
+from the input seed. 
+
+##### Syntax
+
+`call [[stdlib_64_bit_hash_codes:new_spooky_hash_seed]]( seed )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`seed`: shall be a defined two element integer vector variable of kind
+`INT64`. It is an `intent(inout)` argument. On input `seed` should be
+defined, and on output it will be different from the input `seed`.
+
+##### Note
+
+Currently there are no known bad seeds for `SPOOKY_HASH`, but if any are
+identified the procedure will be revised so that they cannot be
+returned.  This subroutine uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+
+##### Example
+
+See the example for `SPOOKY_HASH`.
+
+
+#### `ODD_RANDOM_INTEGER` - returns an odd integer
+
+##### Status
+
+Experimental
+
+##### Description
+
+Returns a random 64 bit integer distributed uniformly over the odd values.
+
+##### Syntax
+
+`call [[stdlib_64_bit_hash_codes:odd_random_integer]]( harvest )`
+
+##### Class
+
+Subroutine
+
+##### Argument
+
+`harvest`: Shall be an integer of kind `INT64`. It is an `intent(out)`
+argument.
+
+##### Note
+
+`ODD_RANDOM_INTEGER` is intended to generate seeds for
+ `UNIVERSAL_MULT_HASH`. `ODD_RANDOM_INTEGER` uses Fortran's intrinsic
+ `RANDOM_NUMBER` and the values returned can be changed by calling the
+ intrinsic `RANDOM_INIT`.
+
+##### Example
+
+See `UNIVERSAL_MULT_HASH`.
+
+
+#### `PENGY_HASH` - maps a character string or integer vector to an integer
+
+##### Status
+
+Experimental
+
+##### Description
+
+Maps a character string or integer vector to a 64 bit integer whose
+value also depends on a scalar 32 bit integer, `seed`.
+
+##### Syntax
+
+`code = [[stdlib_64_bit_hash_codes:pengy_hash]]( key, seed )`
+
+#####  Class
+
+Pure function
+
+##### Arguments
+
+`key`: shall be a scalar expression of type default character or a
+rank 1 integer vector expression of kind `INT8`, `INT16`, `INT32`, or
+`INT64`. It is an `intent(in)` argument.
+
+`seed`: shall be an integer expression of kind `INT32`. It is
+an `intent(in)` argument.
+
+##### Result
+
+The result is an integer of kind `INT64`.
+
+##### Note
+
+`PENGY_HASH` is an implementation of the 64 bit `pengyhash` of Alberto
+Fajardo. The hash has acceptable performance on small keys, and good
+performance on long keys. It passes all the SMHasher tests, and has
+no known bad seeds.
+
+##### Example
+
+```fortran
+    program demo_pengy_hash
+      use stdlib_64_bit_hash_codes, only: new_pengy_hash_seed, pengy_hash
+      use iso_fortran_env, only: int32, int64
+      implicit none
+      integer, allocatable :: key(:)
+      integer(int64) :: hash
+      integer(int32)  ::  seed
+      key = [ 0_int64, 1_int64, 2_int64, 3_int64 ]
+      seed = 0_int32
+      call new_pengy_hash_seed( seed )
+      hash = pengy_hash( key, seed )
+      print *, seed, hash
+    end program demo_pengy_hash
+```
+
+
+#### `SPOOKY_HASH` - maps a character string or integer vector to an integer
+
+##### Status
+
+Experimental
+
+##### Description
+
+Maps a character string or integer vector to a 64 bit integer whose
+value also depends on a two element vector,  `seed`.
+
+##### Syntax
+
+`code = [[stdlib_64_bit_hash_codes:spooky_hash]]( key, seed )`
+
+#####  Class
+
+Pure function
+
+##### Arguments
+
+`key`: shall be a scalar expression of type default character or a
+rank 1 integer vector expression of kind `INT8`, `INT16`, `INT32`, or
+`INT64`. It is an `intent(in)` argument.
+
+`seed`: shall be a two element integer vector expression of kind
+`INT64`. It is an `intent(in)` argument.
+
+##### Result
+
+The result is a two element integer vector of kind `INT64`.
+
+##### Note
+
+`SPOOKY_HASH` is an implementation of the 64 bit version 2 of
+SpookyHash of Bob Jenkins. The code was designed for Little-Endian
+processors. The output is different on Big Endian processors, but still
+probably as good quality. It is often used as a 64 bit hash using the
+first element of the returned value, but can be used as a 128 bit
+hash. This version of `SPOOKY_HASH` has good performance on small keys
+and excellent performance on long keys. It passes all the SMHasher tests
+and has no known bad seeds.
+
+##### Example
+
+```fortran
+    program demo_spooky_hash
+      use stdlib_64_bit_hash_codes, only: new_spooky_hash_seed, &
+          spooky_hash
+      use iso_fortran_env, only: int64 
+      implicit none
+      integer, allocatable :: key(:)
+      integer(int64) :: hash(2), seed(2), source
+      key = [ 0_int64, 1_int64, 2_int64, 3_int64 ]
+      seed = [ 119_int64, 2_int64**41-1 ]
+      call new_spooky_hash_seed( seed )
+      hash = spooky_hash( key, seed )
+      print *, seed, hash
+    end program demo_spooky_hash
+```
+
+#### `UNIVERSAL_MULT_HASH` - maps an integer to a smaller number of bits
+
+##### Status
+
+Experimental
+
+##### Description
+
+Calculates an `nbits` hash code from a 64 bit integer.
+
+##### Syntax
+
+`code = [[stdlib_64_bit_hash_codes:universal_mult_hash]]( key, seed, nbits )`
+
+##### Class
+
+Pure function
+
+##### Arguments
+
+`key`: Shall be an integer of kind `INT64`. It is an `intent(in)`
+argument.
+
+`seed`: Shall be an integer of kind `INT64`. It is an `intent(in)`
+argument. It should be an odd value.
+
+`nbits`: Shall be a default integer with `0 < nbits < 64`. It is an
+`intent(in)` argument.
+
+##### Result
+
+The result is an integer of kind `INT64` with at most the lowest
+`nbits` nonzero.
+
+##### Note
+
+`UNIVERSAL_MULT_HASH` is an implementation of the Universal
+Multiplicative Hash of M. Dietzfelbinger, et al.
+It multiplies the `KEY` by `SEED`, and returns the
+`NBITS` upper bits of the product as the lowest bits of the result.
+
+##### Example
+
+
+```fortran
+    program demo_universal_mult_hash
+      use stdlib_64_bit_hash_codes, only: odd_random_integer, &
+          universal_mult_hash
+      use iso_fortran_env, only: int64
+      implicit none
+      integer, allocatable :: array1(:)
+      integer(int64) :: hash, i, seed, source
+      seed = 0
+      allocate( array1(0:2**6-1) )
+      do i = 0, 2**6-1
+          array1(i) = i
+      end do
+      call odd_random_integer( seed )
+      source = int(Z'1FFFFFF', int64)
+      hash = universal_mult_hash(source, seed, 6)
+      array1(hash) = source
+      print *, seed, hash, array1
+    end program demo_universal_mult_hash
+```
+
+
+### Test Codes
+
+The Fortran Standard Library provides two test codes for the hash
+functions of `stdlib_32_bit_hash_functions` and
+`stdlib_64_bit_hash_functions`, `test_32_bit_hash_performance` and
+`test_64_bit_hash_performance` respectively. These are primarily set
+up to test runtime performance of the functions. They take a sample of
+`2**18` integers of kind `INT8` and break it up into vectors of size
+1, 2, 4, 8, 16, 64,  256, and 1024 elements, yielding `2**18`,
+`2**17`, `2**16`, `2**15`, `2**14`, `2**12`, `2**10`, and `2**8`
+vectors respectively. These are then processed by the hash functions
+4 times, and the time for processing is reported. Testing so far has
+been on a MacBook Pro with a 2.3 GHz Quad-Core Intel Core i5 and 8 GB
+2133 MHz LPDDR3 of RAM, using GNU Fortran (GCC) 11.1.0 to compile the
+code. The results for `test_32_bit_hash_performance` are given by the
+following table:
+
+| Algorithm  | Key Size  | Key #      | Time (s) |
+|            | Bytes     |            |          |
+|------------|-----------|------------|----------|
+|     FNV-1  |       1   |    1048576 |  0.02949 |
+|     FNV-1  |       2   |     524288 |  0.02361 |
+|     FNV-1  |       4   |     262144 |  0.02016 |
+|     FNV-1  |       8   |     131072 |  0.01806 |
+|     FNV-1  |      16   |      65536 |  0.01867 |
+|     FNV-1  |      64   |      16384 |  0.01717 |
+|     FNV-1  |     256   |       4096 |  0.01759 |
+|     FNV-1  |    1024   |       1024 |  0.01659 |
+|    FNV-1a  |       1   |    1048576 |  0.02897 |
+|    FNV-1a  |       2   |     524288 |  0.02472 |
+|    FNV-1a  |       4   |     262144 |  0.02025 |
+|    FNV-1a  |       8   |     131072 |  0.01901 |
+|    FNV-1a  |      16   |      65536 |  0.01898 |
+|    FNV-1a  |      64   |      16384 |  0.01784 |
+|    FNV-1a  |     256   |       4096 |  0.01723 |
+|    FNV-1a  |    1024   |       1024 |  0.01673 |
+|  nmhash32  |       1   |    1048576 |  0.31092 |
+|  nmhash32  |       2   |     524288 |  0.16230 |
+|  nmhash32  |       4   |     262144 |  0.07815 |
+|  nmhash32  |       8   |     131072 |  0.04176 |
+|  nmhash32  |      16   |      65536 |  0.09261 |
+|  nmhash32  |      64   |      16384 |  0.04587 |
+|  nmhash32  |     256   |       4096 |  0.07238 |
+|  nmhash32  |    1024   |       1024 |  0.07263 |
+| nmhash32x  |       1   |    1048576 |  0.04294 |
+| nmhash32x  |       2   |     524288 |  0.02937 |
+| nmhash32x  |       4   |     262144 |  0.01096 |
+| nmhash32x  |       8   |     131072 |  0.00911 |
+| nmhash32x  |      16   |      65536 |  0.01291 |
+| nmhash32x  |      64   |      16384 |  0.00859 |
+| nmhash32x  |     256   |       4096 |  0.07373 |
+| nmhash32x  |    1024   |       1024 |  0.07618 |
+|     water  |       1   |    1048576 |  0.12560 |
+|     water  |       2   |     524288 |  0.06302 |
+|     water  |       4   |     262144 |  0.04020 |
+|     water  |       8   |     131072 |  0.01999 |
+|     water  |      16   |      65536 |  0.01459 |
+|     water  |      64   |      16384 |  0.00923 |
+|     water  |     256   |       4096 |  0.00816 |
+|     water  |    1024   |       1024 |  0.00792 |
+
+while for `test_64_bit_hash_performance` the results are:
+
+| Algorithm  | Key Size  | Key #      | Time (s) |
+|            | Bytes     |            |          |
+|------------|-----------|------------|----------|
+|     FNV-1  |       1   |    1048576 |  0.02981 |
+|     FNV-1  |       2   |     524288 |  0.02697 |
+|     FNV-1  |       4   |     262144 |  0.02275 |
+|     FNV-1  |       8   |     131072 |  0.02431 |
+|     FNV-1  |      16   |      65536 |  0.02158 |
+|     FNV-1  |      64   |      16384 |  0.02007 |
+|     FNV-1  |     256   |       4096 |  0.01932 |
+|     FNV-1  |    1024   |       1024 |  0.02089 |
+|    FNV-1a  |       1   |    1048576 |  0.03226 |
+|    FNV-1a  |       2   |     524288 |  0.03076 |
+|    FNV-1a  |       4   |     262144 |  0.02359 |
+|    FNV-1a  |       8   |     131072 |  0.02542 |
+|    FNV-1a  |      16   |      65536 |  0.02364 |
+|    FNV-1a  |      64   |      16384 |  0.02130 |
+|    FNV-1a  |     256   |       4096 |  0.01962 |
+|    FNV-1a  |    1024   |       1024 |  0.01966 |
+|     Pengy  |       1   |    1048576 |  0.24294 |
+|     Pengy  |       2   |     524288 |  0.12066 |
+|     Pengy  |       4   |     262144 |  0.06205 |
+|     Pengy  |       8   |     131072 |  0.03138 |
+|     Pengy  |      16   |      65536 |  0.01608 |
+|     Pengy  |      64   |      16384 |  0.00669 |
+|     Pengy  |     256   |       4096 |  0.00387 |
+|     Pengy  |    1024   |       1024 |  0.00295 |
+|    Spooky  |       1   |    1048576 |  0.11920 |
+|    Spooky  |       2   |     524288 |  0.07478 |
+|    Spooky  |       4   |     262144 |  0.03185 |
+|    Spooky  |       8   |     131072 |  0.01468 |
+|    Spooky  |      16   |      65536 |  0.01503 |
+|    Spooky  |      64   |      16384 |  0.00440 |
+|    Spooky  |     256   |       4096 |  0.00290 |
+|    Spooky  |    1024   |       1024 |  0.00177 |
+
+As the tested function will typically reside in the instruction cache
+these results do not include the costs of reloading the procedure if
+hashing is intermittent. If hashing is intermittent then that can more
+severely impact the performance of  `nmhash32`, `nmhash32x`,
+`water_hash`, `pengy_hash`, and `spooky_hash` relative to
+`fnv_1_hash` and `fnv_1a_hash`.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index bb9fb4fd8..d0d20c492 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -2,6 +2,14 @@
 
 # Create a list of the files to be preprocessed
 set(fppFiles
+    stdlib_32_bit_fnv_hashes.fypp
+    stdlib_32_bit_hash_functions.fypp
+    stdlib_32_bit_nmhashes.fypp 
+    stdlib_32_bit_water_hashes.fypp 
+    stdlib_64_bit_fnv_hashes.fypp 
+    stdlib_64_bit_hash_functions.fypp 
+    stdlib_64_bit_pengy_hashes.fypp 
+    stdlib_64_bit_spookyv2_hashes.fypp 
     stdlib_ascii.fypp
     stdlib_bitsets.fypp
     stdlib_bitsets_64.fypp
diff --git a/src/Makefile.manual b/src/Makefile.manual
index 179fc600f..78ea780fb 100644
--- a/src/Makefile.manual
+++ b/src/Makefile.manual
@@ -1,4 +1,12 @@
 SRCFYPP = \
+        stdlib_32_bit_fnv_hashes.fypp \
+        stdlib_32_bit_hash_functions.fypp \
+        stdlib_32_bit_nmhashes.fypp \
+        stdlib_32_bit_water_hashes.fypp \
+        stdlib_64_bit_fnv_hashes.fypp \
+        stdlib_64_bit_hash_functions.fypp \
+        stdlib_64_bit_pengy_hashes.fypp \
+        stdlib_64_bit_spookyv2_hashes.fypp \
         stdlib_ascii.fypp \
         stdlib_bitsets_64.fypp \
         stdlib_bitsets_large.fypp \
@@ -74,6 +82,22 @@ $(SRCGEN): %.f90: %.fypp common.fypp
 
 # Fortran module dependencies
 f18estop.o: stdlib_error.o
+stdlib_32_bit_fnv_hashes.o: \
+   stdlib_32_bit_hash_functions.o
+stdlib_32_bit_hash_functions.o: \
+   stdlib_kinds.o
+stdlib_32_bit_nmhashes.o: \
+   stdlib_32_bit_hash_functions.o
+stdlib_32_bit_water_hashes.o: \
+   stdlib_32_bit_hash_functions.o
+stdlib_64_bit_fnv_hashes.o: \
+   stdlib_64_bit_hash_functions.o
+stdlib_64_bit_hash_functions.o: \
+   stdlib_kinds.o
+stdlib_64_bit_pengy_hashes.o: \
+   stdlib_64_bit_hash_functions.o
+stdlib_64_bit_spookyv2_hashes.o: \
+   stdlib_64_bit_hash_functions.o
 stdlib_ascii.o: stdlib_kinds.o
 stdlib_bitsets.o: stdlib_kinds.o
 stdlib_bitsets_64.o: stdlib_bitsets.o
diff --git a/src/stdlib_32_bit_fnv_hashes.fypp b/src/stdlib_32_bit_fnv_hashes.fypp
new file mode 100644
index 000000000..562de2978
--- /dev/null
+++ b/src/stdlib_32_bit_fnv_hashes.fypp
@@ -0,0 +1,126 @@
+!!------------------------------------------------------------------------------
+!! `FNV_1_HASH` and `FNV_1A_HASH` are translations to Fortran 2008 of the
+!! `FNV-1` and `FNV-1a` hash functions of Glenn Fowler, Landon Curt Noll,
+!! and Phong Vo, which have been released into the public domain. Permission
+!! has been granted, by Landon Curt Noll, for the use of these algorithms
+!! in the Fortran Standard Library. A description of these functions is
+!! available at https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function.
+!!------------------------------------------------------------------------------
+
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int16", "int32", "int64"]
+
+submodule(stdlib_32_bit_hash_functions) stdlib_32_bit_fnv_hashes
+!! An implementation of the FNV hashes 1 and 1a of Glenn Fowler, Landon Curt
+!! Noll, and Kiem-Phong Vo,
+!! https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
+    implicit none
+! The 32 bit FNV offset basis and FNV prime shared by both algorithms
+    integer(int_hash), parameter ::                  &
+        offset_basis = int( z'811C9DC5', int_hash ), &
+        prime        = int( z'01000193', int_hash )
+
+contains
+
+    pure module function int8_fnv_1( key ) result(hash_code)
+!! The original FNV-1 8-bit key algorithm.
+        integer(int8), intent(in)     :: key(:)
+        integer(int_hash)             :: hash_code
+
+        integer(int64) :: i
+! FNV-1 order: multiply by the prime, then XOR in the zero extended key byte
+        hash_code = offset_basis
+        do i=1_int64, size(key, kind=int64)
+            hash_code = hash_code * prime
+            if ( little_endian ) then
+                hash_code = ieor( hash_code, &
+                                  transfer( [key(i), 0_int8, 0_int8, 0_int8], &
+                                            0_int_hash ) )
+            else
+                hash_code = ieor( hash_code, &
+                                  transfer( [0_int8, 0_int8, 0_int8, key(i)], &
+                                            0_int_hash ) )
+            end if
+        end do
+
+    end function int8_fnv_1
+
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_fnv_1( key ) result(hash_code)
+!! A ${k1}$ array key wrapper for the FNV-1 algorithm.
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int_hash)           :: hash_code
+! Reinterpret KEY as its underlying bytes and hash those with INT8_FNV_1
+        hash_code = int8_fnv_1( transfer( key, 0_int8,                      &
+                                          bytes_${k1}$*                     &
+                                          size( key, kind=int64 ) ) )
+
+    end function ${k1}$_fnv_1
+
+#:endfor
+
+
+    pure module function character_fnv_1( key ) result(hash_code)
+!! A default character key wrapper for the FNV-1 algorithm.
+        character(*), intent(in)      :: key
+        integer(int_hash)             :: hash_code
+! Reinterpret KEY as its underlying bytes and hash those with INT8_FNV_1
+        hash_code = int8_fnv_1( transfer( key,                           &
+                                          0_int8,                        &
+                                          bytes_char*                    &
+                                          len(key, kind=int64) ) )
+
+    end function character_fnv_1
+
+
+    pure module function int8_fnv_1a( key ) result(hash_code)
+!! The original FNV-1a 8-bit key algorithm.
+        integer(int8), intent(in)     :: key(:)
+        integer(int_hash)             :: hash_code
+
+        integer(int64) :: i
+! FNV-1a order: XOR in the zero extended key byte, then multiply by the prime
+        hash_code = offset_basis
+        do i=1_int64, size(key, kind=int64)
+            if ( little_endian ) then
+                hash_code = ieor( hash_code, &
+                                  transfer( [key(i), 0_int8, 0_int8, 0_int8],  &
+                                            0_int_hash ) )
+            else
+                hash_code = ieor( hash_code, &
+                                  transfer( [0_int8, 0_int8, 0_int8, key(i)], &
+                                            0_int_hash ) )
+            end if
+            hash_code = hash_code * prime
+        end do
+
+    end function int8_fnv_1a
+
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_fnv_1a( key ) result(hash_code)
+!! A ${k1}$ array key wrapper for the FNV-1a algorithm.
+        integer(${k1}$), intent(in)   :: key(:)
+        integer(int_hash)             :: hash_code
+! Reinterpret KEY as its underlying bytes and hash those with INT8_FNV_1A
+        hash_code = int8_fnv_1a( transfer( key, 0_int8,                   &
+                                           bytes_${k1}$*                  &
+                                           size(key, kind=int64)) )
+
+    end function ${k1}$_fnv_1a
+
+#:endfor
+
+    pure module function character_fnv_1a( key ) result(hash_code)
+!! A default character key wrapper for the FNV-1a algorithm.
+        character(*), intent(in)      :: key
+        integer(int_hash)             :: hash_code
+! Reinterpret KEY as its underlying bytes and hash those with INT8_FNV_1A
+        hash_code = int8_fnv_1a( transfer( key, 0_int8,                   &
+                                           bytes_char*                    &
+                                           len(key, kind=int64) ) )
+
+    end function character_fnv_1a
+
+end submodule stdlib_32_bit_fnv_hashes
diff --git a/src/stdlib_32_bit_hash_functions.fypp b/src/stdlib_32_bit_hash_functions.fypp
new file mode 100644
index 000000000..9425a2280
--- /dev/null
+++ b/src/stdlib_32_bit_hash_functions.fypp
@@ -0,0 +1,244 @@
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int8", "int16", "int32", "int64"]
+
+module stdlib_32_bit_hash_functions
+!! Shared kinds, constants, and generic interfaces for the 32 bit hash functions
+    use, intrinsic :: iso_fortran_env, only : &
+        character_storage_size
+
+    use stdlib_kinds, only: &
+        dp,                 &
+        int8,               &
+        int16,              &
+        int32,              &
+        int64
+
+    implicit none
+
+    private
+
+    integer, parameter, public :: &
+        int_hash     = int32
+!! The integer kind of the output hash code (INT32, so hash codes have 32 bits)
+
+! pow32_over_phi is the odd integer that most closely approximates 2**32/phi,
+! where phi is the golden ratio 1.618...
+    integer(int32), parameter ::                 &
+        pow32_over_phi = int( z'9E3779B9', int32 )
+
+! The number of bits used by each integer type
+    integer, parameter ::               &
+! Should be 8
+        bits_int8  = bit_size(0_int8),  &
+! Should be 16
+        bits_int16 = bit_size(0_int16), &
+! Should be 32
+        bits_int32 = bit_size(0_int32), &
+! Should be 64
+        bits_int64 = bit_size(0_int64)
+! The number of bytes (INT8 sized units) used by each integer type
+    integer, parameter ::                   &
+! Should be 1
+        bytes_int8  = bits_int8/bits_int8,  &
+! Should be 2
+        bytes_int16 = bits_int16/bits_int8, &
+! Should be 4
+        bytes_int32 = bits_int32/bits_int8, &
+! Should be 8
+        bytes_int64 = bits_int64/bits_int8
+! The number of bits and of bytes in one character storage unit
+    integer, parameter ::                   &
+        bits_char = character_storage_size, &
+        bytes_char = bits_char/bits_int8
+
+! True when the host stores the least significant byte first (little endian)
+    logical, parameter, public ::                                    &
+        little_endian = ( 1 == transfer([1_int8, 0_int8], 0_int16) )
+
+    public ::               &
+        fibonacci_hash,     &
+        fnv_1_hash,         &
+        fnv_1a_hash,        &
+        new_nmhash32_seed,  &
+        new_nmhash32x_seed, &
+        new_water_hash_seed,&
+        nmhash32,           &
+        nmhash32x,          &
+        odd_random_integer, &
+        universal_mult_hash,&
+        water_hash
+
+
+    interface fnv_1_hash
+!! FNV_1 interfaces
+
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_fnv_1( key ) result(hash_code)
+!! FNV_1 hash function for rank 1 array keys of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(:)
+              integer(int_hash)           :: hash_code
+          end function ${k1}$_fnv_1
+
+        #:endfor
+
+        pure module function character_fnv_1( key ) result(hash_code)
+!! FNV_1 hash function for default character string keys
+            character(*), intent(in) :: key
+            integer(int_hash)        :: hash_code
+        end function character_fnv_1
+
+    end interface fnv_1_hash
+
+    interface fnv_1a_hash
+!! FNV_1A interfaces
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_fnv_1a( key ) result(hash_value)
+!! FNV_1A hash function for rank 1 array keys of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(:)
+              integer(int_hash)           :: hash_value
+          end function ${k1}$_fnv_1a
+
+        #:endfor
+
+        pure module function character_fnv_1a( key ) result(hash_value)
+!! FNV_1A hash function for default character string keys
+            character(*), intent(in) :: key
+            integer(int_hash)        :: hash_value
+        end function character_fnv_1a
+
+    end interface fnv_1a_hash
+
+    interface nmhash32
+!!  NMHASH32 interfaces
+
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_nmhash32( key, seed ) &
+              result(hash_value)
+!! NMHASH32 hash function for rank 1 array keys of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(0:)
+              integer(int32), intent(in)  :: seed
+              integer(int32)              :: hash_value
+          end function ${k1}$_nmhash32
+
+        #:endfor
+
+        pure module function character_nmhash32( key, seed ) &
+            result(hash_value)
+!! NMHASH32 hash function for default character string keys
+            character(*), intent(in)      :: key
+            integer(int32), intent(in) :: seed
+            integer(int32) :: hash_value
+        end function character_nmhash32
+
+    end interface nmhash32
+
+    interface nmhash32x
+!!  NMHASH32X interfaces
+
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_nmhash32x( key, seed ) &
+              result(hash_value)
+!! NMHASH32X hash function for rank 1 array keys of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(0:)
+              integer(int32), intent(in) :: seed
+              integer(int32)             :: hash_value
+          end function ${k1}$_nmhash32x
+
+        #:endfor
+
+        pure module function character_nmhash32x( key, seed ) &
+            result(hash_value)
+!! NMHASH32X hash function for default character string keys
+            character(*), intent(in)      :: key
+            integer(int32), intent(in) :: seed
+            integer(int32) :: hash_value
+        end function character_nmhash32x
+
+    end interface nmhash32x
+
+    interface water_hash
+!! WATER_HASH interfaces
+
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_water_hash( key, seed ) &
+              result(hash_code)
+!! WATER HASH function for rank 1 array keys of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(0:)
+              integer(int64), intent(in)  :: seed
+              integer(int_hash)           :: hash_code
+          end function ${k1}$_water_hash
+        #:endfor
+
+        pure module function character_water_hash( key, seed ) &
+            result(hash_code)
+!! WATER hash function for default character string keys
+            character(*), intent(in)   :: key
+            integer(int64), intent(in) :: seed
+            integer(int_hash)          :: hash_code
+        end function character_water_hash
+
+    end interface water_hash
+
+    interface new_water_hash_seed
+!! Presumably replaces SEED with a new WATER_HASH seed; body is in a submodule
+        module subroutine new_water_hash_seed( seed )
+            integer(int64), intent(inout) :: seed
+        end subroutine new_water_hash_seed
+
+    end interface new_water_hash_seed
+
+    interface new_nmhash32_seed
+!! Presumably replaces SEED with a new NMHASH32 seed; body is in a submodule
+        module subroutine new_nmhash32_seed( seed )
+            integer(int32), intent(inout) :: seed
+        end subroutine new_nmhash32_seed
+
+    end interface new_nmhash32_seed
+
+    interface new_nmhash32x_seed
+!! Presumably replaces SEED with a new NMHASH32X seed; body is in a submodule
+        module subroutine new_nmhash32x_seed( seed )
+            integer(int32), intent(inout) :: seed
+        end subroutine new_nmhash32x_seed
+
+    end interface new_nmhash32x_seed
+
+contains
+
+    pure function fibonacci_hash( key, nbits ) result( sample )
+!! Maps the 32 bit integer KEY to an unsigned integer value with only NBITS
+!! bits where NBITS is less than 32
+        integer(int32), intent(in) :: key
+        integer, intent(in)        :: nbits
+        integer(int32)             :: sample
+! The logical right shift by 32-NBITS keeps the NBITS highest bits of the product
+        sample = ishft( key*pow32_over_phi, -32 + nbits )
+
+    end function fibonacci_hash
+
+    pure function universal_mult_hash( key, seed, nbits ) result( sample )
+!! Uses the "random" odd 32 bit integer SEED to map the 32 bit integer KEY to
+!! an unsigned integer value with only NBITS bits where NBITS is less than 32
+        integer(int32), intent(in) :: key
+        integer(int32), intent(in) :: seed
+        integer, intent(in)        :: nbits
+        integer(int32)             :: sample
+! The logical right shift by 32-NBITS keeps the NBITS highest bits of the product
+        sample = ishft( key*seed, -32 + nbits )
+
+    end function universal_mult_hash
+
+    subroutine odd_random_integer( harvest )
+!! Returns a 32 bit pseudo random integer, HARVEST, distributed uniformly over
+!! the odd integers of the INT32 kind.
+        integer(int32), intent(out) :: harvest
+        real(dp) :: sample
+! Scale SAMPLE onto the full INT32 range, then shift left and set bit 0 (odd)
+        call random_number( sample )
+        harvest = int( floor( sample * 2_int64**32, int64 ) - 2_int64**31, &
+            int32 )
+        harvest = ishft( harvest, 1 ) + 1_int32
+
+    end subroutine odd_random_integer
+
+end module stdlib_32_bit_hash_functions
diff --git a/src/stdlib_32_bit_nmhashes.fypp b/src/stdlib_32_bit_nmhashes.fypp
new file mode 100644
index 000000000..ba1fcb504
--- /dev/null
+++ b/src/stdlib_32_bit_nmhashes.fypp
@@ -0,0 +1,801 @@
+!!------------------------------------------------------------------------------
+!! `NM_HASH32` and `NM_HASH32X` are translations to Fortran 2008 and signed
+!! two's complement arithmetic of the `nmhash32` and `nmhash32x` V. 2 scalar
+!! algorithms of James Z. M. Gao, copyright 2021. James Z. M. Gao's original
+!! C++ code, `nmhash.h`, is available at the URL:
+!! https://github.com/gzm55/hash-garage/blob/a8913138bdb3b7539c202edee30a7f0794bbd835/nmhash.h
+!! under the BSD 2-Clause License:
+!! https://github.com/gzm55/hash-garage/blob/a8913138bdb3b7539c202edee30a7f0794bbd835/LICENSE
+!! The algorithms come in multiple versions, depending on whether the
+!! vectorized instructions SSE2 or AVX2 are available. As neither instruction
+!! set is available in portable Fortran 2008, the scalar versions, which do
+!! not use these instructions, are the ones translated here.
+!!
+!! The BSD 2-Clause license is as follows:
+!!
+!! BSD 2-Clause License
+!!
+!! Copyright (c) 2021, water hash algorithm. James Z.M. Gao
+!! All rights reserved.
+!!
+!! Redistribution and use in source and binary forms, with or without
+!! modification, are permitted provided that the following conditions are met:
+!!
+!! 1. Redistributions of source code must retain the above copyright notice,
+!!    this list of conditions and the following disclaimer.
+!!
+!! 2. Redistributions in binary form must reproduce the above copyright notice,
+!!    this list of conditions and the following disclaimer in the documentation
+!!    and/or other materials provided with the distribution.
+!!
+!! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+!! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+!! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+!! ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+!! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+!! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+!! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+!! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+!! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+!! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+!! POSSIBILITY OF SUCH DAMAGE.
+!!------------------------------------------------------------------------------
+
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int16", "int32", "int64"]
+
+submodule(stdlib_32_bit_hash_functions) stdlib_32_bit_nmhashes
+
+    implicit none
+
+! Primes from XXH
+    integer(int32), parameter :: nmh_prime32_1 = int( Z'9E3779B1', int32 )
+    integer(int32), parameter :: nmh_prime32_2 = int( Z'85EBCA77', int32 )
+    integer(int32), parameter :: nmh_prime32_3 = int( Z'C2B2AE3D', int32 )
+    integer(int32), parameter :: nmh_prime32_4 = int( Z'27D4EB2F', int32 )
+! Multipliers of the base mixer applied to keys of 9 or more bytes
+    integer(int32), parameter :: nmh_m1 = int(z'F0D9649B', int32 )
+    integer(int32), parameter :: nmh_m2 = int(z'29A7935D', int32 )
+    integer(int32), parameter :: nmh_m3 = int(z'55D35831', int32 )
+! The same multipliers replicated once per accumulator lane
+    integer(int32), parameter :: nmh_m1_v(0:31) = nmh_m1
+    integer(int32), parameter :: nmh_m2_v(0:31) = nmh_m2
+    integer(int32), parameter :: nmh_m3_v(0:31) = nmh_m3
+! NMH_M3 viewed as two 16 bit halves in host byte order
+    integer(int16), parameter :: nmh_m3_16(2) = transfer( nmh_m3, 0_int16, 2 )
+! NOTE(review): presumably variant selectors for NMHASH32X; uses not visible here
+    logical, parameter :: nmh_short32_without_seed2=.false.
+    logical, parameter :: nmh_short32_with_seed2=.true.
+! Number of 32 bit lanes in each accumulator of the long key algorithm
+    integer, parameter :: init_size = 32
+
+! Pseudorandom secrets taken directly from FARSH.
+    integer(int32), parameter :: nmh_acc_init(0:init_size-1) = [ &
+        int( z'B8FE6C39', int32 ), int( z'23A44BBE', int32 ), &
+        int( z'7C01812C', int32 ), int( z'F721AD1C', int32 ), &
+        int( z'DED46DE9', int32 ), int( z'839097DB', int32 ), &
+        int( z'7240A4A4', int32 ), int( z'B7B3671F', int32 ), &
+        int( z'CB79E64E', int32 ), int( z'CCC0E578', int32 ), &
+        int( z'825AD07D', int32 ), int( z'CCFF7221', int32 ), &
+        int( z'B8084674', int32 ), int( z'F743248E', int32 ), &
+        int( z'E03590E6', int32 ), int( z'813A264C', int32 ), &
+
+        int( z'3C2852BB', int32 ), int( z'91C300CB', int32 ), &
+        int( z'88D0658B', int32 ), int( z'1B532EA3', int32 ), &
+        int( z'71644897', int32 ), int( z'A20DF94E', int32 ), &
+        int( z'3819EF46', int32 ), int( z'A9DEACD8', int32 ), &
+        int( z'A8FA763F', int32 ), int( z'E39C343F', int32 ), &
+        int( z'F9DCBBC7', int32 ), int( z'C70B4F1D', int32 ), &
+        int( z'8A51E04B', int32 ), int( z'CDB45931', int32 ), &
+        int( z'C89F7EC9', int32 ), int( z'D9787364', int32 ) ]
+
+contains
+
+    pure function nmh_readle32( p ) result( word )
+!! Assembles the four bytes P(1:4), stored least significant byte first,
+!! into a single INT32 value regardless of the host byte order.
+        integer(int8), intent(in) :: p(1:4)
+        integer(int32) :: word
+        integer(int8) :: native(4)
+
+        ! TRANSFER uses host byte order, so reverse the bytes on big endian hosts
+        native = merge( p, p(4:1:-1), little_endian )
+        word = transfer( native, 0_int32 )
+    end function nmh_readle32
+
+    pure function nmh_readle16( p ) result( half )
+!! Assembles the two bytes P(1:2), stored least significant byte first,
+!! into a single INT16 value regardless of the host byte order.
+        integer(int8), intent(in) :: p(1:2)
+        integer(int16) :: half
+        integer(int8) :: native(2)
+
+        ! TRANSFER uses host byte order, so swap the bytes on big endian hosts
+        native = merge( p, p(2:1:-1), little_endian )
+        half = transfer( native, 0_int16 )
+    end function nmh_readle16
+
+    pure function nmhash32_0to8( x, seed ) result( vx32 )  !! Mixes X with SEED; used for keys of 0 to 8 bytes
+        integer(int32), intent(in) :: x
+        integer(int32), intent(in) :: seed
+        integer(int32) :: vx32
+        ! base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16]
+        ! = 0.027071104091278835
+        integer(int32), parameter :: m1 = int(z'776BF593', int32)
+        integer(int32), parameter :: m2 = int(z'3FB39C65', int32)
+        integer(int32), parameter :: m3 = int(z'E9139917', int32)
+        ! VX16 views VX32 as two 16 bit halves; M116..M316 are M1..M3 split the same way
+        integer(int16) :: vx16(0:1)
+        integer(int16), parameter :: &
+            m116(0:1) = transfer( m1, 0_int16, 2 ), &
+            m216(0:1) = transfer( m2, 0_int16, 2 ), &
+            m316(0:1) = transfer( m3, 0_int16, 2 )
+        ! Three rounds of xor-shift then per-half multiply, and a closing xor-shift
+        vx32 = x
+        vx32 = ieor( vx32, ieor( ishft( vx32, -12 ), ishft( vx32, -6 ) ) )
+        vx16 = transfer( vx32, 0_int16, 2 )
+        vx16 = vx16 * m116
+        vx32 = transfer( vx16, 0_int32 )
+        vx32 = ieor( vx32, ieor( ishft( vx32, 11 ), ishft( vx32, -19 ) ) )
+        vx16 = transfer( vx32, 0_int16, 2 )
+        vx16 = vx16 * m216
+        vx32 = transfer( vx16, 0_int32 )
+        vx32 = ieor( vx32, seed )  ! the seed enters before the third round
+        vx32 = ieor( vx32, ieor( ishft( vx32, -15 ), ishft( vx32, -9 ) ) )
+        vx16 = transfer( vx32, 0_int16, 2 )
+        vx16 = vx16 * m316
+        vx32 = transfer( vx16, 0_int32 )
+        vx32 = ieor( vx32, ieor( ishft(vx32, 16), ishft(vx32, -11) ) )
+
+    end function nmhash32_0to8
+
+    pure function nmhash32_9to255( p, seed, full_avalanche ) result( hash )  !! Hashes the bytes P, 9 to 255 of them, with SEED
+        integer(int8), intent(in)  :: p(0:)
+        integer(int32), intent(in) :: seed
+        logical, intent(in)        :: full_avalanche
+        integer(int32) :: hash
+        ! XU32 and YU32 are four mixing lanes each; XU16 views one lane as 16 bit halves
+        integer(int32) :: xu32(0:3), yu32(0:3)
+        integer(int16) :: xu16(0:1)
+        integer(int16), parameter :: &
+            nmh_m1_16(0:1) = transfer( nmh_m1, 0_int16, 2 ),  &
+            nmh_m2_16(0:1) = transfer( nmh_m2, 0_int16, 2 ),  &
+            nmh_m3_16(0:1) = transfer( nmh_m3, 0_int16, 2 )
+        integer(int32) :: s1
+        integer(int64) :: length
+        integer(int32) :: length32(0:1)
+        integer(int64) :: i, j, r
+
+        ! base mixer: [f0d9649b  5 -13 29a7935d -9 11 55d35831 -20 -10 ] =
+        ! 0.93495901789135362
+        ! S1 is SEED plus the low 32 bits of the key length in bytes
+        length = size( p, kind=int64 )
+        length32 = transfer(length, 0_int32, 2)
+        if (little_endian) then
+            s1 = seed + length32(0)
+        else
+            s1 = seed + length32(1)
+        end if
+        xu32(0) = nmh_prime32_1
+        xu32(1) = nmh_prime32_2
+        xu32(2) = nmh_prime32_3
+        xu32(3) = nmh_prime32_4
+        yu32(:) = s1
+        ! FULL_AVALANCHE selects the 33 to 255 byte path, otherwise the 9 to 32 byte path
+        if (full_avalanche) then
+            ! 33 to 255 bytes
+            r = (length - 1 ) /32
+            do i=0, r-1
+                do j=0, 3
+                    xu32(j) = ieor( xu32(j), nmh_readle32( p(i*32 + j*4: ) ) )
+                    yu32(j) = ieor( yu32(j), &
+                                    nmh_readle32( p(i*32 + j*4 + 16: ) ) )
+                    xu32(j) = xu32(j) + yu32(j)
+                    xu16 = transfer( xu32(j), 0_int16, 2 )
+                    xu16 = xu16 * nmh_m1_16
+                    xu32(j) = transfer( xu16, 0_int32 )
+                    xu32(j) = ieor( xu32(j), &
+                                    ieor( ishft(xu32(j), 5), &
+                                          ishft(xu32(j), -13)) )
+                    xu16 = transfer( xu32(j), 0_int16, 2 )
+                    xu16 = xu16 * nmh_m2_16
+                    xu32(j) = transfer( xu16, 0_int32 )
+                    xu32(j) = ieor( xu32(j), yu32(j) )
+                    xu32(j) = ieor( xu32(j), &
+                                    ieor( ishft(xu32(j), 11), &
+                                          ishft(xu32(j), -9) ) )
+                    xu16 = transfer( xu32(j), 0_int16, 2 )
+                    xu16 = xu16 * nmh_m3_16
+                    xu32(j) = transfer( xu16, 0_int32 )
+                    xu32(j) = ieor( xu32(j), &
+                                    ieor( ishft(xu32(j),-10), &
+                                          ishft(xu32(j), -20) ) )
+                end do
+            end do
+            do j=0, 3
+                xu32(j) = ieor( xu32(j), &
+                                nmh_readle32( p(length - 32 + j*4: ) ) )
+                yu32(j) = ieor( yu32(j), &
+                                nmh_readle32( p(length - 16 + j*4: ) ) )
+            end do
+        else
+            ! 9 to 32 bytes
+            xu32(0) = ieor(xu32(0), nmh_readle32(p(0:)))
+            xu32(1) = ieor(xu32(1), nmh_readle32(p(ishft(ishft(length,-4),3):)))
+            xu32(2) = ieor(xu32(2), nmh_readle32(p(length-8:)))
+            xu32(3) = ieor(xu32(3), &
+                           nmh_readle32(p(length-8-ishft(ishft(length,-4),3):)))
+            yu32(0) = ieor(yu32(0), nmh_readle32(p(4:)))
+            yu32(1) = ieor(yu32(1), &
+                      nmh_readle32(p(ishft(ishft(length,-4),3)+4:)))
+            yu32(2) = ieor(yu32(2), nmh_readle32(p(length-8+4:)))
+            yu32(3) = ieor(yu32(3), &
+                           nmh_readle32(p(length - 8 - &
+                                        ishft(ishft(length,-4),3)+4:)))
+        end if
+        do j=0, 3
+            xu32(j) = xu32(j) + yu32(j)
+            yu32(j) = ieor( yu32(j), ieor(ishft(yu32(j), 17), &
+                                          ishft(yu32(j), -6) ) )
+            xu16 = transfer( xu32(j), 0_int16, 2 )
+            xu16 = xu16 * nmh_m1_16
+            xu32(j) = transfer( xu16, 0_int32 )
+            xu32(j) = ieor( xu32(j), ieor(ishft(xu32(j), 5), &
+                                          ishft(xu32(j), -13) ) )
+            xu16 = transfer( xu32(j), 0_int16, 2 )
+            xu16 = xu16 * nmh_m2_16
+            xu32(j) = transfer( xu16, 0_int32 )
+            xu32(j) = ieor( xu32(j), yu32(j) )
+            xu32(j) = ieor( xu32(j), ieor(ishft(xu32(j), 11), &
+                                          ishft(xu32(j), -9) ) )
+            xu16 = transfer( xu32(j), 0_int16, 2 )
+            xu16 = xu16 * nmh_m3_16
+            xu32(j) = transfer( xu16, 0_int32 )
+            xu32(j) = ieor( xu32(j), ieor(ishft(xu32(j), -10), &
+                                          ishft(xu32(j), -20) ) )
+        end do
+        xu32(0) = ieor( xu32(0), nmh_prime32_1 )
+        xu32(1) = ieor( xu32(1), nmh_prime32_2 )
+        xu32(2) = ieor( xu32(2), nmh_prime32_3 )
+        xu32(3) = ieor( xu32(3), nmh_prime32_4 )
+        do j=1, 3  ! collapse the four lanes into lane 0
+            xu32(0) = xu32(0) + xu32(j)
+        end do
+        xu32(0) = ieor(xu32(0), s1 + ishft(s1, -5) )
+        xu16 = transfer( xu32(0), 0_int16, 2 )
+        xu16 = xu16 * nmh_m3_16
+        xu32(0) = transfer( xu16, 0_int32 )
+        xu32(0) = ieor(xu32(0), &
+                       ieor(ishft(xu32(0), -10), ishft(xu32(0), -20) ) )
+        hash = xu32(0)
+
+    end function nmhash32_9to255
+
+    pure function nmhash32_9to32( p, seed ) result( hash )
+!! Hashes the bytes P with SEED by calling NMHASH32_9TO255 with its
+!! FULL_AVALANCHE flag off, which selects the 9 to 32 byte code path.
+        integer(int8), intent(in)  :: p(0:)
+        integer(int32), intent(in) :: seed
+        integer(int32) :: hash
+        hash = nmhash32_9to255( p, seed, full_avalanche=.false. )
+    end function nmhash32_9to32
+
+    pure function nmhash32_33to255( p, seed ) result( hash )
+!! Hashes the bytes P with SEED by calling NMHASH32_9TO255 with its
+!! FULL_AVALANCHE flag on, which selects the 33 to 255 byte code path.
+        integer(int8), intent(in)  :: p(0:)
+        integer(int32), intent(in) :: seed
+        integer(int32) :: hash
+        hash = nmhash32_9to255( p, seed, full_avalanche=.true. )
+    end function nmhash32_33to255
+
+    pure subroutine nmhash32_long_round( accx, accy, p )  !! Mixes 256 bytes of P into the accumulators ACCX and ACCY
+        integer(int32), intent(inout) :: accx(0:)
+        integer(int32), intent(inout) :: accy(0:)
+        integer(int8), intent(in)     :: p(0:)
+        ! NBGROUPS is the number of accumulator lanes updated per round
+        integer(int64), parameter :: nbgroups = init_size
+        integer(int64) :: i
+        integer(int16) :: dummy1(0:1)
+        integer(int16) :: dummy2(0:1)
+        ! Lane I takes word I of P into ACCX and word I+NBGROUPS of P into ACCY
+        do i = 0, nbgroups-1
+            accx(i) = ieor( accx(i), nmh_readle32( p(i*4:) ) )
+            accy(i) = ieor( accy(i), nmh_readle32( p(i*4+nbgroups*4:) ) )
+            accx(i) = accx(i) + accy(i)
+            accy(i) = ieor( accy(i), ishft(accx(i),  -1) )
+            dummy1 = transfer( accx(i), 0_int16, 2 )
+            dummy2 = transfer( nmh_m1_v(i), 0_int16, 2 )
+            dummy1 = dummy1 * dummy2
+            accx(i) = transfer( dummy1, 0_int32 )
+            accx(i) = ieor( accx(i), ieor( ishft(accx(i), 5), &
+                                           ishft(accx(i),-13) ) )
+            dummy1 = transfer( accx(i), 0_int16, 2 )
+            dummy2 = transfer( nmh_m2_v(i), 0_int16, 2 )
+            dummy1 = dummy1 * dummy2
+            accx(i) = transfer( dummy1, 0_int32 )
+            accx(i) = ieor( accx(i), accy(i) )
+            accx(i) = ieor( accx(i), ieor( ishft(accx(i), 11), &
+                                           ishft(accx(i),-9) ) )
+            dummy1 = transfer( accx(i), 0_int16, 2 )
+            dummy2 = transfer( nmh_m3_v(i), 0_int16, 2 )
+            dummy1 = dummy1 * dummy2
+            accx(i) = transfer( dummy1, 0_int32 )
+            accx(i) = ieor( accx(i), ieor( ishft(accx(i),-10), &
+                                           ishft(accx(i),-20) ) )
+        end do
+
+    end subroutine nmhash32_long_round
+
+    pure function nmhash32_long( p, seed ) result( sum )  !! Accumulating stage used for keys of 256 or more bytes
+        integer(int32) :: sum
+        integer(int8), intent(in) :: p(0:)
+        integer(int32), intent(in) :: seed
+
+        integer(int32) :: accx(0:size(nmh_acc_init)-1)
+        integer(int32) :: accy(0:size(nmh_acc_init)-1)
+        integer(int64) :: nbrounds
+        integer(int64) :: len
+        integer(int32) :: len32(0:1)
+        integer(int64) :: i
+        ! One round consumes 8*SIZE(ACCX) bytes of P (4 bytes per lane, two accumulators)
+        len  = size( p, kind=int64 )
+        nbrounds = (len-1) / ( 4*size(accx, kind=int64) * 2 )
+        sum = 0
+
+!  Initialize the accumulators: ACCX from the fixed secrets, ACCY from SEED
+        do i=0_int64, size(nmh_acc_init, kind=int64)-1
+            accx(i) = nmh_acc_init(i)
+            accy(i) = seed
+        end do
+
+        ! Mix in NBROUNDS whole blocks, then one block ending at the last byte of P
+        do i=0_int64, nbrounds-1
+            call nmhash32_long_round( accx, accy, &
+                                      p(i*8*size(accx, kind=int64):) )
+        end do
+        call nmhash32_long_round( accx, accy, &
+                                  p(len-8*size(accx, kind=int64):) )
+
+        ! merge acc
+        do i=0, size( accx, kind=int64 )-1
+            accx(i) = ieor( accx(i), nmh_acc_init(i) )
+            sum = sum + accx(i)
+        end do
+        ! Fold in the key length: add its high 32 bits, then XOR its low 32 bits
+        len32 = transfer(len, 0_int32, 2)
+        if ( little_endian ) then
+            sum = sum + len32(1)
+            sum = ieor(sum, len32(0))
+        else
+            sum = sum + len32(0)
+            sum = ieor(sum, len32(1))
+        end if
+
+    end function nmhash32_long
+
+    pure function nmhash32_avalanche32( x ) result( u32 )  !! Final mixer applied to the result of NMHASH32_LONG
+        integer(int32) :: u32
+        integer(int32), intent(in) :: x
+        ! U16 views U32 as two 16 bit halves; M1_16 and M2_16 are M1 and M2 split the same way
+        integer(int16) :: u16(0:1)
+        integer(int32), parameter:: m1 = int(z'CCE5196D', int32)
+        integer(int32), parameter:: m2 = int(z'464BE229', int32)
+        integer(int16), parameter:: m1_16(0:1) = transfer(m1, 0_int16, 2)
+        integer(int16), parameter:: m2_16(0:1) = transfer(m2, 0_int16, 2)
+        ! [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733
+        ! Two rounds of xor-shift then per-half multiply, and a closing xor-shift
+        u32 = x
+        u32 = ieor( u32, ieor( ishft( u32, -8 ), ishft( u32, -21 ) ) )
+        u16 = transfer( u32, 0_int16, 2 )
+        u16 = u16 * m1_16
+        u32 = transfer( u16, 0_int32 )
+        u32 = ieor( u32, ieor( ishft( u32, 12 ), ishft( u32, -7 ) ) )
+        u16 = transfer( u32, 0_int16, 2 )
+        u16 = u16 * m2_16
+        u32 = transfer( u16, 0_int32 )
+        u32 = ieor( u32, ieor( ishft( u32, -8 ), ishft( u32, -21 ) ) )
+
+    end function nmhash32_avalanche32
+
+    pure module function int8_nmhash32( key, seed ) result( hash )
+!! NMHASH32 hash function for rank 1 array keys of kind INT8
+        integer(int32) :: hash
+        integer(int8), intent(in) :: key(0:)
+        integer(int32), intent(in) :: seed
+        integer(int64) :: len
+        integer(int32) :: u32
+        integer(int16) :: u16(0:1)
+        integer(int32) :: x, y
+        integer(int32) :: new_seed
+        ! Dispatch on the key length in bytes: 0-4, 5-8, 9-32, 33-255, 256 or more
+        len = size( key, kind=int64 )
+        if ( len <= 32 ) then
+            if ( len > 8 ) then
+                hash = nmhash32_9to32( key, seed )
+                return
+            else if ( len > 4 ) then  ! 5 to 8 bytes: first and last four bytes, which may overlap
+                x = nmh_readle32(key)
+                y = ieor( nmh_readle32(key(len-4:)), nmh_prime32_4 + 2 + seed )
+                x = x + y
+                x = ieor( x, ishft(x, len + 7 ) )
+                hash = nmhash32_0to8( x, ishftc(y, 5) )
+                return
+            else
+                select case(len)  ! 0 to 4 bytes: zero extend the key into U32, adjust the seed by length
+                case(0)
+                    new_seed = seed + nmh_prime32_2
+                    u32 = 0
+                case(1)
+                    new_seed = seed + nmh_prime32_2 + ishft(1_int32, 24) + &
+                               2_int32
+                    if ( little_endian ) then
+                        u32 = transfer( [key(0), 0_int8, 0_int8, 0_int8], &
+                                        0_int32 )
+                    else
+                        u32 = transfer( [0_int8, 0_int8, 0_int8, key(0)], &
+                                        0_int32 )
+                    end if
+                case(2)
+                    new_seed = seed + nmh_prime32_2 + ishft(2_int32, 24) + &
+                               4_int32
+                    if (little_endian) then
+                        u32 = transfer( [nmh_readle16(key), 0_int16], 0_int32 )
+                    else
+                        u32 = transfer( [0_int16, nmh_readle16(key)], 0_int32 )
+                    end if
+                case(3)
+                    new_seed = seed + nmh_prime32_2 + ishft(3_int32, 24) + &
+                               6_int32
+                    if ( little_endian ) then
+                        u16(1) = transfer( [key(2), 0_int8], 0_int16 )
+                        u16(0) = nmh_readle16( key )
+                    else
+                        u16(0) = transfer( [0_int8, key(2)], 0_int16 )
+                        u16(1) = nmh_readle16( key )
+                    end if
+                    u32 = transfer( u16, 0_int32 )
+                case(4)
+                    new_seed = seed + nmh_prime32_3
+                    u32 = nmh_readle32(key)
+                case default  ! not reachable: LEN is between 0 and 4 in this branch
+                    hash = 0
+                    return
+                end select
+                hash = nmhash32_0to8(u32+new_seed, ishftc(new_seed, 5) )
+                return
+            end if
+        else if ( len < 256_int64 ) then
+            hash = nmhash32_33to255( key, seed )
+            return
+        else
+            hash = nmhash32_avalanche32( nmhash32_long(key, seed ))
+            return
+        end if
+
+    end function int8_nmhash32
+
+    pure function nmhash32x_0to4( x, seed ) result( mixed )
+        ! Mixer applied by INT8_NMHASH32X to keys of 0 to 4 bytes: X holds
+        ! the packed key bytes and SEED the seed adjusted for the key size.
+        ! Multiplier/shift set:
+        ! [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509
+        integer(int32), intent(in) :: x, seed
+        integer(int32) :: mixed
+        integer(int32), parameter :: mul_a = int(z'BDAB1EA9', int32), &
+                                     mul_b = int(z'A7896A1B', int32), &
+                                     mul_c = int(z'83796A2D', int32)
+
+        ! The integer products below rely on wrap around overflow.
+        mixed = ieor( x, seed ) * mul_a + ishftc( seed, 31 )
+        mixed = ieor( mixed, ishft( mixed, -18 ) ) * mul_b
+        mixed = ieor( mixed, ishft( mixed, -12 ) ) * mul_c
+        mixed = ieor( mixed, ishft( mixed, -16 ) )
+
+    end function nmhash32x_0to4
+
+    pure function nmhash32x_5to8( p, seed ) result( hash )
+        ! NMHASH32X mixer for the keys that INT8_NMHASH32X passes on with
+        ! sizes of 5 to 8 bytes. NMH_READLE32 is applied at the start of P
+        ! and at its last four bytes.
+        ! mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
+        integer(int8), intent(in) :: p(0:)
+        integer(int32), intent(in) :: seed
+        integer(int32) :: hash
+
+        integer(int32), parameter :: mul_1 = int(z'11049A7D', int32), &
+                                     mul_2 = int(z'BCCCDC7B', int32), &
+                                     mul_3 = int(z'065E9DAD', int32)
+        integer(int64) :: nbytes
+        integer(int32) :: tail
+
+        nbytes = size(p, kind=int64)
+        tail = ieor( nmh_readle32( p(nbytes-4:) ), seed )
+        hash = ieor( nmh_readle32(p), nmh_prime32_3 ) + tail
+        hash = ieor( hash, ishft(hash, -nbytes) ) * mul_1
+        hash = ieor( hash, ishft(hash, -23) ) * mul_2
+        hash = ieor( hash, ishftc(tail, 3) )
+        hash = ieor( hash, ishft(hash, -12) ) * mul_3
+        hash = ieor( hash, ishft(hash, -12) )
+
+    end function nmhash32x_5to8
+
+    pure function nmhash32x_9to255( p, seed ) result( x )
+        ! NMHASH32X mixer that INT8_NMHASH32X applies to keys of 9 to 255
+        ! bytes; the integer products rely on wrap around overflow.
+        ! base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
+        ! tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322
+        integer(int8), intent(in) :: p(0:)    ! key bytes, indexed from 0
+        integer(int32), intent(in) :: seed    ! seed mixed into the hash
+        integer(int32) :: x                   ! first lane and final hash
+
+        integer(int64) :: len                 ! number of bytes in P
+        integer(int32) :: len32(0:1)          ! LEN viewed as two 32 bit words
+        integer(int8)  :: len8(0:7)           ! LEN viewed as eight bytes
+        integer(int32) :: len_base            ! low order byte of LEN
+        integer(int32) :: y                   ! companion word of lane X
+        integer(int32) :: a, b                ! second lane and its companion
+        integer(int64) :: i, r                ! round index and round count
+
+        len = size(p, kind=int64)
+        len8 = transfer(len, 0_int8, 8)
+        len32 = transfer(len, 0_int32, 2)
+        if (little_endian) then               ! zero extend the low byte of LEN
+            len_base = transfer( [len8(0), 0_int8, 0_int8, 0_int8], 0_int32 )
+        else
+            len_base = transfer( [0_int8, 0_int8, 0_int8, len8(7)], 0_int32 )
+        end if
+        x = nmh_prime32_3
+        y = seed
+        a = nmh_prime32_4
+        b = seed
+        r = (len - 1)/16                      ! full rounds; 1 to 16 bytes are left
+        ! Per round: reads at offsets 0 and 4 feed (X, Y), those at 8 and 12 feed (A, B)
+        do i=0, r-1
+            x = ieor(x, nmh_readle32( p(i*16 + 0:) ) )
+            y = ieor(y, nmh_readle32( p(i*16 + 4:) ) )
+            x = ieor(x, y)
+            x = x * int(z'11049A7D', int32)
+            x = ieor(x, ishft(x, -23) )
+            x = x * int(z'BCCCDC7B', int32)
+            y = ishftc(y, 4)
+            x = ieor(x, y)
+            x = ieor(x, ishft(x, -12) )
+            x = x * int(z'065E9DAD', int32)
+            x = ieor(x, ishft(x, -12) )
+
+            a = ieor(a, nmh_readle32(p(i*16 + 8:)))
+            b = ieor(b, nmh_readle32(p(i*16 + 12:)))
+            a = ieor(a, b)
+            a = a * int(z'11049A7D', int32)
+            a = ieor(a, ishft(a, -23) )
+            a = a * int(z'BCCCDC7B', int32)
+            b = ishftc(b, 3)
+            a = ieor(a, b)
+            a = ieor(a, ishft(a, -12) )
+            a = a * int(z'065E9DAD', int32)
+            a = ieor(a, ishft(a, -12) )
+        end do
+
+        if ( iand(len_base-1_int32, 8_int32) /= 0 ) then        ! 9 to 16 bytes left
+            if ( iand(len_base-1_int32, 4_int32) /= 0 ) then    ! 13 to 16 bytes left
+                a = ieor( a, nmh_readle32( p(r*16 + 0:) ) )
+                b = ieor( b, nmh_readle32( p(r*16 + 4:) ) )
+                a = ieor(a, b)
+                a = a * int(z'11049A7D', int32)
+                a = ieor(a, ishft(a, -23) )
+                a = a * int(z'BCCCDC7B', int32)
+                a = ieor(a, ishftc(b, 4))
+                a = ieor(a, ishft(a, -12))
+                a = a * int(z'065E9DAD', int32)
+            else                                                ! 9 to 12 bytes left
+                a = ieor( a, nmh_readle32( p(r*16:) ) + b )
+                a = ieor( a, ishft(a, -16) )
+                a = a * int(z'A52FB2CD', int32)
+                a = ieor( a, ishft(a, -15) )
+                a = a * int(z'551E4D49', int32)
+            end if
+            x = ieor( x, nmh_readle32( p(len - 8:) ) )
+            y = ieor( y, nmh_readle32( p(len - 4:) ) )
+            x = ieor( x, y )
+            x = x * int(z'11049A7D', int32)
+            x = ieor( x, ishft(x, -23) )
+            x = x * int(z'BCCCDC7B', int32)
+            x = ieor( x, ishftc(y, 3) )
+            x = ieor( x, ishft(x, -12) )
+            x = x * int(z'065E9DAD', int32)
+        else                                                    ! 1 to 8 bytes left
+            if ( iand(len_base-1_int32, 4_int32) /= 0) then     ! 5 to 8 bytes left
+                a = ieor(a, nmh_readle32(p( r * 16:) ) + b )
+                a = ieor( a, ishft(a,-16) )
+                a = a * int(z'A52FB2CD', int32)
+                a = ieor( a, ishft(a,-15) )
+                a = a * int(z'551E4D49', int32)
+            end if
+            x = ieor( x, nmh_readle32(p( len - 4:) ) + y )
+            x = ieor( x, ishft(x,-16) )
+            x = x * int(z'A52FB2CD', int32)
+            x = ieor( x, ishft(x,-15) )
+            x = x * int(z'551E4D49', int32)
+        end if
+
+        if ( little_endian ) then             ! fold in the low order 32 bits of LEN
+            x = ieor(x, len32(0) )
+        else
+            x = ieor(x, len32(1) )
+        end if
+        x = ieor(x, ishftc(a, 27)) ! rotate one lane to pass Diff test
+        x = ieor(x, ishft(x,-14))
+        x = x * int(z'141CC535', int32 )
+
+    end function nmhash32x_9to255
+
+    pure function nmhash32x_avalanche32( x ) result( mixed )
+! Avalanche step that INT8_NMHASH32X applies to the NMHASH32_LONG result.
+! Mixer with two multiplications from skeeto/hash-prospector:
+! [15 d168aaad 15 af723597 15] = 0.15983776156606694
+        integer(int32), intent(in) :: x
+        integer(int32) :: mixed
+        integer(int32), parameter :: mul_a = int( z'D168AAAD', int32 ), &
+                                     mul_b = int( z'AF723597', int32 )
+
+        mixed = ieor( x, ishft( x, -15 ) ) * mul_a
+        mixed = ieor( mixed, ishft( mixed, -15 ) ) * mul_b
+        mixed = ieor( mixed, ishft( mixed, -15 ) )
+
+    end function nmhash32x_avalanche32
+
+    pure module function int8_nmhash32x( key, seed ) result(hash)
+!! NMHASH32X hash of the rank 1 INT8 array KEY with SEED, dispatching on the size of KEY
+        integer(int32) :: hash                     ! resulting hash code
+        integer(int8), intent(in) :: key(0:)       ! bytes to hash, indexed from 0
+        integer(int32), intent(in) :: seed         ! seed mixed into the hash
+
+        integer(int64) :: len                      ! number of bytes in KEY
+        integer(int32) :: seed2                    ! SEED adjusted for LEN
+        integer(int32) :: u32                      ! 0 to 4 key bytes packed in a word
+        integer(int16) :: u16(0:1)                 ! halves of U32 for 3 byte keys
+
+        len = size( key, kind=int64 )
+        if ( len <= 8 ) then
+            if ( len > 4 ) then                    ! 5 to 8 bytes
+                hash = nmhash32x_5to8( key, seed )
+                return
+            else ! 0 to 4 bytes: pack them into U32
+                select case (len)
+                case(0)
+                    seed2 = seed + nmh_prime32_2
+                    u32 = 0
+                case(1)
+                    seed2 = seed + nmh_prime32_2 + ishft(1_int32, 24) + &
+                        ishft(1_int32, 1)
+                    if (little_endian) then        ! KEY(0) goes in the low order byte
+                        u32 = transfer( [key(0), 0_int8, 0_int8, 0_int8], &
+                                        0_int32 )
+                    else
+                        u32 = transfer( [0_int8, 0_int8, 0_int8, key(0)], &
+                                        0_int32 )
+                    end if
+                case(2)
+                    seed2 = seed + nmh_prime32_2 + ishft(2_int32, 24) + &
+                        ishft(2_int32, 1)
+                    if (little_endian) then        ! NMH_READLE16 value goes in the low order half
+                        u32 = transfer( [nmh_readle16(key), 0_int16], 0_int32 )
+                    else
+                        u32 = transfer( [0_int16, nmh_readle16(key)], 0_int32 )
+                    end if
+                case(3)
+                    seed2 = seed + nmh_prime32_2 + ishft(3_int32, 24) + &
+                        ishft(3_int32, 1)
+                    if (little_endian ) then       ! KEY(2) goes in the high order half of U32
+                        u16(1) = transfer( [ key(2), 0_int8 ], 0_int16 )
+                        u16(0) = nmh_readle16(key)
+                    else
+                        u16(0) = transfer( [ 0_int8, key(2) ], 0_int16 )
+                        u16(1) = nmh_readle16(key)
+                    end if
+                    u32 = transfer( u16, 0_int32 )
+                case(4)
+                    seed2 = seed + nmh_prime32_1
+                    u32 = nmh_readle32(key)
+                case default                       ! not reached: LEN is 0 to 4 in this branch
+                    hash = 0
+                    return
+                end select
+                hash = nmhash32x_0to4(u32, seed2)
+                return
+            end if
+        end if
+        if (len < 256) then                        ! 9 to 255 bytes
+            hash = nmhash32x_9to255(key, seed)
+            return
+        end if
+        hash = nmhash32x_avalanche32(nmhash32_long(key, seed))  ! 256 bytes or more
+
+    end function int8_nmhash32x
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_nmhash32( key, seed ) result(hash_code)
+!! NMHASH32 hash of a rank 1 ${k1}$ KEY, hashed via its INT8 image
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int32), intent(in)  :: seed
+        integer(int32)              :: hash_code
+        integer(int64)              :: nbytes
+
+        nbytes = bytes_${k1}$ * size( key, kind=int64 )
+        hash_code = int8_nmhash32( transfer( key, 0_int8, nbytes ), seed )
+    end function ${k1}$_nmhash32
+
+#:endfor
+
+    pure module function character_nmhash32( key, seed ) result(hash_code)
+!! NMHASH32 hash of a default character KEY, hashed via its INT8 image
+        character(*), intent(in)   :: key
+        integer(int32), intent(in) :: seed
+        integer(int32)             :: hash_code
+        integer(int64)             :: nbytes
+
+        nbytes = bytes_char * len( key, kind=int64 )
+        hash_code = int8_nmhash32( transfer( key, 0_int8, nbytes ), seed )
+    end function character_nmhash32
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_nmhash32x( key, seed ) result(hash_code)
+!! NMHASH32X hash of a rank 1 ${k1}$ KEY, hashed via its INT8 image
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int32), intent(in)  :: seed
+        integer(int32)              :: hash_code
+        integer(int64)              :: nbytes
+
+        nbytes = bytes_${k1}$ * size( key, kind=int64 )
+        hash_code = int8_nmhash32x( transfer( key, 0_int8, nbytes ), seed )
+    end function ${k1}$_nmhash32x
+
+#:endfor
+
+    pure module function character_nmhash32x( key, seed ) result(hash_code)
+!! NMHASH32X hash of a default character KEY, hashed via its INT8 image
+        character(*), intent(in)   :: key
+        integer(int32), intent(in) :: seed
+        integer(int32)             :: hash_code
+        integer(int64)             :: nbytes
+
+        nbytes = bytes_char * len( key, kind=int64 )
+        hash_code = int8_nmhash32x( transfer( key, 0_int8, nbytes ), seed )
+    end function character_nmhash32x
+
+    module subroutine new_nmhash32_seed( seed )
+! Replaces SEED for NMHASH32 by a random INT32 value that differs from its
+! input value, calling RANDOM_NUMBER until such a value is found
+        integer(int32), intent(inout) :: seed
+
+        integer(int64), parameter :: span = 2_int64**32, bias = 2_int64**31
+        integer(int32) :: previous
+        real(dp) :: draw
+
+        previous = seed
+        do while ( seed == previous )
+            call random_number( draw )
+            seed = int( floor( draw * span, int64 ) - bias, int32 )
+        end do
+
+    end subroutine new_nmhash32_seed
+
+    module subroutine new_nmhash32x_seed( seed )
+! Replaces SEED for NMHASH32X by a random INT32 value that differs from its
+! input value, calling RANDOM_NUMBER until such a value is found
+        integer(int32), intent(inout) :: seed
+
+        integer(int64), parameter :: span = 2_int64**32, bias = 2_int64**31
+        integer(int32) :: previous
+        real(dp) :: draw
+
+        previous = seed
+        do while ( seed == previous )
+            call random_number( draw )
+            seed = int( floor( draw * span, int64 ) - bias, int32 )
+        end do
+
+    end subroutine new_nmhash32x_seed
+
+end submodule stdlib_32_bit_nmhashes
diff --git a/src/stdlib_32_bit_water_hashes.fypp b/src/stdlib_32_bit_water_hashes.fypp
new file mode 100644
index 000000000..33181ab3f
--- /dev/null
+++ b/src/stdlib_32_bit_water_hashes.fypp
@@ -0,0 +1,282 @@
+!!------------------------------------------------------------------------------
+!! `WATER_HASH` is a translation to Fortran 2008 of the `waterhash` algorithm
+!! of Tommy Ettinger. Tommy Ettinger's original C++ code, `waterhash.h`, is
+!! available at the URL: https://github.com/tommyettinger/waterhash under the
+!! `unlicense`, https://github.com/tommyettinger/waterhash/blob/master/LICENSE.
+!! `waterhash` is a variant on Wang Yi's `wyhash`, with 32 bit output,
+!! using at most 64 bit arithmetic. `wyhash` is available at the URL:
+!! `https://github.com/wangyi-fudan/wyhash` also under the unlicense:
+!! `https://github.com/wangyi-fudan/wyhash/blob/master/LICENSE`.
+!! Original Author: Wang Yi <godspeed_china@yeah.net>
+!! Waterhash Variant Author: Tommy Ettinger <tommy.ettinger@gmail.com>
+!!
+!! The `unlicense` reads as follows:
+!!   This is free and unencumbered software released into the public domain.
+!!
+!!   Anyone is free to copy, modify, publish, use, compile, sell, or
+!!   distribute this software, either in source code form or as a compiled
+!!   binary, for any purpose, commercial or non-commercial, and by any
+!!   means.
+!!
+!!   In jurisdictions that recognize copyright laws, the author or authors
+!!   of this software dedicate any and all copyright interest in the
+!!   software to the public domain. We make this dedication for the benefit
+!!   of the public at large and to the detriment of our heirs and
+!!   successors. We intend this dedication to be an overt act of
+!!   relinquishment in perpetuity of all present and future rights to this
+!!   software under copyright law.
+!!
+!!   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+!!   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+!!   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+!!   IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+!!   OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+!!   ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+!!   OTHER DEALINGS IN THE SOFTWARE.
+!!
+!!   For more information, please refer to <http://unlicense.org>
+!!
+!! `WATER_HASH` is distributed as part of the `stdlib_32_bit_hash_functions.f90`
+!! module and its `stdlib_32_bit_water_hashes.f90` submodule with the Fortran
+!! Standard Library at URL: https://github.com/fortran-lang/stdlib.
+!! The Fortran Standard Library, including this code, is distributed under the
+!! MIT License as described in the `LICENSE` file distributed with the library.
+!! `WATER_HASH` differs from `waterhash.h` not only in its use of Fortran,
+!! but also in its use of signed two's complement arithmetic in contrast to
+!! the unsigned arithmetic of Ettinger and Wang Yi, and in making some of the
+!! uses of `TRANSFER` endian dependent, in an attempt to make the quality of
+!! the hash endian independent. The use of signed arithmetic may change with
+!! the planned introduction of the unsigned BITS datatype in what is currently
+!! known as Fortran 202X.
+!!
+!! To be useful this code must be processed by a processor that implements two
+!! Fortran 2008 extensions to Fortran 2003: submodules, and 64 bit (`INT64`)
+!! integers. The processor must also use two's complement integers
+!! (all Fortran 95+ processors use two's complement arithmetic) with
+!! wrap around overflow at runtime and for BOZ constants. The latest releases
+!! of the following processors are known to implement the required Fortran
+!! 2008 extensions and default to runtime wrap around overflow: FLANG,
+!! gfortran, ifort, and NAG Fortran. Older versions of gfortran will require
+!! the compiler flag, `-fno-range-check`, to ensure wrap around semantics
+!! for BOZ constants, and only versions of the NAG compiler starting with
+!! version 17, have implemented submodules. The latest releases of Cray
+!! Fortran and IBM Fortran are known to implement the Fortran 2008 extensions,
+!! but whether they also implement wrap around overflow is unknown.
+!!
+!! This implementation has only been tested on little endian processors. It
+!! will generate different hashes on big endian processors, but they are
+!! believed to be of comparable quality to those generated for little endian
+!! processors.
+!!
+!! No version of this hash is suitable as a cryptographic hash.
+!!------------------------------------------------------------------------------
+
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int16", "int32", "int64"]
+
+submodule(stdlib_32_bit_hash_functions) stdlib_32_bit_water_hashes
+    implicit none
+
+contains
+
+    pure module function int8_water_hash( key, seed ) result(hash_code)
+!! WATER_HASH hash function for rank 1 array keys of kind INT8
+        integer(int32)             :: hash_code   ! resulting hash code
+        integer(int8), intent(in)  :: key(0:)     ! bytes to hash, from index 0
+        integer(int64), intent(in) :: seed        ! initial value of the state H
+        integer(int32) :: dummy(2)                ! H viewed as two 32 bit words
+        integer(int64) :: h                       ! running 64 bit hash state
+        integer(int64) :: i                       ! offset of the current block
+        integer(int64) :: len                     ! number of bytes in KEY
+        integer(int64), parameter ::                &
+            waterp0 = int(z'a0761d65', kind=int64), &
+            waterp1 = int(z'e7037ed1', kind=int64), &
+            waterp2 = int(z'8ebc6af1', kind=int64), &
+            waterp3 = int(z'589965cd', kind=int64), &
+            waterp4 = int(z'1d8e4e27', kind=int64), &
+            waterp5 = int(z'eb44accb', kind=int64)
+
+        len = size(key, kind=int64)
+        h = seed
+        do i = 0_int64, len-16, 16                ! full 16 byte blocks
+            h = watermum(watermum(ieor(waterr32(key(i:)),waterp1),        &
+                                  ieor(waterr32(key(i+4:)),waterp2)) + h, &
+                         watermum(ieor(waterr32(key(i+8:)),waterp3),      &
+                                  ieor(waterr32(key(i+12:)),waterp4)))
+        end do
+        h = h + waterp5
+        ! After the loop I is the offset of the remaining LEN mod 16 bytes
+        select case( iand(len, 15_int64) )
+        case(1)
+            h = watermum(ieor(waterp2, h),               &
+                         ieor(waterr08(key(i:)), waterp1))
+        case(2)
+            h = watermum(ieor(waterp3, h),               &
+                         ieor(waterr16(key(i:)), waterp4))
+        case(3)
+            h = watermum(ieor(waterr16(key(i:)), h),        &
+                         ieor(waterr08(key(i+2:)), waterp2))
+        case(4)
+            h = watermum(ieor(waterr16(key(i:)), h),        &
+                         ieor(waterr16(key(i+2:)), waterp3))
+        case(5)
+            h = watermum(ieor(waterr32(key(i:)), h),        &
+                         ieor(waterr08(key(i+4:)), waterp1))
+        case(6)
+            h = watermum(ieor(waterr32(key(i:)), h),        &
+                         ieor(waterr16(key(i+4:)), waterp1))
+        case(7)
+            h = watermum(ieor(waterr32(key(i:)), h),             &
+                         ieor(ior(ishft(waterr16(key(i+4:)), 8), &
+                                  waterr08(key(i+6:))), waterp1))
+        case(8)
+            h = watermum(ieor(waterr32(key(i:)), h),        &
+                         ieor(waterr32(key(i+4:)), waterp0))
+        case(9)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),          &
+                              ieor(waterr32(key(i+4:)), waterp2)), &
+                     watermum(ieor(h, waterp4),                    &
+                              ieor(waterr08(key(i+8:)), waterp3)))
+        case(10)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),            &
+                              ieor(waterr32(key(i+4:)), waterp2)),   &
+                     watermum(h, ieor(waterr16(key(i+8:)), waterp3)))
+        case(11)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),            &
+                              ieor(waterr32(key(i+4:)), waterp2)),   &
+                     watermum(h,                                     &
+                              ieor(ior(ishft(waterr16(key(i+8:)),8), &
+                                       waterr08(key(i+10:))),        &
+                                   waterp3)))
+        case(12)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),          &
+                              ieor(waterr32(key(i+4:)), waterp2)), &
+                     watermum(ieor(h, waterr32(key(i+8:))),        &
+                                      waterp4))
+        case(13)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),            &
+                              ieor(waterr32(key(i+4:)), waterp2)),   &
+                     watermum(ieor(h, waterr32(key(i+8:))),          &
+                              ieor(waterr08(key(i+12:)), waterp4)))
+        case(14)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),            &
+                              ieor(waterr32(key(i+4:)), waterp2)),   &
+                     watermum(ieor(h, waterr32(key(i+8:))),          &
+                              ieor(waterr16(key(i+12:)), waterp4)))
+        case(15)
+            h = ieor(watermum(ieor(waterr32(key(i:)), h),             &
+                              ieor(waterr32(key(i+4:)), waterp2)),    &
+                     watermum(ieor(h, waterr32(key(i+8:))),           &
+                              ieor(ior(ishft(waterr16(key(i+12:)),8), &
+                                       waterr08(key(i+14:))),         &
+                                   waterp4)))
+        end select
+        ! Final mix with LEN; the low order 32 bits of H become the hash code
+        h = ieor( h, ishft(h,16) ) * ieor( len, waterp0 )
+        h = h - ishft( h, -32 )
+        dummy(1:2) = transfer(h, dummy, 2)
+        if (little_endian) then                   ! pick the low order word of H
+            hash_code = dummy(1)
+        else
+            hash_code = dummy(2)
+        end if
+
+    contains
+
+        pure function watermum( a, b ) result(r)
+            integer(int64)             :: r
+            integer(int64), intent(in) :: a, b
+            ! Product of A and B less its logical right shift by 32 bits
+            r = a * b
+            r = r - ishft(r, -32)
+
+        end function watermum
+
+        pure function waterr08( p ) result(v)
+            integer(int64)            :: v
+            integer(int8), intent(in) :: p(:)
+            ! P(1) zero extended to INT64
+            if (little_endian) then
+                v = transfer( [ p(1), 0_int8, 0_int8, 0_int8,       &
+                                0_int8, 0_int8, 0_int8, 0_int8 ], v )
+            else
+                v = transfer( [ 0_int8, 0_int8, 0_int8, 0_int8,   &
+                                0_int8, 0_int8, 0_int8, p(1) ], v )
+            end if
+
+        end function waterr08
+
+        pure function waterr16( p ) result(v)
+            integer(int64)            :: v
+            integer(int8), intent(in) :: p(:)
+            ! P(1:2) as a little endian value zero extended to INT64
+            if (little_endian) then
+                v = transfer( [ p(1), p(2), 0_int8, 0_int8,         &
+                                0_int8, 0_int8, 0_int8, 0_int8 ], v )
+            else
+                v = transfer( [ 0_int8, 0_int8, 0_int8, 0_int8,  &
+                                0_int8, 0_int8, p(2), p(1) ], v )
+            end if
+
+        end function waterr16
+
+        pure function waterr32( p ) result(v)
+            integer(int64)            :: v
+            integer(int8), intent(in) :: p(:)
+            ! P(1:4) as a little endian value zero extended to INT64
+            if (little_endian) then
+                v = transfer( [ p(1), p(2), p(3), p(4),             &
+                                0_int8, 0_int8, 0_int8, 0_int8 ], v )
+            else
+                v = transfer( [ 0_int8, 0_int8, 0_int8, 0_int8, &
+                                p(4), p(3), p(2), p(1) ], v )
+            end if
+
+        end function waterr32
+
+    end function int8_water_hash
+
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_water_hash( key, seed ) result(hash_code)
+!! WATER_HASH of a rank 1 ${k1}$ KEY, hashed via its INT8 image
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int64), intent(in)  :: seed
+        integer(int_hash)           :: hash_code
+        integer(int64)              :: nbytes
+        nbytes = bytes_${k1}$ * size( key, kind=int64 )
+        hash_code = int8_water_hash( transfer( key, 0_int8, nbytes ), seed )
+    end function ${k1}$_water_hash
+
+#:endfor
+
+    pure module function character_water_hash( key, seed ) result(hash_code)
+!! WATER_HASH of a default character KEY, hashed via its INT8 image
+        character(*), intent(in)   :: key
+        integer(int64), intent(in) :: seed
+        integer(int_hash)          :: hash_code
+        integer(int64)             :: nbytes
+        nbytes = bytes_char * len( key, kind=int64 )
+        hash_code = int8_water_hash( transfer( key, 0_int8, nbytes ), seed )
+    end function character_water_hash
+
+    module subroutine new_water_hash_seed( seed )
+! Replaces SEED by a random INT64 value that differs from its input value.
+! Each candidate is assembled from two random INT32 words, and the draw
+! is repeated until the candidate differs from the old seed.
+        integer(int64), intent(inout) :: seed
+
+        integer(int64), parameter :: span = 2_int64**32, bias = 2_int64**31
+        integer(int64) :: previous
+        integer(int32) :: words(2)
+        real(dp)       :: draws(2)
+
+        previous = seed
+        do while ( seed == previous )
+            call random_number( draws )
+            words = int( floor( draws * span, int64 ) - bias, int32 )
+            seed = transfer( words, seed )
+        end do
+    end subroutine new_water_hash_seed
+
+end submodule stdlib_32_bit_water_hashes
diff --git a/src/stdlib_64_bit_fnv_hashes.fypp b/src/stdlib_64_bit_fnv_hashes.fypp
new file mode 100644
index 000000000..1eefdb886
--- /dev/null
+++ b/src/stdlib_64_bit_fnv_hashes.fypp
@@ -0,0 +1,125 @@
+!!------------------------------------------------------------------------------
+!! `FNV_1_HASH` and  `FNV_1A_HASH` are translations to Fortran 2008 of the
+!! `FNV-1` and `FNV-1a` hash functions of Glenn Fowler, Landon Curt Noll,
+!! and Phong Vo, that have been released into the public domain. Permission
+!! has been granted, by Landon Curt Noll, for the use of these algorithms
+!! in the Fortran Standard Library. A description of these functions is
+!! available at https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function.
+!! The functions have been modified from their normal form to also encode
+!! the size of the structure in the hash.
+!!------------------------------------------------------------------------------
+
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int16", "int32", "int64"]
+
+submodule(stdlib_64_bit_hash_functions) stdlib_64_bit_fnv_hashes
+! An implementation of the FNV hashes 1 and 1a of Glenn Fowler, Landon Curt
+! Noll, and Kiem-Phong-Vo,
+! https://en.wikipedia.org/wiki/Fowler–Noll–Vo_hash_function
+    implicit none
+
+    integer(int_hash), parameter ::                          &
+        offset_basis = int( z'CBF29CE484222325', int_hash ), & ! initial hash value
+        prime        = int( z'100000001B3', int_hash )         ! per byte multiplier
+
+contains
+
+    pure module function int8_fnv_1( key ) result(hash_code)
+!! FNV-1 hash of the INT8 array KEY: for each byte the running hash is
+!! multiplied by PRIME and then combined, by IEOR, with the byte
+! NOTE(review): the size of KEY is not mixed into the hash here, although
+! the header of this file says that the size is encoded - confirm
+        integer(int8), intent(in)     :: key(:)
+        integer(int_hash)             :: hash_code
+
+        integer(int8)  :: padded(8)
+        integer(int64) :: k, low
+
+        ! LOW locates the low order byte of an INT_HASH integer, so that
+        ! PADDED transfers to the byte value zero extended to INT_HASH
+        low = merge( 1_int64, 8_int64, little_endian )
+        padded = 0_int8
+        hash_code = offset_basis
+        do k = 1_int64, size(key, kind=int64)
+            padded(low) = key(k)
+            ! the product relies on wrap around integer overflow
+            hash_code = ieor( hash_code * prime, transfer( padded, 0_int_hash ) )
+        end do
+
+    end function int8_fnv_1
+
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_fnv_1( key ) result(hash_code)
+!! FNV-1 hash of a rank 1 ${k1}$ KEY, hashed via its INT8 image
+        integer(${k1}$), intent(in)   :: key(:)
+        integer(int_hash)             :: hash_code
+        integer(int64)                :: nbytes
+
+        nbytes = bytes_${k1}$ * size( key, kind=int64 )
+        hash_code = int8_fnv_1( transfer( key, 0_int8, nbytes ) )
+    end function ${k1}$_fnv_1
+
+#:endfor
+
+    pure module function character_fnv_1( key ) result(hash_code)
+!! FNV-1 hash of a default character KEY, hashed via its INT8 image
+        character(*), intent(in)      :: key
+        integer(int_hash)             :: hash_code
+        integer(int64)                :: nbytes
+
+        nbytes = bytes_char * len( key, kind=int64 )
+        hash_code = int8_fnv_1( transfer( key, 0_int8, nbytes ) )
+
+    end function character_fnv_1
+
+
+    pure module function int8_fnv_1a( key ) result(hash_code)
+!! FNV-1a hash of the INT8 array KEY: for each byte the running hash is
+!! combined, by IEOR, with the byte and then multiplied by PRIME
+! NOTE(review): the size of KEY is not mixed into the hash here, although
+! the header of this file says that the size is encoded - confirm
+        integer(int8), intent(in)     :: key(:)
+        integer(int_hash)             :: hash_code
+
+        integer(int8)  :: padded(8)
+        integer(int64) :: k, low
+
+        ! LOW locates the low order byte of an INT_HASH integer, so that
+        ! PADDED transfers to the byte value zero extended to INT_HASH
+        low = merge( 1_int64, 8_int64, little_endian )
+        padded = 0_int8
+        hash_code = offset_basis
+        do k = 1_int64, size(key, kind=int64)
+            padded(low) = key(k)
+            ! the product relies on wrap around integer overflow
+            hash_code = ieor( hash_code, transfer( padded, 0_int_hash ) ) * prime
+        end do
+
+    end function int8_fnv_1a
+
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_fnv_1a( key ) result(hash_code)
+!! FNV-1a hash of a rank 1 ${k1}$ KEY, hashed via its INT8 image
+        integer(${k1}$), intent(in)   :: key(:)
+        integer(int_hash)             :: hash_code
+        integer(int64)                :: nbytes
+
+        nbytes = bytes_${k1}$ * size( key, kind=int64 )
+        hash_code = int8_fnv_1a( transfer( key, 0_int8, nbytes ) )
+    end function ${k1}$_fnv_1a
+
+#:endfor
+
+    pure module function character_fnv_1a( key ) result(hash_code)
+!! FNV-1a hash of a default character KEY, hashed via its INT8 image
+        character(*), intent(in)      :: key
+        integer(int_hash)             :: hash_code
+
+        ! BYTES_CHAR gives the bytes per character, as in CHARACTER_FNV_1
+        hash_code = int8_fnv_1a( transfer( key, 0_int8,                   &
+                                           bytes_char*len(key, kind=int64) ) )
+    end function character_fnv_1a
+
+end submodule stdlib_64_bit_fnv_hashes
diff --git a/src/stdlib_64_bit_hash_functions.fypp b/src/stdlib_64_bit_hash_functions.fypp
new file mode 100644
index 000000000..0f31a0d26
--- /dev/null
+++ b/src/stdlib_64_bit_hash_functions.fypp
@@ -0,0 +1,308 @@
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int8", "int16", "int32", "int64"]
+
+module stdlib_64_bit_hash_functions
+
+    use, intrinsic :: iso_fortran_env, only : &
+        character_storage_size
+
+    use stdlib_kinds, only: &
+        dp,                 &
+        int8,               &
+        int16,              &
+        int32,              &
+        int64
+
+    implicit none
+
+    private
+
+    integer, parameter, public :: &
+        int_hash     = int64
+!! The integer kind of the hash codes produced by this module (`int64`)
+
+! The number of bits used by each integer type
+    integer, parameter, public ::       &
+! Should be 8
+        bits_int8  = bit_size(0_int8),  &
+! Should be 16
+        bits_int16 = bit_size(0_int16), &
+! Should be 32
+        bits_int32 = bit_size(0_int32), &
+! Should be 64
+        bits_int64 = bit_size(0_int64)
+
+! The number of bytes (`int8` storage units) used by each integer type
+    integer, parameter, public ::       &
+! Should be 1
+        bytes_int8  = bits_int8/bits_int8,  &
+! Should be 2
+        bytes_int16 = bits_int16/bits_int8, &
+! Should be 4
+        bytes_int32 = bits_int32/bits_int8, &
+! Should be 8
+        bytes_int64 = bits_int64/bits_int8
+
+! The number of bits and of bytes (`int8` storage units) in one default
+! character
+    integer, parameter, public :: &
+        bits_char = character_storage_size, &
+        bytes_char = bits_char/bits_int8
+
+! Dealing with different endians: the test is true when the first byte in
+! storage order is the least significant byte of a 16 bit integer
+    logical, parameter, public ::                                    &
+        little_endian = ( 1 == transfer( [1_int8, 0_int8], 0_int16) )
+
+! Procedures and generic names exported by this module. Every other name
+! not given an explicit PUBLIC attribute is private.
+    public ::                     &
+        fibonacci_hash,           &
+        fnv_1_hash,               &
+        fnv_1a_hash,              &
+        new_pengy_hash_seed,      &
+        new_spooky_hash_seed,     &
+        odd_random_integer,       &
+        pengy_hash,               &
+        spooky_hash,              &
+        spookyhash_128,           &
+        universal_mult_hash
+
+! pow64_over_phi is the odd number that most closely approximates 2**64/phi,
+! where phi is the golden ratio 1.618...
+! NOTE(review): this hex constant and z'deadbeef' below both have their top
+! bit set, so they lie outside the positive range of the signed kind they
+! are converted to; presumably this is why the build adds -fno-range-check.
+    integer(int64), parameter ::                        &
+        pow64_over_phi = int(z'9E3779B97F4A7C15', int64)
+
+! 2**32 expressed in the hash kind
+    integer(int_hash), parameter :: &
+        two_32 = 2_int_hash**32
+
+! constants used by Bob Jenkins' SpookyHash
+!   sc_numvars   - number of 64 bit words in the internal state (12)
+!   sc_blocksize - number of bytes in one internal state (12*8 = 96)
+!   sc_buffsize  - twice the block size (192); keys shorter than this are
+!                  hashed by the short variant
+!   sc_constsub  - the 32 bit pattern "deadbeef"
+    integer(int32), parameter ::                            &
+        sc_numvars = 12,                                    &
+        sc_blocksize = sc_numvars*8,                        &
+        sc_buffsize = 2*sc_blocksize,                       &
+        sc_constsub = int(z'deadbeef', int32)
+        ! twos complement "deadbeef"
+
+! The 64 bit pattern "deadbeefdeadbeef", built from two copies of sc_constsub
+    integer(int64), parameter ::                                  &
+        sc_const = transfer( [sc_constsub, sc_constsub], 0_int64 )
+
+! State record used by the SPOOKY_INIT, SPOOKY_UPDATE and SPOOKY_FINAL
+! interfaces of this module.
+! NOTE(review): the component meanings given below are inferred from the
+! members m_data, m_state, m_length and m_remainder of Bob Jenkins'
+! SpookyHash class - confirm against the implementing submodule.
+    type :: spooky_subhash
+        ! presumably input bytes buffered but not yet hashed (2 blocks)
+        integer(int8)  :: data(0:2*sc_blocksize-1)
+        ! presumably the sc_numvars words of internal hash state
+        integer(int64) :: state(0:sc_numvars-1)
+        ! presumably the total number of bytes seen so far
+        integer(int64) :: length
+        ! presumably the number of valid bytes currently held in DATA
+        integer(int16) :: remainder
+    end type spooky_subhash
+
+    interface fnv_1_hash
+!! FNV_1 interfaces
+!! Generic name for the FNV_1 hash of a rank 1 integer array or of a
+!! character string. Each specific is a separate module procedure whose
+!! body is supplied by a submodule; all return a hash of kind `int_hash`.
+
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_fnv_1( key ) result(hash_code)
+!! FNV_1 hash function for rank 1 arrays of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(:)
+              integer(int_hash)              :: hash_code
+          end function ${k1}$_fnv_1
+        #:endfor
+
+        pure module function character_fnv_1( key ) result(hash_code)
+!! FNV_1 hash function for character strings
+            character(*), intent(in)   :: key
+            integer(int_hash)             :: hash_code
+        end function character_fnv_1
+
+    end interface fnv_1_hash
+
+
+    interface fnv_1a_hash
+!! FNV_1A interfaces
+!! Generic name for the FNV_1A hash of a rank 1 integer array or of a
+!! character string. Each specific is a separate module procedure whose
+!! body is supplied by a submodule; all return a hash of kind `int_hash`.
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_fnv_1a( key ) result(hash_code)
+!! FNV_1A hash function for rank 1 arrays of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(:)
+              integer(int_hash)           :: hash_code
+          end function ${k1}$_fnv_1a
+        #:endfor
+
+        pure module function character_fnv_1a( key ) result(hash_code)
+!! FNV_1A hash function for character strings
+            character(*), intent(in)   :: key
+             integer(int_hash)         :: hash_code
+        end function character_fnv_1a
+
+    end interface fnv_1a_hash
+
+
+    interface murmur2_hash
+!!  MURMUR2_HASHES interfaces
+!! Generic name for a seeded MURMUR2 hash of a rank 1 integer array or of
+!! a character string, returning a hash of kind `int_hash`.
+!! NOTE(review): `murmur2_hash` does not appear in this module's PUBLIC
+!! list, so the generic is private to the module and its submodules.
+!! Confirm that a submodule actually implements these procedures, or drop
+!! the interface.
+
+        #:for k1 in INT_KINDS
+          pure module function ${k1}$_murmur2_hash( key, seed ) &
+              result(hash_code)
+!! MURMUR2 hash function for rank 1 arrays of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(0:)
+              integer(int_hash), intent(in)  :: seed
+              integer(int_hash) :: hash_code
+          end function ${k1}$_murmur2_hash
+        #:endfor
+
+        pure module function character_murmur2_hash( key, seed ) &
+            result(hash_code)
+!! MURMUR2 hash function for character strings
+            character(*), intent(in)    :: key
+            integer(int_hash), intent(in)  :: seed
+            integer(int_hash) :: hash_code
+        end function character_murmur2_hash
+
+    end interface murmur2_hash
+
+
+    interface spooky_hash
+!! SPOOKY_HASH interfaces
+!! Generic name for the seeded SPOOKY hash of a rank 1 integer array or of
+!! a character string. SEED and the returned HASH_CODE are both arrays of
+!! two integers of kind `int_hash`. Unlike the other hash interfaces in
+!! this module, these specifics are not declared PURE.
+
+        #:for k1 in INT_KINDS
+           module function ${k1}$_spooky_hash( key, seed ) &
+              result(hash_code)
+!! SPOOKY HASH function for rank 1 arrays of kind ${k1}$
+              integer(${k1}$), intent(in) :: key(0:)
+              integer(int_hash), intent(in)  :: seed(2)
+              integer(int_hash) :: hash_code(2)
+          end function ${k1}$_spooky_hash
+        #:endfor
+
+         module function character_spooky_hash( key, seed ) &
+            result(hash_code)
+!! SPOOKY hash function for character strings
+            character(*), intent(in)    :: key
+            integer(int_hash), intent(in)  :: seed(2)
+            integer(int_hash) :: hash_code(2)
+        end function character_spooky_hash
+
+    end interface spooky_hash
+
+    interface
+
+         module subroutine spookyHash_128( key, hash_inout )
+!! Hashes the bytes of KEY in a single call. On entry HASH_INOUT holds the
+!! two seed words; on return it holds the two words of the hash.
+            integer(int8), intent(in), target :: key(0:)
+            integer(int_hash), intent(inout)  :: hash_inout(2)
+        end subroutine spookyHash_128
+
+    end interface
+
+
+    interface spooky_init
+!! SPOOKY_INIT interface
+!! Sets up the state record SELF from the two word SEED. The body is
+!! supplied by a submodule. This generic is not in the PUBLIC list.
+
+         module subroutine spookysubhash_init( self, seed )
+            type(spooky_subhash), intent(out) :: self
+            integer(int_hash), intent(in)     :: seed(2)
+        end subroutine spookysubhash_init
+
+    end interface spooky_init
+
+
+    interface spooky_update
+!! SPOOKY_UPDATE interface
+!! Presumably folds the bytes of KEY into the running state SPOOKY; the
+!! body is supplied by a submodule. This generic is not in the PUBLIC list.
+!! NOTE(review): SPOOKY is declared INTENT(OUT), which makes the state
+!! undefined on entry. An update of a running state would normally need
+!! INTENT(INOUT) - confirm, and change it here and in the implementing
+!! submodule together.
+
+         module subroutine spookyhash_update( spooky, key )
+            type(spooky_subhash), intent(out) :: spooky
+            integer(int8), intent(in)         :: key(0:)
+        end subroutine spookyhash_update
+
+    end interface spooky_update
+
+
+    interface spooky_final
+!! SPOOKY_FINAL interface
+!! Presumably finishes the hash accumulated in SPOOKY and delivers the two
+!! hash words through HASH_CODE; the body is supplied by a submodule.
+!! This generic is not in the PUBLIC list.
+
+         module subroutine spookyhash_final(spooky, hash_code)
+            type(spooky_subhash), intent(inout) :: spooky
+            integer(int_hash), intent(inout)    :: hash_code(2)
+        end subroutine spookyhash_final
+
+    end interface spooky_final
+
+interface
+
+        module subroutine new_spooky_hash_seed( seed )
+! Random SEED generator for SPOOKY_HASH
+! SEED is the two word seed, updated in place. The body is supplied by a
+! submodule; presumably it replaces SEED with new pseudo random values,
+! in the manner of NEW_PENGY_HASH_SEED - confirm against the submodule.
+            integer(int64), intent(inout) :: seed(2)
+        end subroutine new_spooky_hash_seed
+
+    end interface
+
+    interface pengy_hash
+!! PENGY_HASH interfaces
+!! Generic name for the seeded PENGY_HASH of a rank 1 integer array or of
+!! a character string. SEED is a 32 bit integer and the hash is a 64 bit
+!! integer. The bodies are supplied by a submodule.
+
+        #:for k1 in INT_KINDS
+    pure module function ${k1}$_pengy_hash( key, seed ) result(hash_code)
+!! PENGY_HASH hash function for rank 1 array keys of kind ${k1}$
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int32), intent(in)  :: seed
+        integer(int64)           :: hash_code
+    end function ${k1}$_pengy_hash
+        #:endfor
+
+        pure module function character_pengy_hash( key, seed ) &
+            result(hash_code)
+!! PENGY_HASH hash function for character strings
+            character(*), intent(in)      :: key
+            integer(int32), intent(in) :: seed
+            integer(int64)             :: hash_code
+        end function character_pengy_hash
+
+    end interface pengy_hash
+
+    interface
+
+        module subroutine new_pengy_hash_seed( seed )
+! Random SEED generator for PENGY_HASH
+! SEED is the 32 bit seed, updated in place. The body is supplied by a
+! submodule.
+            integer(int32), intent(inout) :: seed
+        end subroutine new_pengy_hash_seed
+
+    end interface
+
+contains
+
+    pure function fibonacci_hash( key, nbits ) result( sample )
+!! Fibonacci (multiplicative) hashing: multiplies the 64 bit integer KEY by
+!! the odd approximation to 2**64/phi and returns the NBITS most significant
+!! bits of the 64 bit product as a non-negative integer. NBITS is expected
+!! to be less than 64.
+        integer(int64), intent(in) :: key
+        integer, intent(in)        :: nbits
+        integer(int64)             :: sample
+
+        integer(int64) :: scrambled
+
+        scrambled = key * pow64_over_phi
+        ! A negative shift count makes ISHFT shift right with zero fill
+        sample = ishft( scrambled, nbits - 64 )
+
+    end function fibonacci_hash
+
+    pure function universal_mult_hash( key, seed, nbits ) result( sample )
+!! Universal multiplicative hashing: multiplies the 64 bit integer KEY by
+!! the "random" odd 64 bit integer SEED and returns the NBITS most
+!! significant bits of the 64 bit product as a non-negative integer.
+!! NBITS is expected to be less than 64.
+        integer(int64), intent(in) :: key
+        integer(int64), intent(in) :: seed
+        integer, intent(in)        :: nbits
+        integer(int64)             :: sample
+
+        integer(int64) :: scrambled
+
+        scrambled = key * seed
+        ! A negative shift count makes ISHFT shift right with zero fill
+        sample = ishft( scrambled, nbits - 64 )
+
+    end function universal_mult_hash
+
+    subroutine odd_random_integer( harvest )
+!! Returns a 64 bit pseudo random integer, HARVEST, distributed uniformly
+!! over the odd integers of the 64 bit kind. Two draws from RANDOM_NUMBER
+!! supply the two 32 bit halves of the value.
+        integer(int64), intent(out) :: harvest
+
+        integer(int64), parameter :: pow31 = 2_int64**31
+        integer(int64), parameter :: pow32 = 2_int64**32
+        real(dp)       :: draws(2)
+        integer(int32) :: halves(2)
+
+        call random_number( draws )
+        ! Scale [0,1) to [0,2**32), then recentre onto the int32 range
+        halves = int( floor( draws * pow32, int64 ) - pow31, int32 )
+        ! Glue the halves together, then force the lowest bit to one
+        harvest = ishft( transfer( halves, 0_int64 ), 1 ) + 1_int64
+
+    end subroutine odd_random_integer
+
+    subroutine random_integer( harvest )
+!! Returns a 64 bit pseudo random integer, HARVEST, distributed uniformly
+!! over the values of the 64 bit kind. Two draws from RANDOM_NUMBER supply
+!! the two 32 bit halves of the value.
+        integer(int64), intent(out) :: harvest
+
+        integer(int64), parameter :: pow31 = 2_int64**31
+        integer(int64), parameter :: pow32 = 2_int64**32
+        real(dp)       :: draws(2)
+        integer(int32) :: halves(2)
+
+        call random_number( draws )
+        ! Scale [0,1) to [0,2**32), then recentre onto the int32 range
+        halves = int( floor( draws * pow32, int64 ) - pow31, int32 )
+        harvest = transfer( halves, 0_int64 )
+
+    end subroutine random_integer
+
+end module stdlib_64_bit_hash_functions
diff --git a/src/stdlib_64_bit_pengy_hashes.fypp b/src/stdlib_64_bit_pengy_hashes.fypp
new file mode 100644
index 000000000..ca1f14791
--- /dev/null
+++ b/src/stdlib_64_bit_pengy_hashes.fypp
@@ -0,0 +1,148 @@
+!!------------------------------------------------------------------------------
+!! `PENGY_HASH` is a translation to Fortran 2008 and signed two's complement
+!! arithmetic of the `pengyhash` algorithm of Alberto Fajardo, copyright 2020.
+!! Alberto Fajardo's original C code, `pengyhash.c`, is available at the URL:
+!! https://github.com/tinypeng/pengyhash/blob/master/pengyhash.c
+!! under the BSD 2-Clause License:
+!! https://github.com/tinypeng/pengyhash/blob/master/LICENSE
+!!
+!! The BSD 2-Clause license is as follows:
+!!
+!! BSD 2-Clause License
+!!
+!! pengyhash
+!! Copyright (c) 2020 Alberto Fajardo
+!! All rights reserved.
+!!
+!! Redistribution and use in source and binary forms, with or without
+!! modification, are permitted provided that the following conditions are met:
+!!
+!! 1. Redistributions of source code must retain the above copyright notice,
+!!    this list of conditions and the following disclaimer.
+!!
+!! 2. Redistributions in binary form must reproduce the above copyright notice,
+!!    this list of conditions and the following disclaimer in the documentation
+!!    and/or other materials provided with the distribution.
+!!
+!! THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+!! AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+!! IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+!! ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+!! LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+!! CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+!! SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+!! INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+!! CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+!! ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+!! POSSIBILITY OF SUCH DAMAGE.
+!!------------------------------------------------------------------------------
+
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int16", "int32", "int64"]
+
+submodule(stdlib_64_bit_hash_functions) stdlib_64_bit_pengy_hashes
+
+    implicit none
+
+contains
+
+    pure module function int8_pengy_hash( key, seed ) result(hash_code)
+!! PENGY_HASH of a rank 1 array of bytes.
+!! KEY is consumed in blocks of 32 bytes, each block being mixed into four
+!! 64 bit state words. The trailing partial block is then mixed six more
+!! times together with SEED, and the sum of the state words is returned.
+        integer(int64) :: hash_code
+        integer(int8), intent(in) :: key(0:)
+        integer(int32), intent(in) :: seed
+
+        integer(int64) :: chunk(0:3)
+        integer(int64) :: state(0:3)
+        integer(int64) :: nbytes, nblocks, iblock, start, nleft, round
+        integer(int64) :: wide_seed
+        integer(int32) :: halves(2)
+        integer(int8)  :: scratch(0:31)
+
+        nbytes  = size( key, kind=int64 )
+        nblocks = nbytes / 32
+        chunk   = 0_int64
+        ! The key length is part of the initial state
+        state   = [ 0_int64, 0_int64, 0_int64, nbytes ]
+
+        ! Absorb every complete block of 32 bytes
+        do iblock = 0_int64, nblocks - 1
+            start = 32 * iblock
+            chunk = transfer( key( start:start+31 ), 0_int64, 4 )
+
+            state(0) = state(0) + state(1) + chunk(3)
+            state(1) = state(0) + ishftc( state(1), 14 )
+            state(2) = state(2) + state(3) + chunk(2)
+            state(3) = state(2) + ishftc( state(3), 23 )
+            state(0) = state(0) + state(3) + chunk(1)
+            state(3) = ieor( state(0), ishftc( state(3), 16 ) )
+            state(2) = state(2) + state(1) + chunk(0)
+            state(1) = ieor( state(2), ishftc( state(1), 40 ) )
+        end do
+
+        ! Overlay the leftover bytes on the front of the last block read (or
+        ! on zeros if there was none); the bytes behind them are kept
+        start = 32 * nblocks
+        nleft = nbytes - start
+        scratch = transfer( chunk, 0_int8, 32 )
+        scratch(0:nleft-1) = key(start:start+nleft-1)
+        chunk = transfer( scratch, 0_int64, 4 )
+
+        ! Zero extend the 32 bit seed to 64 bits whatever the byte order
+        if ( little_endian ) then
+            halves = [ seed, 0_int32 ]
+        else
+            halves = [ 0_int32, seed ]
+        end if
+        wide_seed = transfer( halves, 0_int64 )
+
+        ! Six finishing rounds; only these rounds add the seed
+        do round = 1_int64, 6_int64
+            state(0) = state(0) + state(1) + chunk(3)
+            state(1) = state(0) + ishftc( state(1), 14 ) + wide_seed
+            state(2) = state(2) + state(3) + chunk(2)
+            state(3) = state(2) + ishftc( state(3), 23 )
+            state(0) = state(0) + state(3) + chunk(1)
+            state(3) = ieor( state(0), ishftc( state(3), 16 ) )
+            state(2) = state(2) + state(1) + chunk(0)
+            state(1) = ieor( state(2), ishftc( state(1), 40 ) )
+        end do
+
+        hash_code = state(0) + state(1) + state(2) + state(3)
+
+    end function int8_pengy_hash
+
+#:for k1 in INT_KINDS
+    pure module function ${k1}$_pengy_hash( key, seed ) result(hash_code)
+!! PENGY_HASH of a rank 1 array of kind ${k1}$. The storage of KEY is
+!! reinterpreted as a sequence of `int8` values and passed on, with SEED,
+!! to the `int8` implementation.
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int32), intent(in)  :: seed
+        integer(int64)              :: hash_code
+
+        integer(int64) :: nbytes
+
+        ! Total number of bytes occupied by the elements of KEY
+        nbytes = bytes_${k1}$ * size(key, kind=int64)
+        hash_code = int8_pengy_hash( transfer( key, 0_int8, nbytes ), seed )
+
+    end function ${k1}$_pengy_hash
+
+#:endfor
+
+    pure module function character_pengy_hash( key, seed ) result(hash_code)
+!! PENGY_HASH of a default character string. The storage of KEY is
+!! reinterpreted as a sequence of `int8` values and passed on, with SEED,
+!! to the `int8` implementation.
+        character(*), intent(in)   :: key
+        integer(int32), intent(in) :: seed
+        integer(int64)             :: hash_code
+
+        integer(int64) :: nbytes
+
+        ! Bytes per character times the number of characters in KEY
+        nbytes = bytes_char * len(key, kind=int64)
+        hash_code = int8_pengy_hash( transfer( key, 0_int8, nbytes ), seed )
+
+    end function character_pengy_hash
+
+    module subroutine new_pengy_hash_seed( seed )
+!! Random SEED generator for PENGY_HASH. Replaces SEED with a pseudo random
+!! 32 bit integer, drawing again until the new value differs from the
+!! value SEED had on entry.
+        integer(int32), intent(inout) :: seed
+
+        integer(int64), parameter :: pow31 = 2_int64**31
+        integer(int64), parameter :: pow32 = 2_int64**32
+        real(dp)       :: draw
+        integer(int32) :: previous
+
+        previous = seed
+        do
+            call random_number( draw )
+            ! Scale [0,1) to [0,2**32), then recentre onto the int32 range
+            seed = int( floor( draw * pow32, int64 ) - pow31, int32 )
+            if ( seed /= previous ) exit
+        end do
+
+    end subroutine new_pengy_hash_seed
+
+end submodule stdlib_64_bit_pengy_hashes
diff --git a/src/stdlib_64_bit_spookyv2_hashes.fypp b/src/stdlib_64_bit_spookyv2_hashes.fypp
new file mode 100644
index 000000000..eaaccff4d
--- /dev/null
+++ b/src/stdlib_64_bit_spookyv2_hashes.fypp
@@ -0,0 +1,718 @@
+!!------------------------------------------------------------------------------
+!! `SPOOKY_HASH` is a translation to Fortran 2008 of the unsigned 64 bit
+!! `SpookyHash` V2 function of Bob Jenkins
+!! <https://burtleburtle.net/bob/hash/spooky.html> to signed 64 bit
+!! operations. Bob Jenkins has put his code in the public domain and has
+!! given permission to treat this code as public domain in the USA,
+!! provided the code can be used under other licenses and he is given
+!! appropriate credit.
+!! The code was designed for Little-Endian processors. The output is
+!! different on Big Endian processors, but still probably as good quality.
+!!------------------------------------------------------------------------------
+
+#! Integer kinds to be considered during templating
+#:set INT_KINDS = ["int16", "int32", "int64"]
+
+submodule(stdlib_64_bit_hash_functions) stdlib_64_bit_spookyv2_hashes
+
+! I have tried to make this portable while retaining efficiency. I assume
+! processors with two's complement integers from 8, 16, 32, and 64 bits.
+! The code is a transliteration of the 64 bit SpookyHash V2 of Bob Jenkins
+!     <https://burtleburtle.net/bob/hash/spooky.html>
+! The code was designed for Little-Endian processors. The output is
+! different on Big Endian processors, but still probably as good quality.
+
+    implicit none
+
+contains
+
+
+     module function int8_spooky_hash( key, seed ) result(hash_code)
+!! SPOOKY hash of a rank 1 array of bytes. SEED supplies the two starting
+!! words, which SPOOKYHASH_128 turns into the two words of the hash.
+        integer(int8), intent(in)  :: key(:)
+        integer(int64), intent(in) :: seed(2)
+        integer(int64)             :: hash_code(2)
+
+        ! The result starts out as the seed and is hashed in place
+        hash_code = seed
+        call spookyhash_128( key, hash_code )
+
+    end function int8_spooky_hash
+
+
+#:for k1 in INT_KINDS
+     module function ${k1}$_spooky_hash( key, seed ) result(hash_code)
+!! SPOOKY hash of a rank 1 array of kind ${k1}$. The storage of KEY is
+!! reinterpreted as a sequence of `int8` values; SEED supplies the two
+!! starting words, which SPOOKYHASH_128 turns into the two hash words.
+        integer(${k1}$), intent(in) :: key(:)
+        integer(int64), intent(in)  :: seed(2)
+        integer(int64)              :: hash_code(2)
+
+        integer(int64) :: nbytes
+
+        ! Total number of bytes occupied by the elements of KEY
+        nbytes = bytes_${k1}$ * size(key, kind=int64)
+        ! The result starts out as the seed and is hashed in place
+        hash_code = seed
+        call spookyhash_128( transfer( key, 0_int8, nbytes ), hash_code )
+
+    end function ${k1}$_spooky_hash
+
+#:endfor
+
+     module function character_spooky_hash( key, seed ) result(hash_code)
+!! SPOOKY hash of a default character string. The storage of KEY is
+!! reinterpreted as a sequence of `int8` values; SEED supplies the two
+!! starting words, which SPOOKYHASH_128 turns into the two hash words.
+        character(*), intent(in)   :: key
+        integer(int64), intent(in) :: seed(2)
+        integer(int64)             :: hash_code(2)
+
+        integer(int64) :: nbytes
+
+        ! Bytes per character times the number of characters in KEY
+        nbytes = bytes_char * len(key, kind=int64)
+        ! The result starts out as the seed and is hashed in place
+        hash_code = seed
+        call spookyhash_128( transfer( key, 0_int8, nbytes ), hash_code )
+
+    end function character_spooky_hash
+
+!
+! short hash ... it could be used on any message,
+! but it's used by Spooky just for short messages.
+!
+! KEY is the message as bytes, indexed from zero. On entry HASH_INOUT holds
+! the two seed words; on return it holds the two words of the hash.
+! SPOOKYHASH_128 calls this routine only for keys shorter than SC_BUFFSIZE
+! bytes, and the local buffer BUF is sized on that assumption.
+! The routine performs no input/output.
+!
+     subroutine spookyhash_short( key, hash_inout )
+        integer(int8), intent(in), target :: key(0:)
+        integer(int64), intent(inout)     :: hash_inout(2)
+
+        integer(int64) :: a, b, c, d
+        integer(int64) :: length, p8, remainder
+
+        ! P8 is the index in KEY of the first byte not yet consumed
+        p8 = 0
+        length = size( key, kind=int64 )
+
+        ! The number of bytes after all the INT256s
+        remainder = iand( length, 31_int64 )
+        a = hash_inout(1)
+        b = hash_inout(2)
+        c = sc_const
+        d = sc_const
+
+        if ( length > 15 ) then
+            block
+                integer(int64) :: bend, step
+                integer(int64) :: buf(0:2*sc_numVars-1)
+                bend = ishft(length, -4) ! The number of complete INT128s
+                buf(0:2*bend-1) = transfer( key(0:16*bend-1), 0_int64, 2*bend )
+                ! Number of Int64's in number of complete INT256s
+                bend = ishft(ishft(length, -5), 2)
+
+                ! handle all complete sets of 32 bytes
+                do step = 0_int64, bend-1, 4
+                    c = c + buf(step)
+                    d = d + buf(step+1)
+                    call shortmix( a, b, c, d )
+                    a = a + buf(step+2)
+                    b = b + buf(step+3)
+                end do
+                ! Completed all INT64s in complete INT256s
+                p8 = p8 + 8*bend ! Number of INT8s in complete INT256s
+
+                ! Handle the case of 16+ remaining bytes.
+                ! After the loop STEP equals BEND, the index of the first
+                ! INT64 that follows the complete INT256s.
+                if (remainder >= 16) then
+                    c = c + buf(step)
+                    d = d + buf(step+1)
+                    call shortmix( a, b, c, d )
+                    p8 = p8 + 16
+                    remainder = remainder - 16
+                end if
+
+            end block
+        end if
+
+    ! Handle the last 0..15 bytes, and its length V2
+        d = d + shiftl( length, 56_int64 )
+
+    ! Jump to the entry point for the number of bytes left. Each group of
+    ! labelled statements falls through to the next one down, reproducing
+    ! the fall-through switch statement of the C original.
+        select case(remainder)
+        case(15)
+            go to 115
+        case(14)
+            go to 114
+        case(13)
+            go to 113
+        case(12)
+            go to 112
+        case(11)
+            go to 111
+        case(10)
+            go to 110
+        case(9)
+            go to 109
+        case(8)
+            go to 108
+        case(7)
+            go to 107
+        case(6)
+            go to 106
+        case(5)
+            go to 105
+        case(4)
+            go to 104
+        case(3)
+            go to 103
+        case(2)
+            go to 102
+        case(1)
+            go to 101
+        case(0)
+            go to 100
+        end select
+
+    ! 12..15 bytes left: bytes 12..14 singly, bytes 8..11 as a zero
+    ! extended INT32, bytes 0..7 as an INT64
+115     d = d + shiftl( map_to_64( key(p8+14) ), 48_int64 )
+114     d = d + shiftl( map_to_64( key(p8+13) ), 40_int64 )
+113     d = d + shiftl( map_to_64( key(p8+12) ), 32_int64 )
+112     if ( little_endian) then
+            d = d + transfer( [ transfer(key(p8+8:p8+11), 0_int32), &
+                                0_int32 ], 0_int64)
+        else
+            d = d + transfer( [ 0_int32, &
+                                transfer(key(p8+8:p8+11), 0_int32) ], &
+                                0_int64)
+        end if
+        c = c + transfer( key(p8+0:p8+7), 0_int64 )
+        go to 888
+
+    ! 8..11 bytes left: bytes 8..10 singly, bytes 0..7 as an INT64
+111     d = d + shiftl( map_to_64( key(p8+10) ), 16_int64 )
+110     d = d + shiftl( map_to_64( key(p8+9) ), 8_int64 )
+109     d = d + map_to_64( key(p8+8) )
+108     c = c + transfer( key(p8+0:p8+7), 0_int64 )
+        go to 888
+
+    ! 4..7 bytes left: bytes 4..6 singly, bytes 0..3 as a zero extended INT32
+107     c = c + shiftl( map_to_64( key(p8+6) ), 48_int64 )
+106     c = c + shiftl( map_to_64( key(p8+5) ), 40_int64 )
+105     c = c + shiftl( map_to_64( key(p8+4) ), 32_int64 )
+104     if ( little_endian) then
+            c = c + transfer( [ transfer( key(p8+0:p8+3), 0_int32 ), &
+                                0_int32 ], 0_int64 )
+        else
+            c = c + transfer( [ 0_int32, &
+                                transfer( key(p8+0:p8+3), 0_int32 ) ], 0_int64 )
+        end if
+
+        go to 888
+
+    ! 1..3 bytes left: each byte singly
+103     c = c + shiftl( map_to_64( key(p8+2) ), 16_int64 )
+102     c = c + shiftl( map_to_64( key(p8+1) ), 8_int64 )
+101     c = c + map_to_64( key(p8+0) )
+        go to 888
+
+    ! No bytes left
+100     c = c + sc_const
+        d = d + sc_const
+
+888     call short_end( a, b, c, d )
+
+        hash_inout(1) = a
+        hash_inout(2) = b
+
+    contains
+
+        pure function map_to_64( key )
+    !
+    ! Zero extends the byte KEY to 64 bits, i.e. maps it to the INT64 whose
+    ! least significant byte is KEY and whose other bytes are zero.
+    !
+            integer(int8), intent(in) :: key
+            integer(int64)         :: map_to_64
+
+            if ( little_endian ) then
+                map_to_64 = transfer( [ key, 0_int8, 0_int8, 0_int8, &
+                                        0_int8, 0_int8, 0_int8, 0_int8 ], &
+                                      0_int64 )
+            else
+                map_to_64 = transfer( [ 0_int8, 0_int8, 0_int8, 0_int8, &
+                                        0_int8, 0_int8, 0_int8, key ], &
+                                      0_int64 )
+            end if
+
+        end function map_to_64
+
+        pure subroutine shortmix( h0, h1, h2, h3 )
+    !
+    ! The goal is for each bit of the input to expand into 128 bits of
+    ! apparent entropy before it is fully overwritten.
+    ! n trials both set and cleared at least m bits of h0 h1 h2 h3
+    !   n: 2   m: 29
+    !   n: 3   m: 46
+    !   n: 4   m: 57
+    !   n: 5   m: 107
+    !   n: 6   m: 146
+    !   n: 7   m: 152
+    ! when run forwards or backwards
+    ! for all 1-bit and 2-bit diffs
+    ! with diffs defined by either xor or subtraction
+    ! with a base of all zeros plus a counter, or plus another bit, or random
+    !
+            integer(int64), intent(inout) :: h0, h1, h2, h3
+
+            h2 = ishftc( h2, 50 )
+            h2 = h2 + h3
+            h0 = ieor( h0, h2 )
+            h3 = ishftc( h3, 52 )
+            h3 = h3 + h0
+            h1 = ieor( h1, h3 )
+            h0 = ishftc( h0, 30 )
+            h0 = h0 + h1
+            h2 = ieor( h2, h0 )
+            h1 = ishftc( h1, 41 )
+            h1 = h1 + h2
+            h3 = ieor( h3, h1 )
+            h2 = ishftc( h2, 54 )
+            h2 = h2 + h3
+            h0 = ieor( h0, h2 )
+            h3 = ishftc( h3, 48 )
+            h3 = h3 + h0
+            h1 = ieor( h1, h3 )
+            h0 = ishftc( h0, 38 )
+            h0 = h0 + h1
+            h2 = ieor( h2, h0 )
+            h1 = ishftc( h1, 37 )
+            h1 = h1 + h2
+            h3 = ieor( h3, h1 )
+            h2 = ishftc( h2, 62 )
+            h2 = h2 + h3
+            h0 = ieor( h0, h2 )
+            h3 = ishftc( h3, 34 )
+            h3 = h3 + h0
+            h1 = ieor( h1, h3 )
+            h0 = ishftc( h0, 5 )
+            h0 = h0 + h1
+            h2 = ieor( h2, h0 )
+            h1 = ishftc( h1, 36 )
+            h1 = h1 + h2
+            h3 = ieor( h3, h1 )
+
+        end subroutine shortmix
+
+        pure subroutine short_end( h0, h1, h2, h3 )
+    !
+    ! Mix all 4 inputs together so that h0, h1 are a hash of them all.
+    !
+    ! For two inputs differing in just the input bits
+    ! Where "differ" means xor or subtraction
+    ! And the base value is random, or a counting value starting at that bit
+    ! The final result will have each bit of h0, h1 flip
+    ! For every input bit,
+    ! with probability 50 +- .3% (it is probably better than that)
+    ! For every pair of input bits,
+    ! with probability 50 +- .75% (the worst case is approximately that)
+    !
+            integer(int64), intent(inout) :: h0, h1, h2, h3
+
+            h3 = ieor( h3, h2 )
+            h2 = ishftc( h2, 15 )
+            h3 = h3 + h2
+            h0 = ieor( h0, h3 )
+            h3 = ishftc( h3, 52 )
+            h0 = h0 + h3
+            h1 = ieor( h1, h0 )
+            h0 = ishftc( h0, 26 )
+            h1 = h1 + h0
+            h2 = ieor( h2, h1 )
+            h1 = ishftc( h1, 51 )
+            h2 = h2 + h1
+            h3 = ieor( h3, h2 )
+            h2 = ishftc( h2, 28 )
+            h3 = h3 + h2
+            h0 = ieor( h0, h3 )
+            h3 = ishftc( h3, 9 )
+            h0 = h0 + h3
+            h1 = ieor( h1, h0 )
+            h0 = ishftc( h0, 47 )
+            h1 = h1 + h0
+            h2 = ieor( h2, h1 )
+            h1 = ishftc( h1, 54 )
+            h2 = h2 + h1
+            h3 = ieor( h3, h2 )
+            h2 = ishftc( h2, 32 )
+            h3 = h3 + h2
+            h0 = ieor( h0, h3 )
+            h3 = ishftc( h3, 25 )
+            h0 = h0 + h3
+            h1 = ieor( h1, h0 )
+            h0 = ishftc( h0, 63 )
+            h1 = h1 + h0
+
+        end subroutine short_end
+
+    end subroutine spookyhash_short
+
+
+! Hash a whole key in a single call.
+! KEY is the message as bytes, indexed from zero. On entry HASH_INOUT holds
+! the two seed words; on return it holds the two words of the hash.
+! Keys shorter than SC_BUFFSIZE bytes are delegated to SPOOKYHASH_SHORT.
+     module subroutine spookyHash_128( key, hash_inout )
+        integer(int8), intent(in), target :: key(0:)
+        integer(int64), intent(inout)     :: hash_inout(2)
+
+        integer(int64) :: words(sc_numvars)
+        integer(int64) :: state(0:11)
+        integer(int64) :: nbytes, nblock_bytes, offset
+        integer(int64) :: nleft, nwords, nstray, stray_at
+        integer(int8)  :: octet(8)
+
+        nbytes = size(key, kind=int64)
+
+        if ( nbytes < sc_buffsize ) then
+            call spookyhash_short( key, hash_inout )
+            return
+        end if
+
+        ! Seed the state: every third word gets the same starting value
+        state(0:9:3)  = hash_inout(1)
+        state(1:10:3) = hash_inout(2)
+        state(2:11:3) = sc_const
+
+        ! Number of bytes held in complete blocks of SC_BLOCKSIZE bytes
+        nblock_bytes = (nbytes/sc_blocksize)*sc_blocksize
+
+        ! Mix in each complete block
+        do offset = 0, nblock_bytes-1, sc_blocksize
+            words = transfer( key(offset:offset+sc_blocksize-1), 0_int64, &
+                              sc_numVars )
+            call spookyhash_mix( words, state )
+        end do
+
+        ! Build the last, partial block: first its complete INT64s, the
+        ! rest of the block being zeroed ...
+        nleft  = nbytes - nblock_bytes ! 0 <= nleft < sc_blocksize == 96
+        nwords = nleft / 8
+        words(1:nwords) = transfer( key(nblock_bytes:nblock_bytes+8*nwords-1), &
+                                    0_int64, nwords )
+        words(nwords+1:sc_numvars) = 0_int64
+
+        ! ... then the stray bytes that do not fill an INT64, zero padded ...
+        nstray   = nleft - 8*nwords
+        stray_at = nblock_bytes + 8*nwords
+        octet(1:nstray)   = key(stray_at:stray_at+nstray-1)
+        octet(nstray+1:8) = 0_int8
+        words(nwords+1) = transfer( octet, 0_int64 )
+
+        ! ... and finally the count of leftover bytes, placed in the last
+        ! byte of the block
+        octet(1:7) = 0_int8
+        octet(8)   = int( nleft, kind=int8 ) ! 0 <= nleft < 96
+        words(sc_numvars) = ieor( words(sc_numvars), transfer( octet, 0_int64 ) )
+
+        ! Final mixing; the first two state words are the hash
+        call spookyhash_end( words, state )
+        hash_inout(1:2) = state(0:1)
+
+    end subroutine spookyHash_128
+
+    !
+    ! Mixes one block of twelve 64 bit words, DATA(0:11), into the twelve
+    ! words of internal state, S. SPOOKYHASH_128 calls it once for every
+    ! complete 96 byte block of a key that is at least SC_BUFFSIZE (192)
+    ! bytes long; shorter keys go through SPOOKYHASH_SHORT instead.
+    !
+    ! The internal state is fully overwritten every 96 bytes.
+    ! Every input bit appears to cause at least 128 bits of entropy
+    ! before 96 other bytes are combined, when run forward or backward
+    !   For every input bit,
+    !   Two inputs differing in just that input bit
+    !   Where "differ" means xor or subtraction
+    !   And the base value is random
+    !   When run forward or backwards one Mix
+    ! I tried 3 pairs of each; they all differed by at least 212 bits.
+    !
+    ! The statements below come in twelve groups of five, one group per
+    ! data word. Their order is part of the algorithm and must be kept.
+    !
+    pure subroutine spookyhash_mix( data, s )
+        integer(int64), intent(in)    :: data(0:)
+        integer(int64), intent(inout) :: s(0:11)
+
+        s(0)  = s(0) + data(0)
+        s(2)  = ieor( s(2), s(10) )
+        s(11) = ieor( s(11), s(0) )
+        s(0)  = ishftc( s(0), 11 )
+        s(11) = s(11) + s(1)
+        s(1)  = s(1) + data(1)
+        s(3)  = ieor( s(3), s(11) )
+        s(0)  = ieor( s(0), s(1) )
+        s(1)  = ishftc( s(1), 32 )
+        s(0)  = s(0) + s(2)
+        s(2)  = s(2) + data(2)
+        s(4)  = ieor( s(4), s(0) )
+        s(1)  = ieor( s(1), s(2) )
+        s(2)  = ishftc( s(2), 43 )
+        s(1)  = s(1) + s(3)
+        s(3)  = s(3) + data(3)
+        s(5)  = ieor( s(5), s(1) )
+        s(2)  = ieor( s(2), s(3) )
+        s(3)  = ishftc( s(3), 31 )
+        s(2)  = s(2) + s(4)
+        s(4)  = s(4) + data(4)
+        s(6)  = ieor( s(6), s(2) )
+        s(3)  = ieor( s(3), s(4) )
+        s(4)  = ishftc( s(4), 17 )
+        s(3)  = s(3) + s(5)
+        s(5)  = s(5) + data(5)
+        s(7)  = ieor( s(7), s(3) )
+        s(4)  = ieor( s(4), s(5) )
+        s(5)  = ishftc( s(5), 28 )
+        s(4)  = s(4) + s(6)
+        s(6)  = s(6) + data(6)
+        s(8)  = ieor( s(8), s(4) )
+        s(5)  = ieor( s(5), s(6) )
+        s(6)  = ishftc( s(6), 39 )
+        s(5)  = s(5) + s(7)
+        s(7)  = s(7) + data(7)
+        s(9)  = ieor( s(9), s(5) )
+        s(6)  = ieor( s(6), s(7) )
+        s(7)  = ishftc( s(7), 57 )
+        s(6)  = s(6) + s(8)
+        s(8)  = s(8) + data(8)
+        s(10) = ieor( s(10), s(6) )
+        s(7)  = ieor( s(7), s(8) )
+        s(8)  = ishftc( s(8), 55 )
+        s(7)  = s(7) + s(9)
+        s(9)  = s(9) + data(9)
+        s(11) = ieor( s(11), s(7) )
+        s(8)  = ieor( s(8), s(9) )
+        s(9)  = ishftc( s(9), 54 )
+        s(8)  = s(8) + s(10)
+        s(10) = s(10) + data(10)
+        s(0)  = ieor( s(0), s(8) )
+        s(9)  = ieor( s(9), s(10) )
+        s(10) = ishftc( s(10), 22 )
+        s(9)  = s(9) + s(11)
+        s(11) = s(11) + data(11)
+        s(1)  = ieor( s(1), s(9) )
+        s(10) = ieor( s(10), s(11) )
+        s(11) = ishftc( s(11), 46 )
+        s(10) = s(10) + s(0)
+
+    end subroutine spookyhash_mix
+
+
+    pure subroutine spookyhash_end( data, h)
+        integer(int64), intent(in)    :: data(0:)  ! last block; data(0:11) are read
+        integer(int64), intent(inout) :: h(0:11)   ! hash state, updated in place
+    ! Add the last block into the state, then run three endpartial passes.
+        h  = h + data(0:11)
+        call endpartial( h )
+        call endpartial( h )
+        call endpartial( h )
+
+    contains
+    !
+    ! Mix all 12 inputs together so that h0, h1 are a hash of them all.
+    !
+    ! For two inputs differing in just the input bits
+    ! Where "differ" means xor or subtraction
+    ! And the base value is random, or a counting value starting at that bit
+    ! The final result will have each bit of h0, h1 flip
+    ! For every input bit,
+    ! with probability 50 +- .3%
+    ! For every pair of input bits,
+    ! with probability 50 +- 3%
+    !
+    ! This does not rely on the last Mix() call having already mixed some.
+    ! Two iterations was almost good enough for a 64-bit result, but a
+    ! 128-bit result is reported, so End() does three iterations.
+    !
+        pure subroutine endpartial( h )
+            integer(int64), intent(inout) :: h(0:11)
+        ! Twelve add / xor / rotate triples; ishftc rotates the whole word.
+            h(11) = h(11) + h(1)
+            h(2)  =   ieor( h(2), h(11) )
+            h(1)  = ishftc( h(1), 44 )
+            h(0)  = h(0) + h(2)
+            h(3)  =   ieor( h(3), h(0) )
+            h(2)  = ishftc( h(2), 15 )
+            h(1)  = h(1) + h(3)
+            h(4)  =   ieor( h(4), h(1) )
+            h(3)  = ishftc( h(3), 34 )
+            h(2)  = h(2) + h(4)
+            h(5)  =   ieor( h(5), h(2) )
+            h(4)  = ishftc( h(4), 21 )
+            h(3)  = h(3) + h(5)
+            h(6)  =   ieor( h(6), h(3) )
+            h(5)  = ishftc( h(5), 38 )
+            h(4)  = h(4) + h(6)
+            h(7)  = ieor( h(7), h(4) )
+            h(6)  = ishftc( h(6), 33 )
+            h(5)  = h(5) + h(7)
+            h(8)  = ieor( h(8), h(5) )
+            h(7)  = ishftc( h(7), 10 )
+            h(6)  = h(6) + h(8)
+            h(9)  = ieor( h(9), h(6) )
+            h(8)  = ishftc( h(8), 13 )
+            h(7)  = h(7) + h(9)
+            h(10) = ieor( h(10), h(7) )
+            h(9)  = ishftc( h(9), 38 )
+            h(8)  = h(8) + h(10)
+            h(11) = ieor( h(11), h(8) )
+            h(10) = ishftc( h(10), 53 )
+            h(9)  = h(9) + h(11)
+            h(0)  = ieor( h(0), h(9) )
+            h(11) = ishftc( h(11), 42 )
+            h(10) = h(10) + h(0)
+            h(1)  = ieor( h(1), h(10) )
+            h(0)  = ishftc( h(0), 54 )
+
+        end subroutine endpartial
+
+    end subroutine spookyhash_end
+
+
+     module subroutine spookysubhash_init( self, seed )
+    ! Begin an incremental hash: nothing buffered, no bytes seen, seeds stored.
+        type(spooky_subhash), intent(out) :: self
+        integer(int64), intent(in)     :: seed(2)
+
+        self % remainder  = 0_int8
+        self % length     = 0
+        self % state(0:1) = seed(1:2)
+    end subroutine spookysubhash_init
+
+
+! Add a message fragment to the state; short fragments are only buffered
+     module subroutine spookyhash_update( spooky, key )
+        type(spooky_subhash), intent(out) :: spooky
+        integer(int8), intent(in)         :: key(0:)
+    ! NOTE(review): spooky is intent(out) but is read below; intent(inout) looks intended - confirm.
+        integer(int8)  :: dummy(0:7)   ! the eight bytes of new_length
+        integer(int64) :: h(0:11)      ! working copy of the 12-word state
+        integer(int64)  :: bend,       &
+                           length,     &
+                           new_length, &
+                           p8,         &
+                           remainder
+
+        length = size(key, kind=int64)
+        new_length = length + spooky % remainder   ! buffered bytes + this fragment
+
+    ! Is this message fragment too short?  If it is, stuff it away.
+        if ( new_Length < sc_buffsize ) then
+            remainder = spooky % remainder
+            spooky % data( remainder:remainder+length-1 ) = key
+            spooky % length = length + spooky % length
+            dummy = transfer( new_length, 0_int8, 8 )
+            if ( little_endian ) then   ! keep only the low-order byte of new_length
+                spooky % remainder = transfer( [ dummy(0), 0_int8 ], 0_int16 )
+            else
+                spooky % remainder = transfer( [ 0_int8, dummy(7) ], 0_int16 )
+            end if
+            return
+        end if
+
+    ! init the variables: from the seeds the first time, else from saved state
+        if ( spooky % length < sc_buffsize ) then
+            h( [ 0, 3, 6,  9 ] ) = spooky % state(0)
+            h( [ 1, 4, 7, 10 ] ) = spooky % state(1)
+            h( [ 2, 5, 8, 11 ] ) = sc_const
+        else
+            h(0:11)  = spooky % state(0:11)
+        end if
+
+        spooky % length = length + spooky % length
+
+    ! if we've got anything stuffed away, use it now
+        if ( spooky % remainder /= 0_int16 ) then
+            block
+                integer(int16) :: prefix   ! bytes of key needed to fill the buffer
+                prefix = sc_buffsize - spooky % remainder
+                remainder = spooky % remainder
+                spooky % data(remainder:remainder+prefix-1) = key(0:prefix-1)
+                call spookyhash_mix( transfer(spooky % data(0:sc_blocksize-1), &
+                                              0_int64, sc_numvars), h )
+                call spookyhash_mix(                                       &
+                    transfer(spooky % data(sc_blocksize:2*sc_blocksize-1), &
+                             0_int64, sc_numvars), h )
+                p8 = prefix                ! p8 indexes the next unread byte of key
+                length = length - prefix
+            end block
+        else
+            p8 = 0
+        end if
+
+    ! handle all whole blocks of sc_blocksize bytes requiring aligned bytes
+        bend = p8 + 8*(length/sc_blocksize)*sc_numVars   ! one past the last whole block
+        remainder = length - ( bend - p8 )               ! bytes left after whole blocks
+        do while( p8 < bend )
+            spooky % data(0:sc_blocksize-1) = key( p8:p8+sc_blocksize-1 )
+            call spookyhash_mix( transfer( spooky % data(0:sc_blocksize-1), &
+                                 0_int64, sc_numvars), h )
+            p8 = p8 + sc_blocksize
+        end do
+
+    ! stuff away the last few bytes
+        spooky % remainder = remainder
+
+        if ( remainder > 0 ) then
+            spooky % data(0:remainder-1) = &
+                key(bend:bend+remainder-1)
+        end if
+
+    ! stuff away the variables
+        spooky % state(0:11) = h(0:11)
+
+    end subroutine spookyhash_update
+
+
+! Report the hash for the concatenation of all message fragments so far.
+     module subroutine spookyhash_final(spooky, hash_code)
+        type(spooky_subhash), intent(inout) :: spooky
+        integer(int64), intent(inout)       :: hash_code(2)
+    ! spooky    - accumulated state; its data buffer is padded in place
+    ! hash_code - on return, the two 64 bit words of the hash
+        integer(int64) :: h(0:11)
+        integer(int64) :: index, remainder
+
+    ! Fewer than sc_buffsize bytes in total: everything is still buffered,
+    ! so hash the buffer directly with the short-message routine.
+        if ( spooky % length < sc_buffsize ) then
+            hash_code = spooky % state(0:1)
+            call spookyhash_short( spooky % data(0:spooky % length-1), &
+                                   hash_code )
+            return
+        end if
+
+        remainder = spooky % remainder
+        h(0:11)  = spooky % state(0:11)
+
+    ! index is the offset in spooky % data of the last, partial block
+        if ( remainder >= sc_blocksize ) then
+          ! the buffer can hold two blocks; mix any whole first block
+            call spookyhash_mix( transfer( spooky % data(0:sc_blocksize-1), &
+                                           0_int64, sc_numvars ), h )
+            index = sc_blocksize
+            remainder = remainder - sc_blocksize
+        else
+            index = 0
+        end if
+
+    ! Zero-pad the partial block and store its byte count in the block's
+    ! last byte.  Everything is relative to index; remainder is now below
+    ! sc_blocksize, which is assumed to be at most 128 so that it fits an
+    ! int8 (TODO confirm against the declaration of sc_blocksize).
+        spooky % data(index+remainder:index+sc_blocksize-1) = 0_int8
+        spooky % data(index+sc_blocksize-1) = int( remainder, int8 )
+
+    ! do some final mixing on the padded block
+        call spookyhash_end( transfer(                                  &
+            spooky % data(index:index+sc_blocksize-1), 0_int64,         &
+            sc_numvars ), h )
+
+        hash_code(1:2) = h(0:1)
+
+    end subroutine spookyhash_final
+
+
+    function rot_64_32( a, k )
+        integer(int64)             :: rot_64_32
+        integer(int64), intent(in) :: a
+        integer, intent(in)           :: k
+        integer(int64) :: merged  ! a shifted left by k, or-ed with a shifted right by 32-k
+        merged    = ior( shiftl( a, k ), shiftr( a, 32-k ) )
+        rot_64_32 = iand( merged, two_32-1 )  ! keep only the bits selected by two_32-1
+    end function rot_64_32
+
+
+    module subroutine new_spooky_hash_seed( seed )
+! Random SEED generator: returns a two-word seed that differs from the input
+        integer(int64), intent(inout) :: seed(2)
+
+        integer(int64), parameter :: span = 2_int64**32, bias = 2_int64**31
+        integer(int64) :: previous(2)
+        integer(int32) :: halves(4)
+        real(dp)       :: uniform(4)
+
+        previous = seed
+        do
+            call random_number( uniform )
+            ! scale the samples by 2**32 and shift by 2**31 to get four int32 values
+            halves = int( floor( uniform * span, int64 ) - bias, int32 )
+            seed = transfer( halves, seed, 2 )
+            if ( any( seed /= previous ) ) exit
+        end do
+    end subroutine new_spooky_hash_seed
+
+
+end submodule stdlib_64_bit_spookyv2_hashes
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index 01df5d678..30a240beb 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -8,6 +8,7 @@ endmacro(ADDTEST)
 
 add_subdirectory(ascii)
 add_subdirectory(bitsets)
+add_subdirectory(hash_functions)
 add_subdirectory(io)
 add_subdirectory(linalg)
 add_subdirectory(logger)
diff --git a/src/tests/Makefile.manual b/src/tests/Makefile.manual
index 3e801ad4b..7726b8adb 100644
--- a/src/tests/Makefile.manual
+++ b/src/tests/Makefile.manual
@@ -3,6 +3,7 @@
 all test clean:
 	$(MAKE) -f Makefile.manual --directory=ascii $@
 	$(MAKE) -f Makefile.manual --directory=bitsets $@
+	$(MAKE) -f Makefile.manual --directory=hash_functions $@
 	$(MAKE) -f Makefile.manual --directory=io $@
 	$(MAKE) -f Makefile.manual --directory=logger $@
 	$(MAKE) -f Makefile.manual --directory=optval $@
diff --git a/src/tests/hash_functions/CMakeLists.txt b/src/tests/hash_functions/CMakeLists.txt
new file mode 100644
index 000000000..459719c32
--- /dev/null
+++ b/src/tests/hash_functions/CMakeLists.txt
@@ -0,0 +1,2 @@
+ADDTEST(32_bit_hash_performance)  # timing comparison of the 32 bit hash functions
+ADDTEST(64_bit_hash_performance)  # timing comparison of the 64 bit hash functions
diff --git a/src/tests/hash_functions/Makefile.manual b/src/tests/hash_functions/Makefile.manual
new file mode 100644
index 000000000..d3e59bd18
--- /dev/null
+++ b/src/tests/hash_functions/Makefile.manual
@@ -0,0 +1,3 @@
+PROGS_SRC = test_64_bit_hash_performance.f90 test_32_bit_hash_performance.f90
+# PROGS_SRC is set first; presumably the shared rules included below consume it.
+include ../Makefile.manual.test.mk
diff --git a/src/tests/hash_functions/test_32_bit_hash_performance.f90 b/src/tests/hash_functions/test_32_bit_hash_performance.f90
new file mode 100644
index 000000000..acee5e36b
--- /dev/null
+++ b/src/tests/hash_functions/test_32_bit_hash_performance.f90
@@ -0,0 +1,190 @@
+program test_32_bit_hash_performance
+!! Program to compare the relative performance of different 32 bit hash
+!! functions.  Each algorithm is timed over a buffer of random bytes cut
+!! into keys of several sizes; the table goes to 32_bit_hash_performance.txt
+
+    use stdlib_kinds, only: &
+        dp,           &
+        int8,         &
+        int32,        &
+        int64
+
+    use stdlib_32_bit_hash_functions
+
+    implicit none
+
+    integer, parameter :: &
+        block_size(8) = [ 1, 2, 4, 8, 16, 64, 256, 1024 ]  ! key sizes in bytes
+    integer(int32), parameter :: huge32 = huge(0_int32)
+    real(dp), parameter :: hugep1 = real(huge32, dp) + 1.0_dp
+    integer, parameter :: rand_power = 16
+    integer, parameter :: rand_size = 2**rand_power  ! number of random words
+    integer, parameter :: test_size = rand_size * 4  ! the same data in bytes
+    integer, parameter :: repeat = 4                 ! passes per key size
+    integer :: index, k    ! loop counters shared with the contained tests
+    integer :: lun
+    real(dp) :: rand(2)
+    integer(int32) :: rand_object(rand_size)
+    integer(int8) :: test_object(test_size)
+
+    open( newunit=lun, file="32_bit_hash_performance.txt", &
+          access="sequential", action="write", form="formatted", &
+          position="rewind" )
+
+    ! rand(1) picks the sign and rand(2) the magnitude, so the words can
+    ! take any int32 value
+    do index=1, rand_size
+        call random_number(rand)
+        if (rand(1) < 0.5_dp) then
+            rand_object(index) = ceiling(-rand(2)*hugep1, int32) - 1
+        else
+            rand_object(index) = floor(rand(2)*hugep1, int32)
+        end if
+    end do
+
+    test_object(:) = transfer( rand_object, 0_int8, test_size )
+
+    write(lun, '("| Algorithm  | Key Size  | Key #      | Time (s) |")')
+    write(lun, '("|            | Bytes     |            |          |")')
+    write(lun, '("|------------|-----------|------------|----------|")')
+
+    call test_fnv_1()
+    call test_fnv_1a()
+    call test_nmhash32()
+    call test_nmhash32x()
+    call test_water()
+
+    close( lun )
+
+contains
+
+    subroutine test_fnv_1()
+    !! Time fnv_1_hash for every key size and write one table row per size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int_hash) :: code, summary(repeat)
+
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = fnv_1_hash( test_object( first:first+width-1 ) )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'FNV-1', &
+                width, repeat*(test_size/width), elapsed
+        end do
+    end subroutine test_fnv_1
+
+    subroutine test_fnv_1a()
+    !! Time fnv_1a_hash for every key size and write one table row per size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int_hash) :: code, summary(repeat)
+
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = fnv_1a_hash( test_object( first:first+width-1 ) )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'FNV-1a', &
+                width, repeat*(test_size/width), elapsed
+        end do
+    end subroutine test_fnv_1a
+
+    subroutine test_nmhash32()
+    !! Time nmhash32, seeded via new_nmhash32_seed, one table row per key size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int32) :: seed = 0_int32
+        integer(int_hash) :: code, summary(repeat)
+
+        call new_nmhash32_seed( seed )
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = nmhash32( test_object( first:first+width-1 ), seed )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'nmhash32', &
+                width, repeat*(test_size/width), elapsed
+        end do
+
+    end subroutine test_nmhash32
+
+    subroutine test_nmhash32x()
+    !! Time nmhash32x, seeded via new_nmhash32x_seed, one table row per key size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int32) :: seed = 0_int32
+        integer(int_hash) :: code, summary(repeat)
+
+        call new_nmhash32x_seed( seed )
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = nmhash32x( test_object( first:first+width-1 ), seed )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'nmhash32x', &
+                width, repeat*(test_size/width), elapsed
+        end do
+
+    end subroutine test_nmhash32x
+
+    subroutine test_water()
+    !! Time water_hash, seeded via new_water_hash_seed, one table row per key size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int64) :: seed = 0_int64
+        integer(int_hash) :: code, summary(repeat)
+
+        call new_water_hash_seed( seed )
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = water_hash( test_object( first:first+width-1 ), seed )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'water', &
+                width, repeat*(test_size/width), elapsed
+        end do
+
+    end subroutine test_water
+
+end program test_32_bit_hash_performance
diff --git a/src/tests/hash_functions/test_64_bit_hash_performance.f90 b/src/tests/hash_functions/test_64_bit_hash_performance.f90
new file mode 100644
index 000000000..6c445f781
--- /dev/null
+++ b/src/tests/hash_functions/test_64_bit_hash_performance.f90
@@ -0,0 +1,161 @@
+program test_64_bit_hash_performance
+!! Program to compare the relative performance of different 64 bit hash
+!! functions.  Each algorithm is timed over a buffer of random bytes cut
+!! into keys of several sizes; the table goes to 64_bit_hash_performance.txt
+
+    use stdlib_kinds, only: &
+        dp,           &
+        int8,         &
+        int32,        &
+        int64
+
+    use stdlib_64_bit_hash_functions
+
+    implicit none
+
+    integer, parameter :: &
+        block_size(8) = [ 1, 2, 4, 8, 16, 64, 256, 1024 ]  ! key sizes in bytes
+    integer(int32), parameter :: huge32 = huge(0_int32)
+    real(dp), parameter :: hugep1 = real(huge32, dp) + 1.0_dp
+    integer, parameter :: rand_power = 16
+    integer, parameter :: rand_size = 2**rand_power  ! number of random words
+    integer, parameter :: test_size = rand_size * 4  ! the same data in bytes
+    integer, parameter :: repeat = 4                 ! passes per key size
+    integer :: index, k    ! loop counters shared with the contained tests
+    integer :: lun
+    real(dp) :: rand(2)
+    integer(int32) :: rand_object(rand_size)
+    integer(int8) :: test_object(test_size)
+
+    open( newunit=lun, file="64_bit_hash_performance.txt", &
+          access="sequential", action="write", form="formatted", &
+          position="rewind" )
+
+    ! rand(1) picks the sign and rand(2) the magnitude of each random word
+    do index=1, rand_size
+        call random_number(rand)
+        if (rand(1) < 0.5_dp) then
+            rand_object(index) = ceiling(-rand(2)*hugep1, int32) - 1
+        else
+            rand_object(index) = floor(rand(2)*hugep1, int32)
+        end if
+    end do
+
+    test_object(:) = transfer( rand_object, 0_int8, test_size )
+
+    write(lun, '("| Algorithm  | Key Size  | Key #      | Time (s) |")')
+    write(lun, '("|            | Bytes     |            |          |")')
+    write(lun, '("|------------|-----------|------------|----------|")')
+
+    call test_fnv_1()
+    call test_fnv_1a()
+    call test_pengy()
+    call test_spooky()
+
+    close( lun )
+
+contains
+
+    subroutine test_fnv_1()
+    !! Time fnv_1_hash for every key size and write one table row per size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int64) :: code, summary(repeat)
+
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = fnv_1_hash( test_object( first:first+width-1 ) )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'FNV-1', &
+                width, repeat*(test_size/width), elapsed
+        end do
+    end subroutine test_fnv_1
+
+    subroutine test_fnv_1a()
+    !! Time fnv_1a_hash for every key size and write one table row per size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int64) :: code, summary(repeat)
+
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = fnv_1a_hash( test_object( first:first+width-1 ) )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'FNV-1a', &
+                width, repeat*(test_size/width), elapsed
+        end do
+    end subroutine test_fnv_1a
+
+    subroutine test_spooky()
+    !! Time spooky_hash, seeded via new_spooky_hash_seed, one row per key size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int64) :: seed(2) = [ 0_int64, 0_int64 ]
+        integer(int64) :: code(2), summary(repeat)
+
+        call new_spooky_hash_seed( seed )
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = spooky_hash( test_object( first:first+width-1 ), seed )
+                    ! sample the first hash word into summary (not read afterwards)
+                    if (first == index) summary(index) = code(1)
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'Spooky', &
+                width, repeat*(test_size/width), elapsed
+        end do
+
+    end subroutine test_spooky
+
+    subroutine test_pengy()
+    !! Time pengy_hash, seeded via new_pengy_hash_seed, one row per key size
+        real :: start, finish, elapsed
+        integer :: first, width
+        integer(int32) :: seed = int( z'DEADBEEF', int32 )
+        integer(int64) :: code, summary(repeat)
+
+        call new_pengy_hash_seed( seed )
+        do k=1, size(block_size)
+            width = block_size(k)
+            call cpu_time(start)
+            do index=1, repeat
+                do first=1, test_size, width
+                    code = pengy_hash( test_object( first:first+width-1 ), seed )
+                    ! sample a few codes into summary (not read afterwards)
+                    if (first == index) summary(index) = code
+                end do
+            end do
+            call cpu_time(finish)
+            elapsed = finish - start
+            write(lun, '("|", a10, 2x, "|", i8, 3x, "|", 1x, i10, 1x, ' // &
+                '"|", f9.5, 1x, "|")') 'Pengy', &
+                width, repeat*(test_size/width), elapsed
+        end do
+
+    end subroutine test_pengy
+
+end program test_64_bit_hash_performance
diff --git a/src/tests/hash_functions/validation/Makefile.validation b/src/tests/hash_functions/validation/Makefile.validation
new file mode 100644
index 000000000..3f8e8ebcf
--- /dev/null
+++ b/src/tests/hash_functions/validation/Makefile.validation
@@ -0,0 +1,50 @@
+# Builds the three hash validation programs: the key generator, the C/C++ reference hashes and the Fortran check.
+MOD_PATH = -I../../../
+
+FFLAGS = -O3
+CFLAGS = -O3
+CXXFLAGS = -O3
+LIBDIRS = -L./
+LIBS = -lc_hash
+INCLUDE_DIRS = -I./
+
+.PHONY: all clean
+all: generate_hash_arrays generate_key_array hash_validity_test
+
+generate_key_array: generate_key_array.f90
+	$(FC) $(FFLAGS) generate_key_array.f90 -o generate_key_array
+
+# -lstdlib must come after the source: an archive is searched only at its position on the link line.
+hash_validity_test: hash_validity_test.f90
+	$(FC) $(FFLAGS) $(MOD_PATH) hash_validity_test.f90 \
+	-L../../../ -lstdlib -o hash_validity_test
+
+generate_hash_arrays: generate_hash_arrays.o ./libc_hash.a
+	$(CXX) $(CXXFLAGS) $(LIBDIRS) generate_hash_arrays.o $(LIBS) -o generate_hash_arrays
+
+generate_hash_arrays.o: generate_hash_arrays.cpp libc_hash.a
+	$(CXX) $(CXXFLAGS) -c generate_hash_arrays.cpp -o generate_hash_arrays.o
+
+libc_hash.a: SpookyV2.o SpookyV2Test.o pengyhash.o nmhash_scalar.o waterhash.o
+	ar rcs libc_hash.a SpookyV2.o SpookyV2Test.o pengyhash.o nmhash_scalar.o waterhash.o
+
+pengyhash.o: pengyhash.c pengyhash.h
+	$(CC) $(CFLAGS) $(INCLUDE_DIRS) -c pengyhash.c -o pengyhash.o
+
+waterhash.o: waterhash.c waterhash.h
+	$(CC) $(CFLAGS) $(INCLUDE_DIRS) -c waterhash.c -o waterhash.o
+
+SpookyV2.o: SpookyV2.cpp SpookyV2.h
+	$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -c SpookyV2.cpp -o SpookyV2.o
+
+SpookyV2Test.o: SpookyV2Test.cpp SpookyV2.h
+	$(CXX) $(CXXFLAGS) $(INCLUDE_DIRS) -c SpookyV2Test.cpp -o SpookyV2Test.o
+
+# nmhash_scalar.c is built with the C compiler, so it takes CFLAGS rather than CXXFLAGS.
+nmhash_scalar.o: nmhash_scalar.c nmhash_scalar.h
+	$(CC) $(CFLAGS) $(INCLUDE_DIRS) -c nmhash_scalar.c -o nmhash_scalar.o
+
+clean:
+	rm -f nmhash_scalar.o SpookyV2Test.o SpookyV2.o waterhash.o pengyhash.o \
+	   libc_hash.a generate_hash_arrays.o generate_hash_arrays \
+	   hash_validity_test generate_key_array
diff --git a/src/tests/hash_functions/validation/README.md b/src/tests/hash_functions/validation/README.md
new file mode 100644
index 000000000..61836b0dd
--- /dev/null
+++ b/src/tests/hash_functions/validation/README.md
@@ -0,0 +1,24 @@
+# The `validation` subdirectory
+
+This directory contains source code and a makefile, `Makefile.validation`,
+for generating applications from the source code intended to test the more
+complicated hash functions in `libstdlib.a` against the original C and C++
+hash procedures. At least two of the hash procedures, nmhash32 and
+nmhash32x, assume that the C processor is either gcc or MSVC, so that
+currently the tests can only use gcc.
+
+The makefile generates three applications\:
+`generate_key_array`, `generate_hash_arrays`, and `hash_validity_test`.
+* `generate_key_array` generates a file, `key_array.bin`, that contains a
+random sequence of 2048 eight bit integers.
+* `generate_hash_arrays` generates five files, `c_nmhash32_array.bin`, `c_nmhash32x_array.bin`, `c_pengy_hash_array.bin`, `c_spooky_hash_array.bin`,
+and `c_water_hash_array.bin`, that, in turn, represent the results of applying
+the corresponding C/C++ hash functions on subsequences of the data in
+`key_array.bin`.
+* `hash_validity_test` compares the contents of the files generated by
+`generate_hash_arrays` against the results of applying the corresponding
+Fortran based hash functions in `libstdlib.a` on the same subsequences
+of `key_array.bin`, and reports whether the comparisons match.
+
+The applications should be run in the sequence: first, `generate_key_array`,
+then `generate_hash_arrays`, and finally `hash_validity_test`.
diff --git a/src/tests/hash_functions/validation/SpookyV2.cpp b/src/tests/hash_functions/validation/SpookyV2.cpp
new file mode 100644
index 000000000..735bd5629
--- /dev/null
+++ b/src/tests/hash_functions/validation/SpookyV2.cpp
@@ -0,0 +1,351 @@
+// Spooky Hash
+// A 128-bit noncryptographic hash, for checksums and table lookup
+// By Bob Jenkins.  Public domain.
+//   Oct 31 2010: published framework, disclaimer ShortHash isn't right
+//   Nov 7 2010: disabled ShortHash
+//   Oct 31 2011: replace End, ShortMix, ShortEnd, enable ShortHash again
+//   April 10 2012: buffer overflow on platforms without unaligned reads
+//   July 12 2012: was passing out variables in final to in/out in short
+//   July 30 2012: I reintroduced the buffer overflow
+//   August 5 2012: SpookyV2: d = should be d += in short hash, and remove extra mix from long hash
+
+#include <memory.h>
+#include "SpookyV2.h"
+
+#define ALLOW_UNALIGNED_READS 1
+
+// Short hash ... it could be used on any message, but Spooky uses it just
+// for short messages (Hash128 calls it when length < sc_bufSize).
+// *hash1 and *hash2 are read as the seed and overwritten with the result.
+//
+void SpookyHash::Short(
+    const void *message,
+    size_t length,
+    uint64 *hash1,
+    uint64 *hash2)
+{
+    uint64 buf[2*sc_numVars];  // aligned copy, used only when unaligned reads are disallowed
+    union 
+    { 
+        const uint8 *p8; 
+        uint32 *p32;
+        uint64 *p64; 
+        size_t i; 
+    } u;  // one read cursor, viewed as bytes, 32-bit words, 64-bit words or an address
+
+    u.p8 = (const uint8 *)message;
+    // NOTE(review): the cursor is read through other union members than the one written - confirm the compiler allows it
+    if (!ALLOW_UNALIGNED_READS && (u.i & 0x7))
+    {
+        memcpy(buf, message, length);
+        u.p64 = buf;
+    }
+
+    size_t remainder = length%32;
+    uint64 a=*hash1;
+    uint64 b=*hash2;
+    uint64 c=sc_const;
+    uint64 d=sc_const;
+
+    if (length > 15)
+    {
+        const uint64 *end = u.p64 + (length/32)*4;  // one past the last whole 32-byte group
+        
+        // handle all complete sets of 32 bytes
+        for (; u.p64 < end; u.p64 += 4)
+        {
+            c += u.p64[0];
+            d += u.p64[1];
+            ShortMix(a,b,c,d);
+            a += u.p64[2];
+            b += u.p64[3];
+        }
+        
+        //Handle the case of 16+ remaining bytes.
+        if (remainder >= 16)
+        {
+            c += u.p64[0];
+            d += u.p64[1];
+            ShortMix(a,b,c,d);
+            u.p64 += 2;
+            remainder -= 16;
+        }
+    }
+    
+    // Handle the last 0..15 bytes, and its length
+    d += ((uint64)length) << 56;  // the low byte of length lands in the top byte of d
+    switch (remainder)  // cases without a break fall through, adding one more byte each
+    {
+    case 15:
+    d += ((uint64)u.p8[14]) << 48;
+    case 14:
+        d += ((uint64)u.p8[13]) << 40;
+    case 13:
+        d += ((uint64)u.p8[12]) << 32;
+    case 12:
+        d += u.p32[2];
+        c += u.p64[0];
+        break;
+    case 11:
+        d += ((uint64)u.p8[10]) << 16;
+    case 10:
+        d += ((uint64)u.p8[9]) << 8;
+    case 9:
+        d += (uint64)u.p8[8];
+    case 8:
+        c += u.p64[0];
+        break;
+    case 7:
+        c += ((uint64)u.p8[6]) << 48;
+    case 6:
+        c += ((uint64)u.p8[5]) << 40;
+    case 5:
+        c += ((uint64)u.p8[4]) << 32;
+    case 4:
+        c += u.p32[0];
+        break;
+    case 3:
+        c += ((uint64)u.p8[2]) << 16;
+    case 2:
+        c += ((uint64)u.p8[1]) << 8;
+    case 1:
+        c += (uint64)u.p8[0];
+        break;
+    case 0:
+        c += sc_const;
+        d += sc_const;
+    }
+    ShortEnd(a,b,c,d);
+    *hash1 = a;
+    *hash2 = b;
+}
+
+
+
+
+// do the whole hash in one call; *hash1/*hash2 are read as the seed and receive the result
+void SpookyHash::Hash128(
+    const void *message, 
+    size_t length, 
+    uint64 *hash1, 
+    uint64 *hash2)
+{
+    if (length < sc_bufSize)  // short messages are handed to Short()
+    {
+        Short(message, length, hash1, hash2);
+        return;
+    }
+
+    uint64 h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11;
+    uint64 buf[sc_numVars];  // one block: aligned copy and padded final block
+    uint64 *end;             // one past the last whole block of the message
+    union 
+    { 
+        const uint8 *p8; 
+        uint64 *p64; 
+        size_t i; 
+    } u;  // read cursor into the message
+    size_t remainder;
+    // seed the twelve state words from the two seeds and sc_const
+    h0=h3=h6=h9  = *hash1;
+    h1=h4=h7=h10 = *hash2;
+    h2=h5=h8=h11 = sc_const;
+    
+    u.p8 = (const uint8 *)message;
+    end = u.p64 + (length/sc_blockSize)*sc_numVars;
+
+    // handle all whole sc_blockSize blocks of bytes
+    if (ALLOW_UNALIGNED_READS || ((u.i & 0x7) == 0))
+    {
+        while (u.p64 < end)
+        { 
+            Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+	    u.p64 += sc_numVars;
+        }
+    }
+    else
+    {
+        while (u.p64 < end)
+        {
+            memcpy(buf, u.p64, sc_blockSize);  // copy each block into the aligned buffer first
+            Mix(buf, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+	    u.p64 += sc_numVars;
+        }
+    }
+
+    // handle the last partial block of sc_blockSize bytes
+    remainder = (length - ((const uint8 *)end-(const uint8 *)message));
+    memcpy(buf, end, remainder);
+    memset(((uint8 *)buf)+remainder, 0, sc_blockSize-remainder);  // zero-pad the tail
+    ((uint8 *)buf)[sc_blockSize-1] = remainder;  // last byte of the block holds the tail length
+    
+    // do some final mixing 
+    End(buf, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+    *hash1 = h0;
+    *hash2 = h1;
+}
+
+
+
+// Init: begin a new streaming hash - store the two seeds and clear the byte counters
+void SpookyHash::Init(uint64 seed1, uint64 seed2)
+{
+    m_state[1] = seed2;
+    m_state[0] = seed1;
+    m_remainder = 0;
+    m_length = 0;
+}
+
+
+// Update: add a message fragment to the state; bytes are stashed in m_data until sc_bufSize are available
+void SpookyHash::Update(const void *message, size_t length)
+{
+    uint64 h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11;
+    size_t newLength = length + m_remainder;
+    uint8  remainder;
+    union 
+    { 
+        const uint8 *p8; 
+        uint64 *p64; 
+        size_t i; 
+    } u;
+    const uint64 *end;
+    
+    // Is this message fragment too short?  If it is, append it to m_data and return without mixing.
+    if (newLength < sc_bufSize)
+    {
+        memcpy(&((uint8 *)m_data)[m_remainder], message, length);
+        m_length = length + m_length;
+        m_remainder = (uint8)newLength;
+        return;
+    }
+    
+    // init the variables: seed them if nothing has been mixed yet, otherwise reload the saved state
+    if (m_length < sc_bufSize)
+    {
+        h0=h3=h6=h9  = m_state[0];
+        h1=h4=h7=h10 = m_state[1];
+        h2=h5=h8=h11 = sc_const;
+    }
+    else
+    {
+        h0 = m_state[0];
+        h1 = m_state[1];
+        h2 = m_state[2];
+        h3 = m_state[3];
+        h4 = m_state[4];
+        h5 = m_state[5];
+        h6 = m_state[6];
+        h7 = m_state[7];
+        h8 = m_state[8];
+        h9 = m_state[9];
+        h10 = m_state[10];
+        h11 = m_state[11];
+    }
+    m_length = length + m_length;
+    
+    // if we've got anything stuffed away, top m_data up to sc_bufSize and mix its two blocks now
+    if (m_remainder)
+    {
+        uint8 prefix = sc_bufSize-m_remainder;
+        memcpy(&(((uint8 *)m_data)[m_remainder]), message, prefix);
+        u.p64 = m_data;
+        Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+        Mix(&u.p64[sc_numVars], h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+        u.p8 = ((const uint8 *)message) + prefix;
+        length -= prefix;
+    }
+    else
+    {
+        u.p8 = (const uint8 *)message;
+    }
+    
+    // handle all whole blocks of sc_blockSize bytes; remainder is what is left after them
+    end = u.p64 + (length/sc_blockSize)*sc_numVars;
+    remainder = (uint8)(length-((const uint8 *)end-u.p8));
+    if (ALLOW_UNALIGNED_READS || (u.i & 0x7) == 0)
+    {
+        while (u.p64 < end)
+        { 
+            Mix(u.p64, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+	    u.p64 += sc_numVars;
+        }
+    }
+    else   // misaligned input: copy each block through m_data before mixing it
+    {
+        while (u.p64 < end)
+        { 
+            memcpy(m_data, u.p8, sc_blockSize);
+            Mix(m_data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+	    u.p64 += sc_numVars;
+        }
+    }
+
+    // stuff away the last few bytes (fewer than sc_blockSize) at the start of m_data
+    m_remainder = remainder;
+    memcpy(m_data, end, remainder);
+    
+    // stuff away the variables
+    m_state[0] = h0;
+    m_state[1] = h1;
+    m_state[2] = h2;
+    m_state[3] = h3;
+    m_state[4] = h4;
+    m_state[5] = h5;
+    m_state[6] = h6;
+    m_state[7] = h7;
+    m_state[8] = h8;
+    m_state[9] = h9;
+    m_state[10] = h10;
+    m_state[11] = h11;
+}
+
+
+// Final: report the hash for the concatenation of all message fragments so far
+void SpookyHash::Final(uint64 *hash1, uint64 *hash2)
+{
+    // fewer than sc_bufSize bytes in total: the whole message is still in m_data, so hash it with Short()
+    if (m_length < sc_bufSize)
+    {
+        *hash1 = m_state[0];
+        *hash2 = m_state[1];
+        Short( m_data, m_length, hash1, hash2);
+        return;
+    }
+    
+    const uint64 *data = (const uint64 *)m_data;
+    uint8 remainder = m_remainder;
+    
+    uint64 h0 = m_state[0];
+    uint64 h1 = m_state[1];
+    uint64 h2 = m_state[2];
+    uint64 h3 = m_state[3];
+    uint64 h4 = m_state[4];
+    uint64 h5 = m_state[5];
+    uint64 h6 = m_state[6];
+    uint64 h7 = m_state[7];
+    uint64 h8 = m_state[8];
+    uint64 h9 = m_state[9];
+    uint64 h10 = m_state[10];
+    uint64 h11 = m_state[11];
+
+    if (remainder >= sc_blockSize)
+    {
+        // m_data can contain two blocks; handle any whole first block
+        Mix(data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+        data += sc_numVars;
+        remainder -= sc_blockSize;
+    }
+
+    // mix in the last partial block, and the length mod sc_blockSize
+    memset(&((uint8 *)data)[remainder], 0, (sc_blockSize-remainder));   // zero-pads inside m_data itself
+
+    ((uint8 *)data)[sc_blockSize-1] = remainder;
+    
+    // do some final mixing on local copies; m_state is not written back
+    End(data, h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+
+    *hash1 = h0;
+    *hash2 = h1;
+}
+
diff --git a/src/tests/hash_functions/validation/SpookyV2.h b/src/tests/hash_functions/validation/SpookyV2.h
new file mode 100644
index 000000000..4ccc0d523
--- /dev/null
+++ b/src/tests/hash_functions/validation/SpookyV2.h
@@ -0,0 +1,299 @@
+//
+// SpookyHash: a 128-bit noncryptographic hash function
+// By Bob Jenkins, public domain
+//   Oct 31 2010: alpha, framework + SpookyHash::Mix appears right
+//   Oct 31 2011: alpha again, Mix only good to 2^^69 but rest appears right
+//   Dec 31 2011: beta, improved Mix, tested it for 2-bit deltas
+//   Feb  2 2012: production, same bits as beta
+//   Feb  5 2012: adjusted definitions of uint* to be more portable
+//   Mar 30 2012: 3 bytes/cycle, not 4.  Alpha was 4 but wasn't thorough enough.
+//   August 5 2012: SpookyV2 (different results)
+// 
+// Up to 3 bytes/cycle for long messages.  Reasonably fast for short messages.
+// All 1 or 2 bit deltas achieve avalanche within 1% bias per output bit.
+//
+// This was developed for and tested on 64-bit x86-compatible processors.
+// It assumes the processor is little-endian.  There is a macro
+// controlling whether unaligned reads are allowed (by default they are).
+// This should be an equally good hash on big-endian machines, but it will
+// compute different results on them than on little-endian machines.
+//
+// Google's CityHash has similar specs to SpookyHash, and CityHash is faster
+// on new Intel boxes.  MD4 and MD5 also have similar specs, but they are orders
+// of magnitude slower.  CRCs are two or more times slower, but unlike 
+// SpookyHash, they have nice math for combining the CRCs of pieces to form 
+// the CRCs of wholes.  There are also cryptographic hashes, but those are even 
+// slower than MD5.
+//
+
+#include <stddef.h>
+
+#ifdef _MSC_VER
+# define INLINE __forceinline
+  typedef  unsigned __int64 uint64;
+  typedef  unsigned __int32 uint32;
+  typedef  unsigned __int16 uint16;
+  typedef  unsigned __int8  uint8;
+#else
+# include <stdint.h>
+# define INLINE inline
+  typedef  uint64_t  uint64;
+  typedef  uint32_t  uint32;
+  typedef  uint16_t  uint16;
+  typedef  uint8_t   uint8;
+#endif
+
+
+class SpookyHash
+{
+public:
+    //
+    // SpookyHash: hash a single message in one call, produce 128-bit output
+    //
+    static void Hash128(
+        const void *message,  // message to hash
+        size_t length,        // length of message in bytes
+        uint64 *hash1,        // in/out: in seed 1, out hash value 1
+        uint64 *hash2);       // in/out: in seed 2, out hash value 2
+
+    //
+    // Hash64: hash a single message in one call, return 64-bit output
+    //
+    static uint64 Hash64(
+        const void *message,  // message to hash
+        size_t length,        // length of message in bytes
+        uint64 seed)          // seed
+    {
+        uint64 hash1 = seed;
+        Hash128(message, length, &hash1, &seed);   // seed is reused as seed 2; that output half is dropped
+        return hash1;
+    }
+
+    //
+    // Hash32: hash a single message in one call, produce 32-bit output
+    //
+    static uint32 Hash32(
+        const void *message,  // message to hash
+        size_t length,        // length of message in bytes
+        uint32 seed)          // seed
+    {
+        uint64 hash1 = seed, hash2 = seed;
+        Hash128(message, length, &hash1, &hash2);
+        return (uint32)hash1;   // low 32 bits of the first output word
+    }
+
+    //
+    // Init: initialize the context of a SpookyHash
+    //
+    void Init(
+        uint64 seed1,       // any 64-bit value will do, including 0
+        uint64 seed2);      // different seeds produce independent hashes
+    
+    //
+    // Update: add a piece of a message to a SpookyHash state
+    //
+    void Update(
+        const void *message,  // message fragment
+        size_t length);       // length of message fragment in bytes
+
+
+    //
+    // Final: compute the hash for the current SpookyHash state
+    //
+    // This does not modify the state; you can keep updating it afterward
+    //
+    // The result is the same as if Hash128() had been called with
+    // all the pieces concatenated into one message.
+    //
+    void Final(
+        uint64 *hash1,    // out only: first 64 bits of hash value.
+        uint64 *hash2);   // out only: second 64 bits of hash value.
+
+    //
+    // left rotate a 64-bit value by k bits (k must be 1..63; k == 0 would shift by 64)
+    //
+    static INLINE uint64 Rot64(uint64 x, int k)
+    {
+        return (x << k) | (x >> (64 - k));
+    }
+
+    //
+    // This is used if the input is 192 bytes (sc_bufSize) long or longer.
+    //
+    // The internal state is fully overwritten every 96 bytes.
+    // Every input bit appears to cause at least 128 bits of entropy
+    // before 96 other bytes are combined, when run forward or backward
+    //   For every input bit,
+    //   Two inputs differing in just that input bit
+    //   Where "differ" means xor or subtraction
+    //   And the base value is random
+    //   When run forward or backwards one Mix
+    // I tried 3 pairs of each; they all differed by at least 212 bits.
+    //
+    static INLINE void Mix(
+        const uint64 *data, 
+        uint64 &s0, uint64 &s1, uint64 &s2, uint64 &s3,
+        uint64 &s4, uint64 &s5, uint64 &s6, uint64 &s7,
+        uint64 &s8, uint64 &s9, uint64 &s10,uint64 &s11)
+    {
+      s0 += data[0];    s2 ^= s10;    s11 ^= s0;    s0 = Rot64(s0,11);    s11 += s1;
+      s1 += data[1];    s3 ^= s11;    s0 ^= s1;    s1 = Rot64(s1,32);    s0 += s2;
+      s2 += data[2];    s4 ^= s0;    s1 ^= s2;    s2 = Rot64(s2,43);    s1 += s3;
+      s3 += data[3];    s5 ^= s1;    s2 ^= s3;    s3 = Rot64(s3,31);    s2 += s4;
+      s4 += data[4];    s6 ^= s2;    s3 ^= s4;    s4 = Rot64(s4,17);    s3 += s5;
+      s5 += data[5];    s7 ^= s3;    s4 ^= s5;    s5 = Rot64(s5,28);    s4 += s6;
+      s6 += data[6];    s8 ^= s4;    s5 ^= s6;    s6 = Rot64(s6,39);    s5 += s7;
+      s7 += data[7];    s9 ^= s5;    s6 ^= s7;    s7 = Rot64(s7,57);    s6 += s8;
+      s8 += data[8];    s10 ^= s6;    s7 ^= s8;    s8 = Rot64(s8,55);    s7 += s9;
+      s9 += data[9];    s11 ^= s7;    s8 ^= s9;    s9 = Rot64(s9,54);    s8 += s10;
+      s10 += data[10];    s0 ^= s8;    s9 ^= s10;    s10 = Rot64(s10,22);    s9 += s11;
+      s11 += data[11];    s1 ^= s9;    s10 ^= s11;    s11 = Rot64(s11,46);    s10 += s0;
+    }
+
+    //
+    // Mix all 12 inputs together so that h0, h1 are a hash of them all.
+    //
+    // For two inputs differing in just the input bits
+    // Where "differ" means xor or subtraction
+    // And the base value is random, or a counting value starting at that bit
+    // The final result will have each bit of h0, h1 flip
+    // For every input bit,
+    // with probability 50 +- .3%
+    // For every pair of input bits,
+    // with probability 50 +- 3%
+    //
+    // This does not rely on the last Mix() call having already mixed some.
+    // Two iterations was almost good enough for a 64-bit result, but a
+    // 128-bit result is reported, so End() does three iterations.
+    //
+    static INLINE void EndPartial(
+        uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3,
+        uint64 &h4, uint64 &h5, uint64 &h6, uint64 &h7, 
+        uint64 &h8, uint64 &h9, uint64 &h10,uint64 &h11)
+    {
+        h11+= h1;    h2 ^= h11;   h1 = Rot64(h1,44);
+        h0 += h2;    h3 ^= h0;    h2 = Rot64(h2,15);
+        h1 += h3;    h4 ^= h1;    h3 = Rot64(h3,34);
+        h2 += h4;    h5 ^= h2;    h4 = Rot64(h4,21);
+        h3 += h5;    h6 ^= h3;    h5 = Rot64(h5,38);
+        h4 += h6;    h7 ^= h4;    h6 = Rot64(h6,33);
+        h5 += h7;    h8 ^= h5;    h7 = Rot64(h7,10);
+        h6 += h8;    h9 ^= h6;    h8 = Rot64(h8,13);
+        h7 += h9;    h10^= h7;    h9 = Rot64(h9,38);
+        h8 += h10;   h11^= h8;    h10= Rot64(h10,53);
+        h9 += h11;   h0 ^= h9;    h11= Rot64(h11,42);
+        h10+= h0;    h1 ^= h10;   h0 = Rot64(h0,54);
+    }
+
+    static INLINE void End(
+        const uint64 *data, 
+        uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3,
+        uint64 &h4, uint64 &h5, uint64 &h6, uint64 &h7, 
+        uint64 &h8, uint64 &h9, uint64 &h10,uint64 &h11)
+    {
+        h0 += data[0];   h1 += data[1];   h2 += data[2];   h3 += data[3];
+        h4 += data[4];   h5 += data[5];   h6 += data[6];   h7 += data[7];
+        h8 += data[8];   h9 += data[9];   h10 += data[10]; h11 += data[11];
+        EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+        EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+        EndPartial(h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,h10,h11);
+    }
+
+    //
+    // The goal is for each bit of the input to expand into 128 bits of 
+    //   apparent entropy before it is fully overwritten.
+    // n trials both set and cleared at least m bits of h0 h1 h2 h3
+    //   n: 2   m: 29
+    //   n: 3   m: 46
+    //   n: 4   m: 57
+    //   n: 5   m: 107
+    //   n: 6   m: 146
+    //   n: 7   m: 152
+    // when run forwards or backwards
+    // for all 1-bit and 2-bit diffs
+    // with diffs defined by either xor or subtraction
+    // with a base of all zeros plus a counter, or plus another bit, or random
+    //
+    static INLINE void ShortMix(uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3)
+    {
+        h2 = Rot64(h2,50);  h2 += h3;  h0 ^= h2;
+        h3 = Rot64(h3,52);  h3 += h0;  h1 ^= h3;
+        h0 = Rot64(h0,30);  h0 += h1;  h2 ^= h0;
+        h1 = Rot64(h1,41);  h1 += h2;  h3 ^= h1;
+        h2 = Rot64(h2,54);  h2 += h3;  h0 ^= h2;
+        h3 = Rot64(h3,48);  h3 += h0;  h1 ^= h3;
+        h0 = Rot64(h0,38);  h0 += h1;  h2 ^= h0;
+        h1 = Rot64(h1,37);  h1 += h2;  h3 ^= h1;
+        h2 = Rot64(h2,62);  h2 += h3;  h0 ^= h2;
+        h3 = Rot64(h3,34);  h3 += h0;  h1 ^= h3;
+        h0 = Rot64(h0,5);   h0 += h1;  h2 ^= h0;
+        h1 = Rot64(h1,36);  h1 += h2;  h3 ^= h1;
+    }
+
+    //
+    // Mix all 4 inputs together so that h0, h1 are a hash of them all.
+    //
+    // For two inputs differing in just the input bits
+    // Where "differ" means xor or subtraction
+    // And the base value is random, or a counting value starting at that bit
+    // The final result will have each bit of h0, h1 flip
+    // For every input bit,
+    // with probability 50 +- .3% (it is probably better than that)
+    // For every pair of input bits,
+    // with probability 50 +- .75% (the worst case is approximately that)
+    //
+    static INLINE void ShortEnd(uint64 &h0, uint64 &h1, uint64 &h2, uint64 &h3)
+    {
+        h3 ^= h2;  h2 = Rot64(h2,15);  h3 += h2;
+        h0 ^= h3;  h3 = Rot64(h3,52);  h0 += h3;
+        h1 ^= h0;  h0 = Rot64(h0,26);  h1 += h0;
+        h2 ^= h1;  h1 = Rot64(h1,51);  h2 += h1;
+        h3 ^= h2;  h2 = Rot64(h2,28);  h3 += h2;
+        h0 ^= h3;  h3 = Rot64(h3,9);   h0 += h3;
+        h1 ^= h0;  h0 = Rot64(h0,47);  h1 += h0;
+        h2 ^= h1;  h1 = Rot64(h1,54);  h2 += h1;
+        h3 ^= h2;  h2 = Rot64(h2,32);  h3 += h2;
+        h0 ^= h3;  h3 = Rot64(h3,25);  h0 += h3;
+        h1 ^= h0;  h0 = Rot64(h0,63);  h1 += h0;
+    }
+    
+private:
+
+    //
+    // Short is used for messages under 192 bytes in length
+    // Short has a low startup cost, the normal mode is good for long
+    // keys, the cost crossover is at about 192 bytes.  The two modes were
+    // held to the same quality bar.
+    // 
+    static void Short(
+        const void *message,  // message (array of bytes, not necessarily aligned)
+        size_t length,        // length of message (in bytes)
+        uint64 *hash1,        // in/out: in the seed, out the hash value
+        uint64 *hash2);       // in/out: in the seed, out the hash value
+
+    // number of uint64's in internal state
+    static const size_t sc_numVars = 12;
+
+    // size of the internal state, in bytes (96)
+    static const size_t sc_blockSize = sc_numVars*8;
+
+    // size of buffer of unhashed data, in bytes (192)
+    static const size_t sc_bufSize = 2*sc_blockSize;
+
+    //
+    // sc_const: a constant which:
+    //  * is not zero
+    //  * is odd
+    //  * is a not-very-regular mix of 1's and 0's
+    //  * does not need any other special mathematical properties
+    //
+    static const uint64 sc_const = 0xdeadbeefdeadbeefLL;
+
+    uint64 m_data[2*sc_numVars];   // unhashed data, for partial messages
+    uint64 m_state[sc_numVars];  // internal state of the hash
+    size_t m_length;             // total length of the input so far
+    uint8  m_remainder;          // length of unhashed data stashed in m_data
+};
+
+
+
diff --git a/src/tests/hash_functions/validation/SpookyV2Test.cpp b/src/tests/hash_functions/validation/SpookyV2Test.cpp
new file mode 100644
index 000000000..3b9e6826f
--- /dev/null
+++ b/src/tests/hash_functions/validation/SpookyV2Test.cpp
@@ -0,0 +1,52 @@
+#include "SpookyV2.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 32-bit result: hash with the two seed words from state, keep the low 32 bits of the first output word.
+void SpookyHash32_with_state_test(const void *key, size_t len, const void *state, void *out) {
+  const uint64_t *seeds = (const uint64_t *)state;
+  uint64_t lo = seeds[0], hi = seeds[1];
+  SpookyHash::Hash128(key, len, &lo, &hi);
+  *(uint32_t *)out = (uint32_t)lo;
+}
+
+// 64-bit result: seed out[0] from state[0], hash in place, and discard the second output word.
+void SpookyHash64_with_state_test(const void *key, size_t len, const void *state, void *out) {
+  uint64_t *result = (uint64_t *)out;
+  *result = ((const uint64_t *)state)[0];
+  uint64_t discarded = ((const uint64_t *)state)[1];
+  SpookyHash::Hash128(key, len, result, &discarded);
+}
+
+// 128-bit result: copy both seed words into out, then hash in place so out[0..1] hold the digest.
+void SpookyHash128_with_state_test(const void *key, size_t len, const void *state, void *out) {
+  uint64_t *result = (uint64_t *)out;
+  const uint64_t *seeds = (const uint64_t *)state;
+  for (int w = 0; w < 2; ++w) result[w] = seeds[w];
+  SpookyHash::Hash128(key, len, &result[0], &result[1]);
+}
+
+// Expand a 32-, 64- or 128-bit seed into the two 64-bit state words.
+void SpookyHash_seed_state_test(int in_bits, const void *seed, void *state) {
+    uint64_t *words = (uint64_t *)state;
+    switch (in_bits) {
+    case 32:
+        words[0] = words[1] = *(const uint32_t *)seed;
+        break;
+    case 64:
+        words[0] = words[1] = *(const uint64_t *)seed;
+        break;
+    case 128:
+        words[0] = ((const uint64_t *)seed)[0];
+        words[1] = ((const uint64_t *)seed)[1];
+        break;
+    default: break;   // any other width leaves state untouched
+    }
+}
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/tests/hash_functions/validation/generate_hash_arrays.cpp b/src/tests/hash_functions/validation/generate_hash_arrays.cpp
new file mode 100644
index 000000000..bcd95dfb3
--- /dev/null
+++ b/src/tests/hash_functions/validation/generate_hash_arrays.cpp
@@ -0,0 +1,174 @@
+#include <iostream>
+#include <fstream>
+
+extern "C" {
+  #include "nmhash.h"
+  #include "nmhash_scalar.h"
+  #include "pengyhash.h"
+  #include "waterhash.h"
+}
+
+#include "SpookyV2.h"
+
+// 32-bit result: hash with the two seed words from state, keep the low 32 bits of the first output word.
+void SpookyHash32_with_state_test(const void *key, size_t len, const void *state, void *out) {
+  const uint64_t *seeds = (const uint64_t *)state;
+  uint64_t lo = seeds[0], hi = seeds[1];
+  SpookyHash::Hash128(key, len, &lo, &hi);
+  *(uint32_t *)out = (uint32_t)lo;
+}
+
+// 64-bit result: seed out[0] from state[0], hash in place, and discard the second output word.
+void SpookyHash64_with_state_test(const void *key, size_t len, const void *state, void *out) {
+  uint64_t *result = (uint64_t *)out;
+  *result = ((const uint64_t *)state)[0];
+  uint64_t discarded = ((const uint64_t *)state)[1];
+  SpookyHash::Hash128(key, len, result, &discarded);
+}
+
+// 128-bit result: copy both seed words into out, then hash in place so out[0..1] hold the digest.
+void SpookyHash128_with_state_test(const void *key, size_t len, const void *state, void *out) {
+  uint64_t *result = (uint64_t *)out;
+  const uint64_t *seeds = (const uint64_t *)state;
+  for (int w = 0; w < 2; ++w) result[w] = seeds[w];
+  SpookyHash::Hash128(key, len, &result[0], &result[1]);
+}
+
+// Expand a 32-, 64- or 128-bit seed into the two 64-bit state words.
+void SpookyHash_seed_state_test(int in_bits, const void *seed, void *state) {
+    uint64_t *words = (uint64_t *)state;
+    switch (in_bits) {
+    case 32:
+        words[0] = words[1] = *(const uint32_t *)seed;
+        break;
+    case 64:
+        words[0] = words[1] = *(const uint64_t *)seed;
+        break;
+    case 128:
+        words[0] = ((const uint64_t *)seed)[0];
+        words[1] = ((const uint64_t *)seed)[1];
+        break;
+    default: break;   // any other width leaves state untouched
+    }
+}
+
+using namespace std;
+
+static const int SIZE = 2048;                          // number of key bytes; hashes cover prefix lengths 0..SIZE
+char key_array[SIZE] = {};                             // key bytes: zero-initialised static storage, nothing to free
+static const uint32_t NM_SEED = 0xdeadbeef;            // seed passed to NMHASH32 and NMHASH32X
+static const uint64_t WATER_SEED = 0xdeadbeef1eadbeef; // seed passed to waterhash
+static const uint32_t PENGY_SEED = 0xdeadbeef;         // seed passed to pengyhash
+static const uint64_t SPOOKY_SEED[2] = { WATER_SEED, WATER_SEED };   // both 64-bit SpookyHash seed words
+
+// Load SIZE key bytes from key_array.bin into key_array; returns 1 if the file is missing or too short.
+int read_keys(){
+    std::ifstream fin( "key_array.bin", ios::in | ios::binary );
+    if (!fin){
+        cout << "Cannot open key_array.bin!" << endl;
+        return 1;
+    }
+    // read() sets failbit on a short read; hashing a partly filled buffer would yield meaningless reference data
+    if (!fin.read(key_array, SIZE)){ cout << "Cannot read " << SIZE << " bytes from key_array.bin!" << endl; return 1; }
+    return 0;   // fin is closed by its destructor
+}
+
+// Write NMHASH32 of every prefix key_array[0..len), len = 0..SIZE, to c_nmhash32_array.bin; 1 if it cannot be opened.
+int write_nmhash32(){
+    std::ofstream out( "c_nmhash32_array.bin", ios::out | ios::binary );
+    if (out.fail()){
+        cout << "Cannot open c_nmhash32_array.bin!" << endl;
+        return 1;
+    }
+    // one raw 4-byte hash per prefix length, shortest prefix first
+    size_t len = 0;
+    while( len <= SIZE ){
+        const uint32_t digest = NMHASH32((void *) key_array, len, NM_SEED);
+        out.write((const char *) &digest, sizeof digest);
+        len += 1;
+    }
+    out.close();
+    return 0;
+}
+
+// Write NMHASH32X of every prefix key_array[0..len), len = 0..SIZE, to c_nmhash32x_array.bin; 1 if it cannot be opened.
+int write_nmhash32x(){
+    std::ofstream out( "c_nmhash32x_array.bin", ios::out | ios::binary );
+    if (out.fail()){
+        cout << "Cannot open c_nmhash32x_array.bin!" << endl;
+        return 1;
+    }
+    // one raw 4-byte hash per prefix length, shortest prefix first
+    size_t len = 0;
+    while( len <= SIZE ){
+        const uint32_t digest = NMHASH32X((void *) key_array, len, NM_SEED);
+        out.write((const char *) &digest, sizeof digest);
+        len += 1;
+    }
+    out.close();
+    return 0;
+}
+
+// Write waterhash of every prefix key_array[0..len), len = 0..SIZE, to c_water_hash_array.bin; 1 if it cannot be opened.
+int write_water(){
+    std::ofstream out( "c_water_hash_array.bin", ios::out | ios::binary );
+    if (out.fail()){
+        cout << "Cannot open c_water_hash_array.bin!" << endl;
+        return 1;
+    }
+    // one raw 4-byte hash per prefix length, shortest prefix first
+    uint32_t len = 0;
+    while( len <= SIZE ){
+        const uint32_t digest = waterhash((void *) key_array, len, WATER_SEED);
+        out.write((const char *) &digest, sizeof digest);
+        len += 1;
+    }
+    out.close();
+    return 0;
+}
+
+// Write pengyhash of every prefix key_array[0..len), len = 0..SIZE, to c_pengy_hash_array.bin; 1 if it cannot be opened.
+int write_pengy(){
+    std::ofstream out( "c_pengy_hash_array.bin", ios::out | ios::binary );
+    if (out.fail()){
+        cout << "Cannot open c_pengy_hash_array.bin!" << endl;
+        return 1;
+    }
+    // one raw 8-byte hash per prefix length, shortest prefix first
+    size_t len = 0;
+    while( len <= SIZE ){
+        const uint64_t digest = pengyhash((void *) key_array, len, PENGY_SEED);
+        out.write((const char *) &digest, sizeof digest);
+        len += 1;
+    }
+    out.close();
+    return 0;
+}
+
+// Write the 128-bit SpookyHash of every prefix key_array[0..len), len = 0..SIZE, to c_spooky_hash_array.bin.
+int write_spooky(){
+    std::ofstream out( "c_spooky_hash_array.bin", ios::out | ios::binary );
+    if (out.fail()){
+        cout << "Cannot open c_spooky_hash_array.bin!" << endl;
+        return 1;
+    }
+    // one raw 16-byte digest (two 64-bit words) per prefix length, shortest prefix first
+    uint64_t digest[2];
+    size_t len = 0;
+    while( len <= SIZE ){
+        SpookyHash128_with_state_test(key_array, len, SPOOKY_SEED, digest);
+        out.write((const char *) digest, sizeof digest); len += 1;
+    }
+    out.close();
+    return 0;
+}
+
+int main(){   // load the keys, then run each generator in order; stop at the first step that returns 1
+    const bool failed = read_keys() == 1
+        || write_nmhash32() == 1
+        || write_nmhash32x() == 1
+        || write_water() == 1
+        || write_pengy() == 1
+        || write_spooky() == 1;
+    return failed ? 1 : 0;
+}
diff --git a/src/tests/hash_functions/validation/generate_key_array.f90 b/src/tests/hash_functions/validation/generate_key_array.f90
new file mode 100644
index 000000000..40b43a043
--- /dev/null
+++ b/src/tests/hash_functions/validation/generate_key_array.f90
@@ -0,0 +1,22 @@
+program generate_key_array
+!! Writes 2048 pseudo-random bytes to key_array.bin, the key file read by the hash validation programs.
+    use, intrinsic :: iso_fortran_env, only: int8, int32, int64, real64
+    implicit none
+    integer        :: i, lun
+    integer(int8)  :: key_array(2048)
+    integer(int32) :: dummy(512)
+    real(real64)   :: rand(512)
+
+! Create key array: scale each uniform [0,1) sample onto the int32 range, then reinterpret the words as bytes
+    call random_number( rand )
+    do i=1, 512
+        dummy(i) = floor( rand(i) * 2_int64**32 - 2_int64**31, kind=int32 )
+    end do
+    key_array = transfer( dummy, 0_int8, 2048 )
+
+    open(newunit=lun, file="key_array.bin", form="unformatted", &
+        access="stream", status="new", action="write")  ! status="new": an existing key file is not overwritten
+    write(lun) key_array
+    close(lun)
+
+end program generate_key_array
diff --git a/src/tests/hash_functions/validation/hash_validity_test.f90 b/src/tests/hash_functions/validation/hash_validity_test.f90
new file mode 100644
index 000000000..86d2cc3df
--- /dev/null
+++ b/src/tests/hash_functions/validation/hash_validity_test.f90
@@ -0,0 +1,123 @@
+!! HASH_VALIDITY_TEST processes a vector of eight-bit integers,
+!! extracting subvectors of length 0, 1, 2, ... 2048 from the beginning,
+!! hashing each subvector and comparing the resulting hash with the
+!! corresponding hash produced by the original C/C++ code, stopping with
+!! an error if they differ. As the original C/C++ code was typically developed
+!! for Little-Endian machines the testing should only be done on such
+!! machines. The Fortran codes also assume two's complement integers.
+!! The code set assumes that C's int32_t and int64_t have the same
+!! representation as Fortran's int32 and int64 respectively.
+
+program hash_validity_test
+
+    use, intrinsic :: iso_fortran_env, only: int8, int32, int64, real64
+    use stdlib_32_bit_hash_functions, only: &
+        little_endian, &
+        nmhash32,      &
+        nmhash32x,     &
+        water_hash
+    use stdlib_64_bit_hash_functions, only: &
+        pengy_hash, &
+        spooky_hash
+    implicit none
+    integer(int32), parameter :: nm_seed = int( z'deadbeef', int32 )
+    integer(int64), parameter :: water_seed = int( z'deadbeef1eadbeef', int64 )
+    integer(int32), parameter :: pengy_seed = int( z'deadbeef', int32 )
+    integer(int64), parameter :: spooky_seed(2) = [ water_seed, water_seed ]
+    integer        :: index
+    integer        :: lun
+    integer(int8)  :: key_array(2048)
+    integer(int32) :: c_nmhash32(0:2048)
+    integer(int32) :: c_nmhash32x(0:2048)
+    integer(int32) :: c_water_hash(0:2048)
+    integer(int64) :: c_pengy_hash(0:2048)
+    integer(int64) :: c_spooky_hash(0:1, 0:2048)
+
+
+    ! Test for endianness
+    if ( .not. little_endian ) then
+        stop "The processor is not Little-Endian"
+    end if
+
+    ! Read key array used to generate hash array
+    open(newunit=lun, file="key_array.bin", form="unformatted", &
+        access="stream", status="old", action="read")
+    read(lun) key_array
+    close(lun)
+
+    ! Read hash array generated from key array by the C version of nmhash32
+    open(newunit=lun, file="c_nmhash32_array.bin", form="unformatted", &
+        access="stream", status="old", action="read")
+    read(lun) c_nmhash32
+    close(lun)
+
+    do index=0, 2048
+        if ( c_nmhash32(index) /= nmhash32(key_array(1:index), nm_seed) ) then
+            write(*,'("NMHASH32 failed for KEY_ARRAY(1:", I0, ")")') index
+            error stop "NMHASH32 is invalid."  ! error stop: a mismatch must end with a failure status
+        end if
+    end do
+    write(*,*) "NMHASH32 is valid."
+
+    ! Read hash array generated from key array by the C version of nmhash32x
+    open(newunit=lun, file="c_nmhash32x_array.bin", form="unformatted", &
+        access="stream", status="old", action="read")
+    read(lun) c_nmhash32x
+    close(lun)
+
+    do index=0, 2048
+        if ( c_nmhash32x(index) /= nmhash32x(key_array(1:index), nm_seed) ) then
+            write(*,'("NMHASH32X failed for KEY_ARRAY(1:", I0, ")")') index
+            error stop "NMHASH32X is invalid."
+        end if
+    end do
+    write(*,*) "NMHASH32X is valid."
+
+    ! Read hash array generated from key array by the C version of water hash
+    open(newunit=lun, file="c_water_hash_array.bin", form="unformatted", &
+        access="stream", status="old", action="read")
+    read(lun) c_water_hash
+    close(lun)
+
+    do index=0, 2048
+        if ( c_water_hash(index) /= &
+             water_hash(key_array(1:index), water_seed) ) then
+            write(*,'("WATER_HASH failed for KEY_ARRAY(1:", I0, ")")') index
+            error stop "WATER_HASH is invalid."
+        end if
+    end do
+    write(*,*) "WATER_HASH is valid."
+
+    ! Read hash array generated from key array by the C version of pengy hash
+    open(newunit=lun, file="c_pengy_hash_array.bin", form="unformatted", &
+        access="stream", status="old", action="read")
+    read(lun) c_pengy_hash
+    close(lun)
+
+    do index=0, 2048
+        if ( c_pengy_hash(index) /= &
+             pengy_hash(key_array(1:index), pengy_seed) ) then
+            write(*,'("PENGY_HASH failed for KEY_ARRAY(1:", I0, ")")') index
+            error stop "PENGY_HASH is invalid."
+        end if
+    end do
+    write(*,*) "PENGY_HASH is valid."
+
+    ! Read hash array generated from key array by the C version of Spooky hash
+    open(newunit=lun, file="c_spooky_hash_array.bin", form="unformatted", &
+        access="stream", status="old", action="read")
+    do index=0, 2048
+        read(lun) c_spooky_hash(:, index)
+    end do
+    close(lun)
+
+    do index=0, 2048
+        if ( .not. all( c_spooky_hash(:,index) == &
+                        spooky_hash(key_array(1:index), spooky_seed) ) ) then
+            write(*,'("SPOOKY_HASH failed for KEY_ARRAY(1:", I0, ")")') index
+            error stop "SPOOKY_HASH is invalid."
+        end if
+    end do
+    write(*,*) "SPOOKY_HASH is valid."
+
+end program hash_validity_test
diff --git a/src/tests/hash_functions/validation/nmhash.c b/src/tests/hash_functions/validation/nmhash.c
new file mode 100644
index 000000000..987bc568c
--- /dev/null
+++ b/src/tests/hash_functions/validation/nmhash.c
@@ -0,0 +1,8 @@
+#include "nmhash.h"
+int32_t nmhash32_test ( const void * key, size_t len, uint32_t seed ) {  /* forwards to NMHASH32, result returned as int32_t */
+  return (int32_t) NMHASH32 (key, len, seed);
+}
+
+int32_t nmhash32x_test ( const void * key, size_t len, uint32_t seed ) {  /* forwards to NMHASH32X, result returned as int32_t */
+  return (int32_t) NMHASH32X (key, len, seed);
+}
diff --git a/src/tests/hash_functions/validation/nmhash.h b/src/tests/hash_functions/validation/nmhash.h
new file mode 100644
index 000000000..21bb90022
--- /dev/null
+++ b/src/tests/hash_functions/validation/nmhash.h
@@ -0,0 +1,832 @@
+/*
+ * verification:
+ * NMHASH32:
+ *   rurban/smhasher: 0x12A30553
+ *   demerphq/smhasher: 0x3D8F6C47
+ * NMHASH32X:
+ *   rurban/smhasher: 0xA8580227
+ *   demerphq/smhasher: 0x40B451B3
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _nmhash_h_
+#define _nmhash_h_
+
+#define NMH_VERSION 2
+
+#ifdef _MSC_VER
+#  pragma warning(push, 3)
+#endif
+
+#if defined(__cplusplus) && __cplusplus < 201103L
+#  define __STDC_CONSTANT_MACROS 1
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define NMH_likely(x) __builtin_expect(x, 1) /* branch hint: x is expected to evaluate to 1 */
+#else
+#    define NMH_likely(x) (x) /* no builtin selected: evaluate x without a hint */
+#endif
+
+#if defined(__has_builtin) /* NMH_rotl32(x, r): rotate the 32-bit value x left by r bits */
+#  if __has_builtin(__builtin_rotateleft32)
+#    define NMH_rotl32 __builtin_rotateleft32 /* clang */
+#  endif
+#endif
+#if !defined(NMH_rotl32) /* fallbacks when the builtin was not selected above */
+#  if defined(_MSC_VER)
+     /* Note: although _rotl exists for MinGW (GCC under Windows), performance seems poor */
+#    define NMH_rotl32(x,r) _rotl(x,r)
+#  else
+#    define NMH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) /* shifts right by 32 - r, so r must be in 1..31 */
+#  endif
+#endif
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* NMH_RESTRICT: per-compiler spelling of the restrict qualifier. Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define NMH_RESTRICT /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define NMH_RESTRICT   restrict
+#elif defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+#  define NMH_RESTRICT __restrict__
+#elif defined(__cplusplus) && defined(_MSC_VER)
+#  define NMH_RESTRICT __restrict
+#else
+#  define NMH_RESTRICT   /* disable: expands to nothing */
+#endif
+
+/* endian macros: NMHASH_LITTLE_ENDIAN is 1 for little-endian targets and 0 for big-endian ones */
+#ifndef NMHASH_LITTLE_ENDIAN /* a definition supplied before this point is kept as is */
+#  if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || defined(__x86_64__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__SDCC)
+#    define NMHASH_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define NMHASH_LITTLE_ENDIAN 0
+#  else
+#    warning could not determine endianness! Falling back to little endian.
+#    define NMHASH_LITTLE_ENDIAN 1
+#  endif
+#endif
+
+/* vector macros: NMH_VECTOR selects which implementation of the mixers is compiled */
+#define NMH_SCALAR 0
+#define NMH_SSE2   1
+#define NMH_AVX2   2
+#define NMH_AVX512 3
+
+#ifndef NMH_VECTOR    /* can be defined on command line */
+#  if defined(__AVX512BW__)
+#    define NMH_VECTOR NMH_AVX512 /* _mm512_mullo_epi16 requires AVX512BW */
+#  elif defined(__AVX2__)
+#    define NMH_VECTOR NMH_AVX2  /* add '-mno-avx256-split-unaligned-load' and '-mno-avx256-split-unaligned-store' for gcc */
+#  elif defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || (defined(_M_IX86_FP) && (_M_IX86_FP == 2))
+#    define NMH_VECTOR NMH_SSE2
+#  else
+#    define NMH_VECTOR NMH_SCALAR
+#  endif
+#endif
+
+/* align macros: NMH_ALIGN(n) requests n-byte alignment; NMH_ACC_ALIGN is the alignment used for the constant tables and accumulators */
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define NMH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define NMH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define NMH_ALIGN(n)      __declspec(align(n))
+#else
+#  define NMH_ALIGN(n)   /* disabled: expands to nothing */
+#endif
+
+#if NMH_VECTOR > 0 /* any vector implementation: 64 bytes is the size of the widest vector type used (__m512i) */
+#  define NMH_ACC_ALIGN 64
+#elif defined(__BIGGEST_ALIGNMENT__)
+#  define NMH_ACC_ALIGN __BIGGEST_ALIGNMENT__
+#elif defined(__SDCC)
+#  define NMH_ACC_ALIGN 1
+#else
+#  define NMH_ACC_ALIGN 16
+#endif
+
+/* constants */
+
+/* primes from xxh; used as initial lane values and length-dependent seed offsets by the short-input paths */
+#define NMH_PRIME32_1  UINT32_C(0x9E3779B1)
+#define NMH_PRIME32_2  UINT32_C(0x85EBCA77)
+#define NMH_PRIME32_3  UINT32_C(0xC2B2AE3D)
+#define NMH_PRIME32_4  UINT32_C(0x27D4EB2F)
+
+/*! Pseudorandom secret taken directly from FARSH. NMHASH32_long copies it into accX and xors it back in when merging; its element count (32) also fixes the accumulator size. */
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t NMH_ACC_INIT[32] = {
+	UINT32_C(0xB8FE6C39), UINT32_C(0x23A44BBE), UINT32_C(0x7C01812C), UINT32_C(0xF721AD1C),
+	UINT32_C(0xDED46DE9), UINT32_C(0x839097DB), UINT32_C(0x7240A4A4), UINT32_C(0xB7B3671F),
+	UINT32_C(0xCB79E64E), UINT32_C(0xCCC0E578), UINT32_C(0x825AD07D), UINT32_C(0xCCFF7221),
+	UINT32_C(0xB8084674), UINT32_C(0xF743248E), UINT32_C(0xE03590E6), UINT32_C(0x813A264C),
+
+	UINT32_C(0x3C2852BB), UINT32_C(0x91C300CB), UINT32_C(0x88D0658B), UINT32_C(0x1B532EA3),
+	UINT32_C(0x71644897), UINT32_C(0xA20DF94E), UINT32_C(0x3819EF46), UINT32_C(0xA9DEACD8),
+	UINT32_C(0xA8FA763F), UINT32_C(0xE39C343F), UINT32_C(0xF9DCBBC7), UINT32_C(0xC70B4F1D),
+	UINT32_C(0x8A51E04B), UINT32_C(0xCDB45931), UINT32_C(0xC89F7EC9), UINT32_C(0xD9787364),
+};
+
+#if defined(_MSC_VER) && _MSC_VER >= 1914 /* MSVC: silence warning 5045 until the matching pop near the end of this header */
+#  pragma warning(push)
+#  pragma warning(disable: 5045)
+#endif
+#ifdef __SDCC /* SDCC: 'const' is defined away and warnings 110/126 are disabled until the matching restore/#undef near the end of this header */
+#  define const
+#  pragma save
+#  pragma disable_warning 110
+#  pragma disable_warning 126
+#endif
+
+/* read functions */
+static inline
+uint32_t
+NMH_readLE32(const void *const p)
+{	/* Load 4 bytes from p (any alignment, via memcpy) and interpret them as a little-endian 32-bit value. */
+	uint32_t word;
+	memcpy(&word, p, sizeof(word));
+#	if (NMHASH_LITTLE_ENDIAN)
+	return word;	/* host byte order already matches */
+#	elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+	return __builtin_bswap32(word);
+#	elif defined(_MSC_VER)
+	return _byteswap_ulong(word);
+#	else	/* generic byte reversal */
+	return ((word << 24) & 0xff000000) | ((word << 8) & 0xff0000) | ((word >> 8) & 0xff00) | ((word >> 24) & 0xff);
+#	endif
+}
+
+static inline
+uint16_t
+NMH_readLE16(const void *const p)
+{	/* Load 2 bytes from p (any alignment, via memcpy) and interpret them as a little-endian 16-bit value. */
+	uint16_t half;
+	memcpy(&half, p, sizeof(half));
+#	if (NMHASH_LITTLE_ENDIAN)
+	return half;	/* host byte order already matches */
+#	else
+	return (uint16_t)((half >> 8) | (half << 8));	/* swap the two bytes */
+#	endif
+}
+
+static inline
+uint32_t
+NMHASH32_0to8(uint32_t const x, uint32_t const seed2)
+{
+	/* Mixes the word x with seed2 and returns the mixed word: three xor-shift rounds, each followed by independent 16-bit multiplies of the two halves, with seed2 xored in after the second multiply. base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16] = 0.027071104091278835 */
+	const uint32_t m1 = UINT32_C(0x776BF593);
+	const uint32_t m2 = UINT32_C(0x3FB39C65);
+	const uint32_t m3 = UINT32_C(0xE9139917);
+	/* 16-bit lane products are formed in uint32_t: uint16_t operands would be promoted to int, where the product can overflow */
+#	if NMH_VECTOR == NMH_SCALAR
+	{
+		union { uint32_t u32; uint16_t u16[2]; } vx;
+		vx.u32 = x;
+		vx.u32 ^= (vx.u32 >> 12) ^ (vx.u32 >> 6);
+		vx.u16[0] = (uint16_t)((uint32_t)vx.u16[0] * (m1 & 0xFFFF));
+		vx.u16[1] = (uint16_t)((uint32_t)vx.u16[1] * (m1 >> 16));
+		vx.u32 ^= (vx.u32 << 11) ^ ( vx.u32 >> 19);
+		vx.u16[0] = (uint16_t)((uint32_t)vx.u16[0] * (m2 & 0xFFFF));
+		vx.u16[1] = (uint16_t)((uint32_t)vx.u16[1] * (m2 >> 16));
+		vx.u32 ^= seed2;
+		vx.u32 ^= (vx.u32 >> 15) ^ ( vx.u32 >> 9);
+		vx.u16[0] = (uint16_t)((uint32_t)vx.u16[0] * (m3 & 0xFFFF));
+		vx.u16[1] = (uint16_t)((uint32_t)vx.u16[1] * (m3 >> 16));
+		vx.u32 ^= (vx.u32 << 16) ^ ( vx.u32 >> 11);
+		return vx.u32;
+	}
+#	else /* at least NMH_SSE2 */
+	{
+		__m128i hv = _mm_setr_epi32((int)x, 0, 0, 0);
+		const __m128i sv = _mm_setr_epi32((int)seed2, 0, 0, 0);
+		const uint32_t *const result = (const uint32_t*)&hv;
+
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m1, 0, 0, 0));
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 11)), _mm_srli_epi32(hv, 19));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m2, 0, 0, 0));
+
+		hv = _mm_xor_si128(hv, sv);
+
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0));
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 16)), _mm_srli_epi32(hv, 11));
+
+		return *result; /* lowest 32-bit lane of hv */
+	}
+#	endif
+}
+
+#define __NMH_M1 UINT32_C(0xF0D9649B) /* multipliers of the 9-to-255-byte mixer; each 16-bit half multiplies one 16-bit lane */
+#define __NMH_M2 UINT32_C(0x29A7935D)
+#define __NMH_M3 UINT32_C(0x55D35831)
+
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M1_V[32] = { /* __NMH_M1 replicated 32 times: multiplier table for the long-round functions */
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+};
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M2_V[32] = { /* __NMH_M2 replicated 32 times */
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+};
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M3_V[32] = { /* __NMH_M3 replicated 32 times */
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+};
+
+static inline
+uint32_t
+NMHASH32_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed, int const type)
+{
+	/* Hashes the len bytes at p with seed and returns the 32-bit hash. type != 0 selects the 33-to-255-byte path (32-byte blocks, then the last 32 bytes); type == 0 selects the 9-to-32-byte path (four 8-byte windows). base mixer: [f0d9649b  5 -13 29a7935d -9 11 55d35831 -20 -10 ] = 0.93495901789135362 */
+	uint32_t result = 0;
+#	if NMH_VECTOR == NMH_SCALAR
+	{
+		union { uint32_t u32; uint16_t u16[2]; } x[4], y[4];
+		uint32_t const sl = seed + (uint32_t)len;
+		size_t j;
+		x[0].u32 = NMH_PRIME32_1;
+		x[1].u32 = NMH_PRIME32_2;
+		x[2].u32 = NMH_PRIME32_3;
+		x[3].u32 = NMH_PRIME32_4;
+		for (j = 0; j < 4; ++j) y[j].u32 = sl;
+		/* 16-bit lane products below are formed in uint32_t: uint16_t operands would be promoted to int, where the product can overflow */
+		if (type) {
+			/* 33 to 255 bytes */
+			size_t const r = (len - 1) / 32;
+			size_t i;
+			for (i = 0; i < r; ++i) {
+				for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4);
+				for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4 + 16);
+				for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
+
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] = (uint16_t)((uint32_t)x[j].u16[0] * (__NMH_M1 & 0xFFFF));
+					x[j].u16[1] = (uint16_t)((uint32_t)x[j].u16[1] * (__NMH_M1 >> 16));
+				}
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] = (uint16_t)((uint32_t)x[j].u16[0] * (__NMH_M2 & 0xFFFF));
+					x[j].u16[1] = (uint16_t)((uint32_t)x[j].u16[1] * (__NMH_M2 >> 16));
+				}
+
+				for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
+
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] = (uint16_t)((uint32_t)x[j].u16[0] * (__NMH_M3 & 0xFFFF));
+					x[j].u16[1] = (uint16_t)((uint32_t)x[j].u16[1] * (__NMH_M3 >> 16));
+				}
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
+			}
+			for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + len - 32 + j * 4);
+			for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + len - 16 + j * 4);
+		} else {
+			/* 9 to 32 bytes */
+			x[0].u32 ^= NMH_readLE32(p);
+			x[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3));
+			x[2].u32 ^= NMH_readLE32(p + len - 8);
+			x[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3));
+			y[0].u32 ^= NMH_readLE32(p + 4);
+			y[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3) + 4);
+			y[2].u32 ^= NMH_readLE32(p + len - 8 + 4);
+			y[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4);
+		}
+
+		for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
+		for (j = 0; j < 4; ++j) y[j].u32 ^= (y[j].u32 << 17) ^ (y[j].u32 >> 6);
+
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] = (uint16_t)((uint32_t)x[j].u16[0] * (__NMH_M1 & 0xFFFF));
+			x[j].u16[1] = (uint16_t)((uint32_t)x[j].u16[1] * (__NMH_M1 >> 16));
+		}
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] = (uint16_t)((uint32_t)x[j].u16[0] * (__NMH_M2 & 0xFFFF));
+			x[j].u16[1] = (uint16_t)((uint32_t)x[j].u16[1] * (__NMH_M2 >> 16));
+		}
+
+		for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
+
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] = (uint16_t)((uint32_t)x[j].u16[0] * (__NMH_M3 & 0xFFFF));
+			x[j].u16[1] = (uint16_t)((uint32_t)x[j].u16[1] * (__NMH_M3 >> 16));
+		}
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
+
+		x[0].u32 ^= NMH_PRIME32_1;
+		x[1].u32 ^= NMH_PRIME32_2;
+		x[2].u32 ^= NMH_PRIME32_3;
+		x[3].u32 ^= NMH_PRIME32_4;
+
+		for (j = 1; j < 4; ++j) x[0].u32 += x[j].u32;
+
+		x[0].u32 ^= sl + (sl >> 5);
+		x[0].u16[0] = (uint16_t)((uint32_t)x[0].u16[0] * (__NMH_M3 & 0xFFFF));
+		x[0].u16[1] = (uint16_t)((uint32_t)x[0].u16[1] * (__NMH_M3 >> 16));
+		x[0].u32 ^= (x[0].u32 >> 10) ^ (x[0].u32 >> 20);
+
+		result = x[0].u32;
+	}
+#	else /* at least NMH_SSE2 */
+	{
+		__m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, (int)NMH_PRIME32_3, (int)NMH_PRIME32_4);
+		__m128i const sl = _mm_set1_epi32((int)(seed + (uint32_t)len)); /* add as unsigned (as the scalar path does): a signed add could overflow */
+		__m128i const m1 = _mm_set1_epi32((int)__NMH_M1);
+		__m128i const m2 = _mm_set1_epi32((int)__NMH_M2);
+		__m128i const m3 = _mm_set1_epi32((int)__NMH_M3);
+		__m128i       x = h0;
+		__m128i       y = sl;
+		const uint32_t *const px = (const uint32_t*)&x;
+
+		if (type) {
+			/* 33 to 255 bytes */
+			size_t const r = (len - 1) / 32;
+			size_t i;
+			for (i = 0; i < r; ++i) {
+				x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32)));
+				y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + i * 32 + 16)));
+				x = _mm_add_epi32(x, y);
+				x = _mm_mullo_epi16(x, m1);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
+				x = _mm_mullo_epi16(x, m2);
+				x = _mm_xor_si128(x, y);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
+				x = _mm_mullo_epi16(x, m3);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+			}
+			x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + len - 32)));
+			y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + len - 16)));
+		} else {
+			/* 9 to 32 bytes */
+			x = _mm_xor_si128(x, _mm_setr_epi32((int)NMH_readLE32(p), (int)NMH_readLE32(p + ((len>>4)<<3)), (int)NMH_readLE32(p + len - 8), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3))));
+			y = _mm_xor_si128(y, _mm_setr_epi32((int)NMH_readLE32(p + 4), (int)NMH_readLE32(p + ((len>>4)<<3) + 4), (int)NMH_readLE32(p + len - 8 + 4), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4)));
+		}
+
+		x = _mm_add_epi32(x, y);
+
+		y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6));
+
+		x = _mm_mullo_epi16(x, m1);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
+		x = _mm_mullo_epi16(x, m2);
+		x = _mm_xor_si128(x, y);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
+		x = _mm_mullo_epi16(x, m3);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+
+		x = _mm_xor_si128(x, h0);
+		x = _mm_add_epi32(x, _mm_srli_si128(x, 4));
+		x = _mm_add_epi32(x, _mm_srli_si128(x, 8));
+
+		x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5)));
+		x = _mm_mullo_epi16(x, m3);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+
+		result = *px; /* lowest 32-bit lane holds the sum of all four lanes */
+	}
+#	endif
+	return *&result;
+}
+#define NMHASH32_9to32(p, len, seed) NMHASH32_9to255(p, len, seed, 0)
+#define NMHASH32_33to255(p, len, seed) NMHASH32_9to255(p, len, seed, 1)
+
+#undef __NMH_M1
+#undef __NMH_M2
+#undef __NMH_M3
+
+#if NMH_VECTOR == NMH_SCALAR
+#define NMHASH32_long_round NMHASH32_long_round_scalar
+static inline
+void
+NMHASH32_long_round_scalar(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
+{
+	/* One round over the two accumulator arrays: consumes 2 * sizeof(NMH_ACC_INIT) bytes at p (first half into accX, second half into accY) and updates both in place.
+	 * Breadth-first calculation hints some compilers to auto-vectorize the code: on gcc it is about 10x faster than depth-first and reaches about 80% of the manually vectorized code.
+	 * 16-bit lane products are formed in uint32_t: uint16_t operands would be promoted to int, where the product can overflow. */
+	const size_t nbGroups = sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT);
+	size_t i;
+
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= NMH_readLE32(p + i * 4);
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accY[i] ^= NMH_readLE32(p + i * 4 + sizeof(NMH_ACC_INIT));
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] += accY[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accY[i] ^= accX[i] >> 1;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] = (uint16_t)((uint32_t)((uint16_t*)accX)[i] * ((const uint16_t*)__NMH_M1_V)[i]);
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] << 5 ^ accX[i] >> 13;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] = (uint16_t)((uint32_t)((uint16_t*)accX)[i] * ((const uint16_t*)__NMH_M2_V)[i]);
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accY[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] << 11 ^ accX[i] >> 9;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] = (uint16_t)((uint32_t)((uint16_t*)accX)[i] * ((const uint16_t*)__NMH_M3_V)[i]);
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] >> 10 ^ accX[i] >> 20;
+	}
+}
+#endif
+
+#if NMH_VECTOR == NMH_SSE2 /* _NMH_MM_(F) / _NMH_MMW_(F) build the intrinsic names for the selected vector width; _NMH_MM_T is the vector type */
+#  define _NMH_MM_(F) _mm_ ## F
+#  define _NMH_MMW_(F) _mm_ ## F ## 128
+#  define _NMH_MM_T __m128i
+#elif NMH_VECTOR == NMH_AVX2
+#  define _NMH_MM_(F) _mm256_ ## F
+#  define _NMH_MMW_(F) _mm256_ ## F ## 256
+#  define _NMH_MM_T __m256i
+#elif NMH_VECTOR == NMH_AVX512
+#  define _NMH_MM_(F) _mm512_ ## F
+#  define _NMH_MMW_(F) _mm512_ ## F ## 512
+#  define _NMH_MM_T __m512i
+#endif
+
+#if NMH_VECTOR == NMH_SSE2 || NMH_VECTOR == NMH_AVX2 || NMH_VECTOR == NMH_AVX512 /* one definition serves all three vector widths, despite the _sse suffix */
+#  define NMHASH32_long_round NMHASH32_long_round_sse
+#  define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT)))
+static inline
+void
+NMHASH32_long_round_sse(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
+{	/* One round over the two accumulator arrays, processed NMH_VECTOR_NB_GROUP vectors at a time; accX/accY are accessed directly as vectors, p is read with unaligned loads. */
+	const _NMH_MM_T *const NMH_RESTRICT m1    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M1_V;
+	const _NMH_MM_T *const NMH_RESTRICT m2    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M2_V;
+	const _NMH_MM_T *const NMH_RESTRICT m3    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M3_V;
+	      _NMH_MM_T *const              xaccX = (      _NMH_MM_T *             )accX;
+	      _NMH_MM_T *const              xaccY = (      _NMH_MM_T *             )accY;
+	      _NMH_MM_T *const              xp    = (      _NMH_MM_T *             )p;
+	size_t i;
+
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], _NMH_MMW_(loadu_si)(xp + i));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MMW_(loadu_si)(xp + i + NMH_VECTOR_NB_GROUP));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(add_epi32)(xaccX[i], xaccY[i]);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MM_(srli_epi32)(xaccX[i], 1));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m1); /* only the first vector of the table is read; every table entry holds the same multiplier */
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m2);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], xaccY[i]);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m3);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20));
+	}
+}
+#  undef _NMH_MM_
+#  undef _NMH_MMW_
+#  undef _NMH_MM_T
+#  undef NMH_VECTOR_NB_GROUP
+#endif
+
+static
+uint32_t
+NMHASH32_long(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{	/* Hashes len bytes at p in blocks of sizeof(accX) + sizeof(accY) bytes and returns the un-avalanched sum; the last round reads the final block ending at p + len, so len must be at least one block. */
+	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accX[sizeof(NMH_ACC_INIT)/sizeof(*NMH_ACC_INIT)];
+	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accY[sizeof(accX)/sizeof(*accX)];
+	size_t const nbRounds = (len - 1) / (sizeof(accX) + sizeof(accY)); /* full blocks that leave at least one byte for the final round */
+	size_t i;
+	uint32_t sum = 0;
+
+	/* init: accX starts from the secret table, every accY lane from the seed */
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] = NMH_ACC_INIT[i];
+	for (i = 0; i < sizeof(accY)/sizeof(*accY); ++i) accY[i] = seed;
+
+	for (i = 0; i < nbRounds; ++i) {
+		NMHASH32_long_round(accX, accY, p + i * (sizeof(accX) + sizeof(accY)));
+	}
+	NMHASH32_long_round(accX, accY, p + len - (sizeof(accX) + sizeof(accY))); /* final round over the last block; it may overlap the previous one */
+
+	/* merge acc */
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] ^= NMH_ACC_INIT[i];
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) sum += accX[i];
+
+#	if SIZE_MAX > UINT32_C(-1) /* NOTE(review): #if arithmetic uses the widest integer type, so UINT32_C(-1) may not compare as 0xFFFFFFFF and this branch may never be compiled - confirm */
+	sum += (uint32_t)(len >> 32);
+#	endif
+	return sum ^ (uint32_t)len;
+}
+
+static inline
+uint32_t
+NMHASH32_avalanche32(uint32_t const x)
+{
+	/* Finalization mixer that NMHASH32 applies to the result of NMHASH32_long; returns the mixed word. 16-bit lane products are formed in uint32_t: uint16_t operands would be promoted to int, where the product can overflow. [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733 */
+	const uint32_t m1 = UINT32_C(0xCCE5196D);
+	const uint32_t m2 = UINT32_C(0x464BE229);
+	union { uint32_t u32; uint16_t u16[2]; } vx;
+	vx.u32    = x;
+	vx.u32   ^= (vx.u32 >> 8) ^ (vx.u32 >> 21);
+	vx.u16[0] = (uint16_t)((uint32_t)vx.u16[0] * (m1 & 0xFFFF));
+	vx.u16[1] = (uint16_t)((uint32_t)vx.u16[1] * (m1 >> 16));
+	vx.u32   ^= (vx.u32 << 12) ^ (vx.u32 >> 7);
+	vx.u16[0] = (uint16_t)((uint32_t)vx.u16[0] * (m2 & 0xFFFF));
+	vx.u16[1] = (uint16_t)((uint32_t)vx.u16[1] * (m2 >> 16));
+	return vx.u32 ^ (vx.u32 >> 8) ^ (vx.u32 >> 21);
+}
+
+static inline
+uint32_t
+NMHASH32(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
+{	/* Public entry point: returns the 32-bit hash of the len bytes at input for the given seed, dispatching on len to the matching routine. */
+	const uint8_t *const p = (const uint8_t *)input;
+	if (NMH_likely(len <= 32)) {
+		if(NMH_likely(len > 8)) {
+			return NMHASH32_9to32(p, len, seed);
+		}
+		if(NMH_likely(len > 4)) { /* 5 to 8 bytes: first and last 4 bytes (they may overlap) */
+			uint32_t x = NMH_readLE32(p);
+			uint32_t y = NMH_readLE32(p + len - 4) ^ (NMH_PRIME32_4 + 2 + seed);
+			x += y;
+			x ^= x << (len + 7);
+			return NMHASH32_0to8(x, NMH_rotl32(y, 5));
+		} else { /* 0 to 4 bytes: pack the key into one word and fold a length-dependent constant into the seed */
+			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
+			switch (len) {
+				case 0: seed += NMH_PRIME32_2;
+					data.u32 = 0;
+					break;
+				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
+					data.u32 = p[0];
+					break;
+				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
+					data.u32 = NMH_readLE16(p);
+					break;
+				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
+					data.u16[1] = p[2];
+					data.u16[0] = NMH_readLE16(p);
+					break;
+				case 4: seed += NMH_PRIME32_3;
+					data.u32 = NMH_readLE32(p);
+					break;
+				default: return 0; /* not reachable from here: this branch is only entered with len <= 4 */
+			}
+			return NMHASH32_0to8(data.u32 + seed, NMH_rotl32(seed, 5));
+		}
+	}
+	if (NMH_likely(len < 256)) { /* 33 to 255 bytes */
+		return NMHASH32_33to255(p, len, seed);
+	}
+	return NMHASH32_avalanche32(NMHASH32_long(p, len, seed)); /* 256 bytes and more */
+}
+
+static inline
+uint32_t
+NMHASH32X_0to4(uint32_t x, uint32_t const seed)
+{
+	/* Mixes the word x with seed using three 32-bit multiplies; constants [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509 */
+	uint32_t h = x ^ seed;
+	h *= UINT32_C(0xBDAB1EA9);
+	h += NMH_rotl32(seed, 31);
+	h ^= h >> 18;
+	h *= UINT32_C(0xA7896A1B);
+	h ^= h >> 12;
+	h *= UINT32_C(0x83796A2D);
+	h ^= h >> 16;
+	return h;
+}
+
+static inline
+uint32_t
+NMHASH32X_5to8(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	/* - 5 to 8 bytes, taken as two 4-byte reads (first and last 4 bytes, possibly overlapping)
+	 * - mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 */
+	uint32_t const head = NMH_readLE32(p) ^ NMH_PRIME32_3;
+	uint32_t const tail = NMH_readLE32(p + len - 4) ^ seed;
+	uint32_t       h    = head + tail;
+
+	h ^= h >> len;
+	h *= UINT32_C(0x11049A7D);
+	h ^= h >> 23;
+	h *= UINT32_C(0xBCCCDC7B);
+	h ^= NMH_rotl32(tail, 3);
+	h ^= h >> 12;
+	h *= UINT32_C(0x065E9DAD);
+	h ^= h >> 12;
+	return h;
+}
+
+static inline
+uint32_t
+NMHASH32X_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	/* Hashes len bytes at p in two lanes (x/y and a/b), 16 bytes per round, then a length-dependent tail; returns the un-avalanched word.
+	 * - at least 9 bytes
+	 * - base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
+	 * - tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322
+	 */
+	uint32_t x = NMH_PRIME32_3;
+	uint32_t y = seed;
+	uint32_t a = NMH_PRIME32_4;
+	uint32_t b = seed;
+	size_t i, r = (len - 1) / 16; /* full 16-byte rounds; 1 to 16 bytes remain for the tail */
+
+	for (i = 0; i < r; ++i) {
+		x ^= NMH_readLE32(p + i * 16 + 0);
+		y ^= NMH_readLE32(p + i * 16 + 4);
+		x ^= y;
+		x *= UINT32_C(0x11049A7D);
+		x ^= x >> 23;
+		x *= UINT32_C(0xBCCCDC7B);
+		y  = NMH_rotl32(y, 4);
+		x ^= y;
+		x ^= x >> 12;
+		x *= UINT32_C(0x065E9DAD);
+		x ^= x >> 12;
+
+		a ^= NMH_readLE32(p + i * 16 + 8);
+		b ^= NMH_readLE32(p + i * 16 + 12);
+		a ^= b;
+		a *= UINT32_C(0x11049A7D);
+		a ^= a >> 23;
+		a *= UINT32_C(0xBCCCDC7B);
+		b  = NMH_rotl32(b, 3);
+		a ^= b;
+		a ^= a >> 12;
+		a *= UINT32_C(0x065E9DAD);
+		a ^= a >> 12;
+	}
+
+	if (NMH_likely(((uint8_t)len-1) & 8)) { /* 9 to 16 bytes remain */
+		if (NMH_likely(((uint8_t)len-1) & 4)) { /* 13 to 16 bytes remain */
+			a ^= NMH_readLE32(p + r * 16 + 0);
+			b ^= NMH_readLE32(p + r * 16 + 4);
+			a ^= b;
+			a *= UINT32_C(0x11049A7D);
+			a ^= a >> 23;
+			a *= UINT32_C(0xBCCCDC7B);
+			a ^= NMH_rotl32(b, 4);
+			a ^= a >> 12;
+			a *= UINT32_C(0x065E9DAD);
+		} else { /* 9 to 12 bytes remain */
+			a ^= NMH_readLE32(p + r * 16) + b;
+			a ^= a >> 16;
+			a *= UINT32_C(0xA52FB2CD);
+			a ^= a >> 15;
+			a *= UINT32_C(0x551E4D49);
+		}
+
+		x ^= NMH_readLE32(p + len - 8);
+		y ^= NMH_readLE32(p + len - 4);
+		x ^= y;
+		x *= UINT32_C(0x11049A7D);
+		x ^= x >> 23;
+		x *= UINT32_C(0xBCCCDC7B);
+		x ^= NMH_rotl32(y, 3);
+		x ^= x >> 12;
+		x *= UINT32_C(0x065E9DAD);
+	} else { /* 1 to 8 bytes remain */
+		if (NMH_likely(((uint8_t)len-1) & 4)) { /* 5 to 8 bytes remain */
+			a ^= NMH_readLE32(p + r * 16) + b;
+			a ^= a >> 16;
+			a *= UINT32_C(0xA52FB2CD);
+			a ^= a >> 15;
+			a *= UINT32_C(0x551E4D49);
+		}
+		x ^= NMH_readLE32(p + len - 4) + y; /* last 4 bytes of the key; for short tails this overlaps bytes already consumed */
+		x ^= x >> 16;
+		x *= UINT32_C(0xA52FB2CD);
+		x ^= x >> 15;
+		x *= UINT32_C(0x551E4D49);
+	}
+
+	x ^= (uint32_t)len;
+	x ^= NMH_rotl32(a, 27); /* rotate one lane to pass Diff test */
+	x ^= x >> 14;
+	x *= UINT32_C(0x141CC535);
+
+	return x;
+}
+
+static inline
+uint32_t
+NMHASH32X_avalanche32(uint32_t x)
+{
+	/* Finalization mixer: xor-shift by 15, multiply, xor-shift by 15, multiply, xor-shift by 15.
+	 * The two-multiply constants are from skeeto/hash-prospector: [15 d168aaad 15 af723597 15] = 0.15983776156606694
+	 */
+	uint32_t mixed = x ^ (x >> 15);
+	mixed *= UINT32_C(0xD168AAAD);
+	mixed ^= mixed >> 15;
+	mixed *= UINT32_C(0xAF723597);
+	mixed ^= mixed >> 15;
+	return mixed;
+}
+
+/* Public entry point of the NMHASH32X variant: returns the 32-bit hash of the len bytes at input for the given seed. Uses 32*32->32 multiplication for short hash */
+static inline
+uint32_t
+NMHASH32X(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
+{
+	const uint8_t *const p = (const uint8_t *)input;
+	if (NMH_likely(len <= 8)) {
+		if (NMH_likely(len > 4)) {
+			return NMHASH32X_5to8(p, len, seed);
+		} else {
+			/* 0-4 bytes: pack the key into one word and fold a length-dependent constant into the seed */
+			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
+			switch (len) {
+				case 0: seed += NMH_PRIME32_2;
+					data.u32 = 0;
+					break;
+				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
+					data.u32 = p[0];
+					break;
+				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
+					data.u32 = NMH_readLE16(p);
+					break;
+				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
+					data.u16[1] = p[2];
+					data.u16[0] = NMH_readLE16(p);
+					break;
+				case 4: seed += NMH_PRIME32_1;
+					data.u32 = NMH_readLE32(p);
+					break;
+				default: return 0; /* not reachable from here: this branch is only entered with len <= 4 */
+			}
+			return NMHASH32X_0to4(data.u32, seed);
+		}
+	}
+	if (NMH_likely(len < 256)) { /* 9 to 255 bytes */
+		return NMHASH32X_9to255(p, len, seed);
+	}
+	return NMHASH32X_avalanche32(NMHASH32_long(p, len, seed)); /* 256 bytes and more: same long routine as NMHASH32, different finalizer */
+}
+
+#if defined(_MSC_VER) && _MSC_VER >= 1914 /* undo the MSVC warning push made before the read functions */
+#  pragma warning(pop)
+#endif
+#ifdef __SDCC /* undo the SDCC pragma save and the '#define const' made before the read functions */
+#  pragma restore
+#  undef const
+#endif
+
+#endif /* _nmhash_h_ */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/tests/hash_functions/validation/nmhash_scalar.c b/src/tests/hash_functions/validation/nmhash_scalar.c
new file mode 100644
index 000000000..051a65d5f
--- /dev/null
+++ b/src/tests/hash_functions/validation/nmhash_scalar.c
@@ -0,0 +1,8 @@
+#include "nmhash_scalar.h"
+int32_t nmhash32_test ( const void * key, size_t len, uint32_t seed ) { /* external entry point for the header-only NMHASH32 */
+  return (int32_t) NMHASH32 (key, len, seed); /* unsigned hash converted to the signed return type */
+}
+
+int32_t nmhash32x_test ( const void * key, size_t len, uint32_t seed ) { /* external entry point for the header-only NMHASH32X */
+  return (int32_t) NMHASH32X (key, len, seed); /* unsigned hash converted to the signed return type */
+}
diff --git a/src/tests/hash_functions/validation/nmhash_scalar.h b/src/tests/hash_functions/validation/nmhash_scalar.h
new file mode 100644
index 000000000..bee950670
--- /dev/null
+++ b/src/tests/hash_functions/validation/nmhash_scalar.h
@@ -0,0 +1,824 @@
+/*
+ * verification:
+ * NMHASH32:
+ *   rurban/smhasher: 0x12A30553
+ *   demerphq/smhasher: 0x3D8F6C47
+ * NMHASH32X:
+ *   rurban/smhasher: 0xA8580227
+ *   demerphq/smhasher: 0x40B451B3
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _nmhash_h_
+#define _nmhash_h_
+
+#define NMH_VERSION 2
+
+#ifdef _MSC_VER
+#  pragma warning(push, 3)
+#endif
+
+#if defined(__cplusplus) && __cplusplus < 201103L
+#  define __STDC_CONSTANT_MACROS 1
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#if defined(__GNUC__)
+#  if defined(__AVX2__)
+#    include <immintrin.h>
+#  elif defined(__SSE2__)
+#    include <emmintrin.h>
+#  endif
+#elif defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+#ifdef _MSC_VER
+#  pragma warning(pop)
+#endif
+
+#if (defined(__GNUC__) && (__GNUC__ >= 3))  \
+  || (defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 800)) \
+  || defined(__clang__)
+#    define NMH_likely(x) __builtin_expect(x, 1) /* branch hint: x is expected to evaluate to 1 */
+#else
+#    define NMH_likely(x) (x) /* no hint available: expands to the plain expression */
+#endif
+
+#if defined(__has_builtin)
+#  if __has_builtin(__builtin_rotateleft32)
+#    define NMH_rotl32 __builtin_rotateleft32 /* clang */
+#  endif
+#endif
+#if !defined(NMH_rotl32) /* no builtin was selected above: fall back to _rotl or to shifts */
+#  if defined(_MSC_VER)
+     /* Note: although _rotl exists for minGW (GCC under windows), performance seems poor */
+#    define NMH_rotl32(x,r) _rotl(x,r)
+#  else
+#    define NMH_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r)))) /* shift-based rotate left; x and r are each expanded twice */
+#  endif
+#endif
+
+#if ((defined(sun) || defined(__sun)) && __cplusplus) /* Solaris includes __STDC_VERSION__ with C++. Tested with GCC 5.5 */
+#  define NMH_RESTRICT /* disable */
+#elif defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L   /* >= C99 */
+#  define NMH_RESTRICT   restrict /* the C99 keyword itself */
+#elif defined(__cplusplus) && (defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER))
+#  define NMH_RESTRICT __restrict__ /* spelling used for C++ with GCC / clang / Intel */
+#elif defined(__cplusplus) && defined(_MSC_VER)
+#  define NMH_RESTRICT __restrict /* spelling used for C++ with MSVC */
+#else
+#  define NMH_RESTRICT   /* disable */
+#endif
+
+/* endian macros: NMHASH_LITTLE_ENDIAN may be predefined by the build; otherwise it is derived from the compiler macros below */
+#ifndef NMHASH_LITTLE_ENDIAN
+#  if defined(_WIN32) || defined(__LITTLE_ENDIAN__) || defined(__x86_64__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) || defined(__SDCC)
+#    define NMHASH_LITTLE_ENDIAN 1
+#  elif defined(__BIG_ENDIAN__) || (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#    define NMHASH_LITTLE_ENDIAN 0 /* the NMH_readLE* helpers byte-swap in this case */
+#  else
+#    warning could not determine endianness! Falling back to little endian.
+#    define NMHASH_LITTLE_ENDIAN 1
+#  endif
+#endif
+
+/* vector macros: the possible values of NMH_VECTOR, which selects the scalar or an intrinsics code path below */
+#define NMH_SCALAR 0
+#define NMH_SSE2   1
+#define NMH_AVX2   2
+#define NMH_AVX512 3
+
+#ifndef NMH_VECTOR    /* can be defined on command line */
+#    define NMH_VECTOR NMH_SCALAR /* default in this file: scalar code only */
+#endif
+
+/* align macros: NMH_ALIGN(n) expands to an alignment specifier placed in front of a declaration */
+#if defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)   /* C11+ */
+#  include <stdalign.h>
+#  define NMH_ALIGN(n)      alignas(n)
+#elif defined(__GNUC__)
+#  define NMH_ALIGN(n)      __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#  define NMH_ALIGN(n)      __declspec(align(n))
+#else
+#  define NMH_ALIGN(n)   /* disabled */
+#endif
+
+#if NMH_VECTOR > 0 /* NMH_ACC_ALIGN: alignment requested for the accumulator and constant arrays */
+#  define NMH_ACC_ALIGN 64
+#elif defined(__BIGGEST_ALIGNMENT__)
+#  define NMH_ACC_ALIGN __BIGGEST_ALIGNMENT__
+#elif defined(__SDCC)
+#  define NMH_ACC_ALIGN 1
+#else
+#  define NMH_ACC_ALIGN 16
+#endif
+
+/* constants shared by NMHASH32 and NMHASH32X */
+
+/* primes from xxh */
+#define NMH_PRIME32_1  UINT32_C(0x9E3779B1)
+#define NMH_PRIME32_2  UINT32_C(0x85EBCA77)
+#define NMH_PRIME32_3  UINT32_C(0xC2B2AE3D)
+#define NMH_PRIME32_4  UINT32_C(0x27D4EB2F)
+
+/*! Pseudorandom secret taken directly from FARSH. NMHASH32_long copies it into accX at start and XORs it back in before summing. */
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t NMH_ACC_INIT[32] = {
+	UINT32_C(0xB8FE6C39), UINT32_C(0x23A44BBE), UINT32_C(0x7C01812C), UINT32_C(0xF721AD1C),
+	UINT32_C(0xDED46DE9), UINT32_C(0x839097DB), UINT32_C(0x7240A4A4), UINT32_C(0xB7B3671F),
+	UINT32_C(0xCB79E64E), UINT32_C(0xCCC0E578), UINT32_C(0x825AD07D), UINT32_C(0xCCFF7221),
+	UINT32_C(0xB8084674), UINT32_C(0xF743248E), UINT32_C(0xE03590E6), UINT32_C(0x813A264C),
+
+	UINT32_C(0x3C2852BB), UINT32_C(0x91C300CB), UINT32_C(0x88D0658B), UINT32_C(0x1B532EA3),
+	UINT32_C(0x71644897), UINT32_C(0xA20DF94E), UINT32_C(0x3819EF46), UINT32_C(0xA9DEACD8),
+	UINT32_C(0xA8FA763F), UINT32_C(0xE39C343F), UINT32_C(0xF9DCBBC7), UINT32_C(0xC70B4F1D),
+	UINT32_C(0x8A51E04B), UINT32_C(0xCDB45931), UINT32_C(0xC89F7EC9), UINT32_C(0xD9787364),
+};
+
+#if defined(_MSC_VER) && _MSC_VER >= 1914
+#  pragma warning(push)
+#  pragma warning(disable: 5045)
+#endif
+#ifdef __SDCC
+#  define const
+#  pragma save
+#  pragma disable_warning 110
+#  pragma disable_warning 126
+#endif
+
+/* read functions */
+static inline /* Loads the 4 bytes at p and returns them interpreted as a little-endian uint32_t. */
+uint32_t
+NMH_readLE32(const void *const p)
+{
+	uint32_t v;
+	memcpy(&v, p, 4); /* byte-wise copy, so p is not dereferenced as a uint32_t pointer */
+#	if (NMHASH_LITTLE_ENDIAN)
+	return v; /* host order already is little-endian */
+#	elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__clang__)
+	return __builtin_bswap32(v);
+#	elif defined(_MSC_VER)
+	return _byteswap_ulong(v);
+#	else
+	return ((v >> 24) & 0xff) | ((v >> 8) & 0xff00) | ((v << 8) & 0xff0000) | ((v << 24) & 0xff000000); /* portable byte swap */
+#	endif
+}
+
+static inline /* Loads the 2 bytes at p and returns them interpreted as a little-endian uint16_t. */
+uint16_t
+NMH_readLE16(const void *const p)
+{
+	uint16_t v;
+	memcpy(&v, p, 2); /* byte-wise copy, so p is not dereferenced as a uint16_t pointer */
+#	if (NMHASH_LITTLE_ENDIAN)
+	return v; /* host order already is little-endian */
+#	else
+	return (uint16_t)((v << 8) | (v >> 8)); /* swap the two bytes */
+#	endif
+}
+
+static inline /* Mixer used by the 0..8-byte paths of NMHASH32: scrambles x and XORs seed2 in between the second and third multiply. */
+uint32_t
+NMHASH32_0to8(uint32_t const x, uint32_t const seed2)
+{
+	/* base mixer: [-6 -12 776bf593 -19 11 3fb39c65 -15 -9 e9139917 -11 16] = 0.027071104091278835 */
+	const uint32_t m1 = UINT32_C(0x776BF593);
+	const uint32_t m2 = UINT32_C(0x3FB39C65);
+	const uint32_t m3 = UINT32_C(0xE9139917);
+
+#	if NMH_VECTOR == NMH_SCALAR
+	{
+		union { uint32_t u32; uint16_t u16[2]; } vx; /* the same 32 bits viewed whole or as two 16-bit halves */
+		vx.u32 = x;
+		vx.u32 ^= (vx.u32 >> 12) ^ (vx.u32 >> 6);
+		vx.u16[0] *= (uint16_t)m1; /* each 16-bit half is multiplied on its own, as _mm_mullo_epi16 does in the vector branch */
+		vx.u16[1] *= (uint16_t)(m1 >> 16);
+		vx.u32 ^= (vx.u32 << 11) ^ ( vx.u32 >> 19);
+		vx.u16[0] *= (uint16_t)m2;
+		vx.u16[1] *= (uint16_t)(m2 >> 16);
+		vx.u32 ^= seed2; /* the seed enters here */
+		vx.u32 ^= (vx.u32 >> 15) ^ ( vx.u32 >> 9);
+		vx.u16[0] *= (uint16_t)m3;
+		vx.u16[1] *= (uint16_t)(m3 >> 16);
+		vx.u32 ^= (vx.u32 << 16) ^ ( vx.u32 >> 11);
+		return vx.u32;
+	}
+#	else /* at least NMH_SSE2 */
+	{
+		__m128i hv = _mm_setr_epi32((int)x, 0, 0, 0); /* only the first 32-bit element carries data */
+		const __m128i sv = _mm_setr_epi32((int)seed2, 0, 0, 0);
+		const uint32_t *const result = (const uint32_t*)&hv; /* points at the first 32-bit element of hv */
+
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 12)), _mm_srli_epi32(hv, 6));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m1, 0, 0, 0));
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 11)), _mm_srli_epi32(hv, 19));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m2, 0, 0, 0));
+
+		hv = _mm_xor_si128(hv, sv);
+
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_srli_epi32(hv, 15)), _mm_srli_epi32(hv, 9));
+		hv = _mm_mullo_epi16(hv, _mm_setr_epi32((int)m3, 0, 0, 0));
+		hv = _mm_xor_si128(_mm_xor_si128(hv, _mm_slli_epi32(hv, 16)), _mm_srli_epi32(hv, 11));
+
+		return *result;
+	}
+#	endif
+}
+
+#define __NMH_M1 UINT32_C(0xF0D9649B) /* the three multipliers of the 9..255-byte and long-input mixers */
+#define __NMH_M2 UINT32_C(0x29A7935D)
+#define __NMH_M3 UINT32_C(0x55D35831)
+
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M1_V[32] = { /* __NMH_M1 repeated once per accumulator word */
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+	__NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1, __NMH_M1,
+};
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M2_V[32] = { /* __NMH_M2 repeated once per accumulator word */
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+	__NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2, __NMH_M2,
+};
+NMH_ALIGN(NMH_ACC_ALIGN) static const uint32_t __NMH_M3_V[32] = { /* __NMH_M3 repeated once per accumulator word */
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+	__NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3, __NMH_M3,
+};
+
+static inline /* Hashes 9..255 bytes at p: type == 0 takes the 9..32-byte path, a nonzero type the 33..255-byte path. */
+uint32_t
+NMHASH32_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed, int const type)
+{
+	/* base mixer: [f0d9649b  5 -13 29a7935d -9 11 55d35831 -20 -10 ] = 0.93495901789135362 */
+	uint32_t result = 0;
+#	if NMH_VECTOR == NMH_SCALAR
+	{
+		union { uint32_t u32; uint16_t u16[2]; } x[4], y[4]; /* four lanes, each usable whole or as two 16-bit halves */
+		uint32_t const sl = seed + (uint32_t)len; /* seed and length combined; initial value of every y lane */
+		size_t j;
+		x[0].u32 = NMH_PRIME32_1;
+		x[1].u32 = NMH_PRIME32_2;
+		x[2].u32 = NMH_PRIME32_3;
+		x[3].u32 = NMH_PRIME32_4;
+		for (j = 0; j < 4; ++j) y[j].u32 = sl;
+
+		if (type) {
+			/* 33 to 255 bytes */
+			size_t const r = (len - 1) / 32; /* number of 32-byte blocks mixed by the loop below */
+			size_t i;
+			for (i = 0; i < r; ++i) {
+				for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4);
+				for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + i * 32 + j * 4 + 16);
+				for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
+
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF); /* 16-bit halves are multiplied separately */
+					x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
+				}
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
+					x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
+				}
+
+				for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
+
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
+				for (j = 0; j < 4; ++j) {
+					x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
+					x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
+				}
+				for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
+			}
+			for (j = 0; j < 4; ++j) x[j].u32 ^= NMH_readLE32(p + len - 32 + j * 4); /* last 32 bytes, addressed from the end of the input */
+			for (j = 0; j < 4; ++j) y[j].u32 ^= NMH_readLE32(p + len - 16 + j * 4);
+		} else {
+			/* 9 to 32 bytes */
+			x[0].u32 ^= NMH_readLE32(p);
+			x[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3)); /* offset is 0 for len < 16, 8 for len 16..31, 16 for len == 32 */
+			x[2].u32 ^= NMH_readLE32(p + len - 8);
+			x[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3));
+			y[0].u32 ^= NMH_readLE32(p + 4);
+			y[1].u32 ^= NMH_readLE32(p + ((len>>4)<<3) + 4);
+			y[2].u32 ^= NMH_readLE32(p + len - 8 + 4);
+			y[3].u32 ^= NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4);
+		}
+
+		for (j = 0; j < 4; ++j) x[j].u32 += y[j].u32;
+		for (j = 0; j < 4; ++j) y[j].u32 ^= (y[j].u32 << 17) ^ (y[j].u32 >> 6);
+
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] *= (uint16_t)(__NMH_M1 & 0xFFFF);
+			x[j].u16[1] *= (uint16_t)(__NMH_M1 >> 16);
+		}
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 5) ^ (x[j].u32 >> 13);
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] *= (uint16_t)(__NMH_M2 & 0xFFFF);
+			x[j].u16[1] *= (uint16_t)(__NMH_M2 >> 16);
+		}
+
+		for (j = 0; j < 4; ++j) x[j].u32 ^= y[j].u32;
+
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 << 11) ^ (x[j].u32 >> 9);
+		for (j = 0; j < 4; ++j) {
+			x[j].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
+			x[j].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
+		}
+		for (j = 0; j < 4; ++j) x[j].u32 ^= (x[j].u32 >> 10) ^ (x[j].u32 >> 20);
+
+		x[0].u32 ^= NMH_PRIME32_1;
+		x[1].u32 ^= NMH_PRIME32_2;
+		x[2].u32 ^= NMH_PRIME32_3;
+		x[3].u32 ^= NMH_PRIME32_4;
+
+		for (j = 1; j < 4; ++j) x[0].u32 += x[j].u32; /* fold lanes 1..3 into lane 0 */
+
+		x[0].u32 ^= sl + (sl >> 5);
+		x[0].u16[0] *= (uint16_t)(__NMH_M3 & 0xFFFF);
+		x[0].u16[1] *= (uint16_t)(__NMH_M3 >> 16);
+		x[0].u32 ^= (x[0].u32 >> 10) ^ (x[0].u32 >> 20);
+
+		result = x[0].u32;
+	}
+#	else /* at least NMH_SSE2 */
+	{
+		__m128i const h0 = _mm_setr_epi32((int)NMH_PRIME32_1, (int)NMH_PRIME32_2, (int)NMH_PRIME32_3, (int)NMH_PRIME32_4);
+		__m128i const sl = _mm_set1_epi32((int)seed + (int)len);
+		__m128i const m1 = _mm_set1_epi32((int)__NMH_M1);
+		__m128i const m2 = _mm_set1_epi32((int)__NMH_M2);
+		__m128i const m3 = _mm_set1_epi32((int)__NMH_M3);
+		__m128i       x = h0;
+		__m128i       y = sl;
+		const uint32_t *const px = (const uint32_t*)&x; /* points at the first 32-bit element of x */
+
+		if (type) {
+			/* 33 to 255 bytes */
+			size_t const r = (len - 1) / 32;
+			size_t i;
+			for (i = 0; i < r; ++i) {
+				x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + i * 32)));
+				y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + i * 32 + 16)));
+				x = _mm_add_epi32(x, y);
+				x = _mm_mullo_epi16(x, m1);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
+				x = _mm_mullo_epi16(x, m2);
+				x = _mm_xor_si128(x, y);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
+				x = _mm_mullo_epi16(x, m3);
+				x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+			}
+			x = _mm_xor_si128(x, _mm_loadu_si128((const __m128i *)(p + len - 32)));
+			y = _mm_xor_si128(y, _mm_loadu_si128((const __m128i *)(p + len - 16)));
+		} else {
+			/* 9 to 32 bytes */
+			x = _mm_xor_si128(x, _mm_setr_epi32((int)NMH_readLE32(p), (int)NMH_readLE32(p + ((len>>4)<<3)), (int)NMH_readLE32(p + len - 8), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3))));
+			y = _mm_xor_si128(y, _mm_setr_epi32((int)NMH_readLE32(p + 4), (int)NMH_readLE32(p + ((len>>4)<<3) + 4), (int)NMH_readLE32(p + len - 8 + 4), (int)NMH_readLE32(p + len - 8 - ((len>>4)<<3) + 4)));
+		}
+
+		x = _mm_add_epi32(x, y);
+
+		y = _mm_xor_si128(_mm_xor_si128(y, _mm_slli_epi32(y, 17)), _mm_srli_epi32(y, 6));
+
+		x = _mm_mullo_epi16(x, m1);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 5)), _mm_srli_epi32(x, 13));
+		x = _mm_mullo_epi16(x, m2);
+		x = _mm_xor_si128(x, y);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_slli_epi32(x, 11)), _mm_srli_epi32(x, 9));
+		x = _mm_mullo_epi16(x, m3);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+
+		x = _mm_xor_si128(x, h0);
+		x = _mm_add_epi32(x, _mm_srli_si128(x, 4)); /* these two adds accumulate all four elements into the first one */
+		x = _mm_add_epi32(x, _mm_srli_si128(x, 8));
+
+		x = _mm_xor_si128(x, _mm_add_epi32(sl, _mm_srli_epi32(sl, 5)));
+		x = _mm_mullo_epi16(x, m3);
+		x = _mm_xor_si128(_mm_xor_si128(x, _mm_srli_epi32(x, 10)), _mm_srli_epi32(x, 20));
+
+		result = *px;
+	}
+#	endif
+	return *&result;
+}
+#define NMHASH32_9to32(p, len, seed) NMHASH32_9to255(p, len, seed, 0) /* type 0: the 9..32-byte path */
+#define NMHASH32_33to255(p, len, seed) NMHASH32_9to255(p, len, seed, 1) /* type 1: the 33..255-byte path */
+
+#undef __NMH_M1
+#undef __NMH_M2
+#undef __NMH_M3
+
+#if NMH_VECTOR == NMH_SCALAR
+#define NMHASH32_long_round NMHASH32_long_round_scalar
+static inline /* One round over 256 input bytes at p: the first 128 are XORed into accX, the next 128 into accY, then accX is mixed. */
+void
+NMHASH32_long_round_scalar(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
+{
+	/* breadth first calculation will hint some compiler to auto vectorize the code
+	 * on gcc, the performance becomes 10x than the depth first, and about 80% of the manually vectorized code
+	 */
+	const size_t nbGroups = sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT); /* 32 accumulator words */
+	size_t i;
+
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= NMH_readLE32(p + i * 4);
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accY[i] ^= NMH_readLE32(p + i * 4 + sizeof(NMH_ACC_INIT)); /* second half of the block, sizeof(NMH_ACC_INIT) bytes further on */
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] += accY[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accY[i] ^= accX[i] >> 1;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M1_V)[i]; /* accX and the constant table are both walked as uint16_t arrays */
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] << 5 ^ accX[i] >> 13;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M2_V)[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accY[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] << 11 ^ accX[i] >> 9;
+	}
+	for (i = 0; i < nbGroups * 2; ++i) {
+		((uint16_t*)accX)[i] *= ((uint16_t*)__NMH_M3_V)[i];
+	}
+	for (i = 0; i < nbGroups; ++i) {
+		accX[i] ^= accX[i] >> 10 ^ accX[i] >> 20;
+	}
+}
+#endif
+
+#if NMH_VECTOR == NMH_SSE2 /* _NMH_MM_ / _NMH_MMW_ build the intrinsic name for the selected width, _NMH_MM_T is its vector type */
+#  define _NMH_MM_(F) _mm_ ## F
+#  define _NMH_MMW_(F) _mm_ ## F ## 128
+#  define _NMH_MM_T __m128i
+#elif NMH_VECTOR == NMH_AVX2
+#  define _NMH_MM_(F) _mm256_ ## F
+#  define _NMH_MMW_(F) _mm256_ ## F ## 256
+#  define _NMH_MM_T __m256i
+#elif NMH_VECTOR == NMH_AVX512
+#  define _NMH_MM_(F) _mm512_ ## F
+#  define _NMH_MMW_(F) _mm512_ ## F ## 512
+#  define _NMH_MM_T __m512i
+#endif
+
+#if NMH_VECTOR == NMH_SSE2 || NMH_VECTOR == NMH_AVX2 || NMH_VECTOR == NMH_AVX512
+#  define NMHASH32_long_round NMHASH32_long_round_sse
+#  define NMH_VECTOR_NB_GROUP (sizeof(NMH_ACC_INIT) / sizeof(*NMH_ACC_INIT) / (sizeof(_NMH_MM_T) / sizeof(*NMH_ACC_INIT))) /* vectors needed to cover the 32-word accumulator */
+static inline /* Vector counterpart of the scalar long round: same sequence of steps, one vector of accumulator words per iteration. */
+void
+NMHASH32_long_round_sse(uint32_t *const NMH_RESTRICT accX, uint32_t *const NMH_RESTRICT accY, const uint8_t* const NMH_RESTRICT p)
+{
+	const _NMH_MM_T *const NMH_RESTRICT m1    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M1_V;
+	const _NMH_MM_T *const NMH_RESTRICT m2    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M2_V;
+	const _NMH_MM_T *const NMH_RESTRICT m3    = (const _NMH_MM_T * NMH_RESTRICT)__NMH_M3_V;
+	      _NMH_MM_T *const              xaccX = (      _NMH_MM_T *             )accX;
+	      _NMH_MM_T *const              xaccY = (      _NMH_MM_T *             )accY;
+	      _NMH_MM_T *const              xp    = (      _NMH_MM_T *             )p;
+	size_t i;
+
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], _NMH_MMW_(loadu_si)(xp + i)); /* input is read with the loadu_si intrinsic */
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MMW_(loadu_si)(xp + i + NMH_VECTOR_NB_GROUP));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(add_epi32)(xaccX[i], xaccY[i]);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccY[i] = _NMH_MMW_(xor_si)(xaccY[i], _NMH_MM_(srli_epi32)(xaccX[i], 1));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m1); /* every word of __NMH_M1_V is the same constant, so its first vector serves all groups */
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 5)), _NMH_MM_(srli_epi32)(xaccX[i], 13));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m2);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(xaccX[i], xaccY[i]);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(slli_epi32)(xaccX[i], 11)), _NMH_MM_(srli_epi32)(xaccX[i], 9));
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MM_(mullo_epi16)(xaccX[i], *m3);
+	}
+	for (i = 0; i < NMH_VECTOR_NB_GROUP; ++i) {
+		xaccX[i] = _NMH_MMW_(xor_si)(_NMH_MMW_(xor_si)(xaccX[i], _NMH_MM_(srli_epi32)(xaccX[i], 10)), _NMH_MM_(srli_epi32)(xaccX[i], 20));
+	}
+}
+#  undef _NMH_MM_
+#  undef _NMH_MMW_
+#  undef _NMH_MM_T
+#  undef NMH_VECTOR_NB_GROUP
+#endif
+
+static /* Long-input core shared by NMHASH32 and NMHASH32X; both call it only when len >= 256 and avalanche the returned sum. */
+uint32_t
+NMHASH32_long(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accX[sizeof(NMH_ACC_INIT)/sizeof(*NMH_ACC_INIT)];
+	NMH_ALIGN(NMH_ACC_ALIGN) uint32_t accY[sizeof(accX)/sizeof(*accX)];
+	size_t const nbRounds = (len - 1) / (sizeof(accX) + sizeof(accY)); /* rounds taken from the front, sizeof(accX) + sizeof(accY) bytes each */
+	size_t i;
+	uint32_t sum = 0;
+
+	/* init */
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] = NMH_ACC_INIT[i];
+	for (i = 0; i < sizeof(accY)/sizeof(*accY); ++i) accY[i] = seed;
+
+	for (i = 0; i < nbRounds; ++i) {
+		NMHASH32_long_round(accX, accY, p + i * (sizeof(accX) + sizeof(accY)));
+	}
+	NMHASH32_long_round(accX, accY, p + len - (sizeof(accX) + sizeof(accY))); /* one more round over the last block, addressed from the end of the input */
+
+	/* merge acc */
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) accX[i] ^= NMH_ACC_INIT[i];
+	for (i = 0; i < sizeof(accX)/sizeof(*accX); ++i) sum += accX[i];
+
+#	if SIZE_MAX > UINT32_C(-1)
+	sum += (uint32_t)(len >> 32); /* NOTE(review): #if may evaluate UINT32_C(-1) at uintmax_t width, in which case this line is never compiled - confirm */
+#	endif
+	return sum ^ (uint32_t)len;
+}
+
+static inline /* Final scramble that NMHASH32 applies to the NMHASH32_long result; multiplies act on each 16-bit half separately. */
+uint32_t
+NMHASH32_avalanche32(uint32_t const x)
+{
+	/* [-21 -8 cce5196d 12 -7 464be229 -21 -8] = 3.2267098842182733 */
+	const uint32_t m1 = UINT32_C(0xCCE5196D);
+	const uint32_t m2 = UINT32_C(0x464BE229);
+	union { uint32_t u32; uint16_t u16[2]; } vx; /* the same 32 bits viewed whole or as two 16-bit halves */
+	vx.u32    = x;
+	vx.u32   ^= (vx.u32 >> 8) ^ (vx.u32 >> 21);
+	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m1); /* product is truncated back to 16 bits */
+	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m1 >> 16));
+	vx.u32   ^= (vx.u32 << 12) ^ (vx.u32 >> 7);
+	vx.u16[0] = (uint16_t)(vx.u16[0] * (uint16_t)m2);
+	vx.u16[1] = (uint16_t)(vx.u16[1] * (uint16_t)(m2 >> 16));
+	return vx.u32 ^ (vx.u32 >> 8) ^ (vx.u32 >> 21);
+}
+
+static inline /* 32-bit hash of len bytes at input; dispatches on len to the 0..4, 5..8, 9..32, 33..255 and >= 256 byte paths. */
+uint32_t
+NMHASH32(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
+{
+	const uint8_t *const p = (const uint8_t *)input;
+	if (NMH_likely(len <= 32)) {
+		if(NMH_likely(len > 8)) {
+			return NMHASH32_9to32(p, len, seed);
+		}
+		if(NMH_likely(len > 4)) {
+			uint32_t x = NMH_readLE32(p); /* first 4 bytes */
+			uint32_t y = NMH_readLE32(p + len - 4) ^ (NMH_PRIME32_4 + 2 + seed); /* last 4 bytes, overlapping the first read when len < 8 */
+			x += y;
+			x ^= x << (len + 7); /* len is 5..8 here, so the shift count is 12..15 */
+			return NMHASH32_0to8(x, NMH_rotl32(y, 5));
+		} else {
+			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
+			switch (len) { /* each length adds a different constant to seed */
+				case 0: seed += NMH_PRIME32_2;
+					data.u32 = 0;
+					break;
+				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
+					data.u32 = p[0];
+					break;
+				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
+					data.u32 = NMH_readLE16(p);
+					break;
+				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
+					data.u16[1] = p[2]; /* third byte goes into u16[1], the first two bytes into u16[0] */
+					data.u16[0] = NMH_readLE16(p);
+					break;
+				case 4: seed += NMH_PRIME32_3;
+					data.u32 = NMH_readLE32(p);
+					break;
+				default: return 0; /* not reached: len is 0..4 in this branch */
+			}
+			return NMHASH32_0to8(data.u32 + seed, NMH_rotl32(seed, 5));
+		}
+	}
+	if (NMH_likely(len < 256)) {
+		return NMHASH32_33to255(p, len, seed);
+	}
+	return NMHASH32_avalanche32(NMHASH32_long(p, len, seed));
+}
+
+static inline /* Mixer for the 0..4-byte path of NMHASH32X: x holds the packed input bytes, seed the length-adjusted seed. */
+uint32_t
+NMHASH32X_0to4(uint32_t x, uint32_t const seed)
+{
+	/* [bdab1ea9 18 a7896a1b 12 83796a2d 16] = 0.092922873297662509 */
+	x ^= seed;
+	x *= UINT32_C(0xBDAB1EA9);
+	x += NMH_rotl32(seed, 31); /* the seed is mixed in a second time, rotated */
+	x ^= x >> 18;
+	x *= UINT32_C(0xA7896A1B);
+	x ^= x >> 12;
+	x *= UINT32_C(0x83796A2D);
+	x ^= x >> 16;
+	return x;
+}
+
+static inline /* 5..8-byte path of NMHASH32X: combines the first and the last 4 input bytes with seed. */
+uint32_t
+NMHASH32X_5to8(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	/* - 5 to 8 bytes
+	 * - mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246 */
+
+	uint32_t       x = NMH_readLE32(p) ^ NMH_PRIME32_3; /* first 4 bytes */
+	uint32_t const y = NMH_readLE32(p + len - 4) ^ seed; /* last 4 bytes, overlapping the first read when len < 8 */
+	x += y;
+	x ^= x >> len; /* the length itself is the shift count (5..8 at the call site in NMHASH32X) */
+	x *= UINT32_C(0x11049A7D);
+	x ^= x >> 23;
+	x *= UINT32_C(0xBCCCDC7B);
+	x ^= NMH_rotl32(y, 3);
+	x ^= x >> 12;
+	x *= UINT32_C(0x065E9DAD);
+	x ^= x >> 12;
+	return x;
+}
+
+static inline /* 9..255-byte path of NMHASH32X: two lane pairs (x,y) and (a,b) each absorb 8 bytes per 16-byte loop iteration. */
+uint32_t
+NMHASH32X_9to255(const uint8_t* const NMH_RESTRICT p, size_t const len, uint32_t const seed)
+{
+	/* - at least 9 bytes
+	 * - base mixer: [11049a7d 23 bcccdc7b 12 065e9dad 12] = 0.16577596555667246
+	 * - tail mixer: [16 a52fb2cd 15 551e4d49 16] = 0.17162579707098322
+	 */
+
+	uint32_t x = NMH_PRIME32_3;
+	uint32_t y = seed;
+	uint32_t a = NMH_PRIME32_4;
+	uint32_t b = seed;
+	size_t i, r = (len - 1) / 16; /* r 16-byte blocks go through the loop; 1..16 bytes remain after it */
+
+	for (i = 0; i < r; ++i) {
+		x ^= NMH_readLE32(p + i * 16 + 0);
+		y ^= NMH_readLE32(p + i * 16 + 4);
+		x ^= y;
+		x *= UINT32_C(0x11049A7D);
+		x ^= x >> 23;
+		x *= UINT32_C(0xBCCCDC7B);
+		y  = NMH_rotl32(y, 4);
+		x ^= y;
+		x ^= x >> 12;
+		x *= UINT32_C(0x065E9DAD);
+		x ^= x >> 12;
+
+		a ^= NMH_readLE32(p + i * 16 + 8);
+		b ^= NMH_readLE32(p + i * 16 + 12);
+		a ^= b;
+		a *= UINT32_C(0x11049A7D);
+		a ^= a >> 23;
+		a *= UINT32_C(0xBCCCDC7B);
+		b  = NMH_rotl32(b, 3);
+		a ^= b;
+		a ^= a >> 12;
+		a *= UINT32_C(0x065E9DAD);
+		a ^= a >> 12;
+	}
+
+	if (NMH_likely(((uint8_t)len-1) & 8)) { /* bit 3 of len-1 set: 9..16 bytes remain */
+		if (NMH_likely(((uint8_t)len-1) & 4)) { /* bit 2 also set: 13..16 remain, (a,b) take 8 bytes at p + r*16 */
+			a ^= NMH_readLE32(p + r * 16 + 0);
+			b ^= NMH_readLE32(p + r * 16 + 4);
+			a ^= b;
+			a *= UINT32_C(0x11049A7D);
+			a ^= a >> 23;
+			a *= UINT32_C(0xBCCCDC7B);
+			a ^= NMH_rotl32(b, 4);
+			a ^= a >> 12;
+			a *= UINT32_C(0x065E9DAD);
+		} else { /* 9..12 remain: a takes 4 bytes at p + r*16 through the tail mixer */
+			a ^= NMH_readLE32(p + r * 16) + b;
+			a ^= a >> 16;
+			a *= UINT32_C(0xA52FB2CD);
+			a ^= a >> 15;
+			a *= UINT32_C(0x551E4D49);
+		}
+
+		x ^= NMH_readLE32(p + len - 8); /* (x,y) take the last 8 bytes of the input */
+		y ^= NMH_readLE32(p + len - 4);
+		x ^= y;
+		x *= UINT32_C(0x11049A7D);
+		x ^= x >> 23;
+		x *= UINT32_C(0xBCCCDC7B);
+		x ^= NMH_rotl32(y, 3);
+		x ^= x >> 12;
+		x *= UINT32_C(0x065E9DAD);
+	} else { /* bit 3 of len-1 clear: 1..8 bytes remain */
+		if (NMH_likely(((uint8_t)len-1) & 4)) { /* 5..8 remain: a takes 4 bytes at p + r*16 */
+			a ^= NMH_readLE32(p + r * 16) + b;
+			a ^= a >> 16;
+			a *= UINT32_C(0xA52FB2CD);
+			a ^= a >> 15;
+			a *= UINT32_C(0x551E4D49);
+		}
+		x ^= NMH_readLE32(p + len - 4) + y; /* x takes the last 4 bytes of the input */
+		x ^= x >> 16;
+		x *= UINT32_C(0xA52FB2CD);
+		x ^= x >> 15;
+		x *= UINT32_C(0x551E4D49);
+	}
+
+	x ^= (uint32_t)len;
+	x ^= NMH_rotl32(a, 27); /* rotate one lane to pass Diff test */
+	x ^= x >> 14;
+	x *= UINT32_C(0x141CC535);
+
+	return x;
+}
+
+static inline /* Final scramble that NMHASH32X applies to the NMHASH32_long result, using full 32-bit multiplies. */
+uint32_t
+NMHASH32X_avalanche32(uint32_t x)
+{
+	/* mixer with 2 mul from skeeto/hash-prospector:
+	 * [15 d168aaad 15 af723597 15] = 0.15983776156606694
+	 */
+	x ^= x >> 15;
+	x *= UINT32_C(0xD168AAAD);
+	x ^= x >> 15;
+	x *= UINT32_C(0xAF723597);
+	x ^= x >> 15;
+	return x;
+}
+
+/* use 32*32->32 multiplication for short hash; inputs of 256 bytes or more share NMHASH32_long with NMHASH32 */
+static inline
+uint32_t
+NMHASH32X(const void* const NMH_RESTRICT input, size_t const len, uint32_t seed)
+{
+	const uint8_t *const p = (const uint8_t *)input;
+	if (NMH_likely(len <= 8)) {
+		if (NMH_likely(len > 4)) {
+			return NMHASH32X_5to8(p, len, seed);
+		} else {
+			/* 0-4 bytes */
+			union { uint32_t u32; uint16_t u16[2]; uint8_t u8[4]; } data;
+			switch (len) { /* each length adds a different constant to seed */
+				case 0: seed += NMH_PRIME32_2;
+					data.u32 = 0;
+					break;
+				case 1: seed += NMH_PRIME32_2 + (UINT32_C(1) << 24) + (1 << 1);
+					data.u32 = p[0];
+					break;
+				case 2: seed += NMH_PRIME32_2 + (UINT32_C(2) << 24) + (2 << 1);
+					data.u32 = NMH_readLE16(p);
+					break;
+				case 3: seed += NMH_PRIME32_2 + (UINT32_C(3) << 24) + (3 << 1);
+					data.u16[1] = p[2]; /* third byte goes into u16[1], the first two bytes into u16[0] */
+					data.u16[0] = NMH_readLE16(p);
+					break;
+				case 4: seed += NMH_PRIME32_1;
+					data.u32 = NMH_readLE32(p);
+					break;
+				default: return 0; /* not reached: len is 0..4 in this branch */
+			}
+			return NMHASH32X_0to4(data.u32, seed);
+		}
+	}
+	if (NMH_likely(len < 256)) {
+		return NMHASH32X_9to255(p, len, seed);
+	}
+	return NMHASH32X_avalanche32(NMHASH32_long(p, len, seed));
+}
+
+#if defined(_MSC_VER) && _MSC_VER >= 1914
+#  pragma warning(pop)
+#endif
+#ifdef __SDCC
+#  pragma restore
+#  undef const
+#endif
+
+#endif /* _nmhash_h_ */
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/tests/hash_functions/validation/pengyhash.c b/src/tests/hash_functions/validation/pengyhash.c
new file mode 100644
index 000000000..d7b1ec02d
--- /dev/null
+++ b/src/tests/hash_functions/validation/pengyhash.c
@@ -0,0 +1,30 @@
+/* pengyhash v0.2 */
+
+#include "pengyhash.h"
+
+uint64_t pengyhash(const void *p, size_t size, uint32_t seed) /* Returns a 64-bit hash of the size bytes at p, parameterised by seed. */
+{
+	uint64_t b[4] = { 0 }; /* 32-byte input block, filled by memcpy, so words take the host byte order */
+	uint64_t s[4] = { 0, 0, 0, size }; /* mixing state; s[3] starts as the total input size */
+	int i;
+
+	for(; size >= 32; size -= 32, p = (const char*)p + 32) { /* consume every full 32-byte block */
+		memcpy(b, p, 32);
+		
+		s[1] = (s[0] += s[1] + b[3]) + (s[1] << 14 | s[1] >> 50);
+		s[3] = (s[2] += s[3] + b[2]) + (s[3] << 23 | s[3] >> 41);
+		s[3] = (s[0] += s[3] + b[1]) ^ (s[3] << 16 | s[3] >> 48);
+		s[1] = (s[2] += s[1] + b[0]) ^ (s[1] << 40 | s[1] >> 24);
+	}
+
+	memcpy(b, p, size); /* remaining 0..31 bytes; bytes of b past size keep whatever they held before this copy */
+
+	for(i = 0; i < 6; i++) { /* six closing rounds over the final block; seed is added in each */
+		s[1] = (s[0] += s[1] + b[3]) + (s[1] << 14 | s[1] >> 50) + seed;
+		s[3] = (s[2] += s[3] + b[2]) + (s[3] << 23 | s[3] >> 41);
+		s[3] = (s[0] += s[3] + b[1]) ^ (s[3] << 16 | s[3] >> 48);
+		s[1] = (s[2] += s[1] + b[0]) ^ (s[1] << 40 | s[1] >> 24);
+	}
+
+	return s[0] + s[1] + s[2] + s[3];
+}
diff --git a/src/tests/hash_functions/validation/pengyhash.h b/src/tests/hash_functions/validation/pengyhash.h
new file mode 100644
index 000000000..b9ff7010c
--- /dev/null
+++ b/src/tests/hash_functions/validation/pengyhash.h
@@ -0,0 +1,9 @@
+#ifndef _PENGYHASH_H
+#define _PENGYHASH_H
+
+#include <stdint.h>
+#include <string.h>
+
+uint64_t pengyhash(const void *p, size_t size, uint32_t seed); /* 64-bit hash of the size bytes at p; defined in pengyhash.c */
+
+#endif
diff --git a/src/tests/hash_functions/validation/waterhash.c b/src/tests/hash_functions/validation/waterhash.c
new file mode 100644
index 000000000..7d6c92d99
--- /dev/null
+++ b/src/tests/hash_functions/validation/waterhash.c
@@ -0,0 +1,6 @@
+#include "waterhash.h"
+
+int32_t waterhash_test ( const void * key, uint32_t len, uint64_t seed ) { /* externally linked entry point for the static inline waterhash */
+  return waterhash (key, len, seed); /* waterhash returns uint32_t; the value is converted to the int32_t return type */
+}
+
diff --git a/src/tests/hash_functions/validation/waterhash.h b/src/tests/hash_functions/validation/waterhash.h
new file mode 100644
index 000000000..d05dc1269
--- /dev/null
+++ b/src/tests/hash_functions/validation/waterhash.h
@@ -0,0 +1,54 @@
+/*
+    Waterhash takes (optimally) 32-bit inputs and produces a 32-bit hash as its result.
+    It is an edited version of wyhash that uses at most 64-bit math instead of 128-bit.
+    It is meant to use very similar code to Wheathash, which produces a 64-bit hash.
+    Original Author: Wang Yi <godspeed_china@yeah.net>
+    Waterhash Variant Author: Tommy Ettinger <tommy.ettinger@gmail.com>
+*/
+#ifndef waterhash_version_3
+#define waterhash_version_3
+#include <string.h>
+#include <stdint.h>
+#include <math.h>
+const uint64_t _waterp0 = 0xa0761d65ull, _waterp1 = 0xe7037ed1ull, _waterp2 = 0x8ebc6af1ull; /* mixing constants: each value fits in 32 bits but is stored as uint64_t */
+const uint64_t _waterp3 = 0x589965cdull, _waterp4 = 0x1d8e4e27ull, _waterp5 = 0xeb44accbull; /* NOTE(review): defined without `static` in a header, so a second C translation unit including it would get duplicate definitions */
+
+static inline uint64_t _watermum(const uint64_t A, const uint64_t B) { /* mix two words using 64-bit arithmetic only */
+    uint64_t r = A * B; /* product wraps modulo 2^64; the high half of the full product is discarded */
+    return r - (r >> 32); /* subtract the upper 32 bits of r from r */
+}
+
+static inline uint64_t _waterr08(const uint8_t *p){ uint8_t  v; memcpy(&v, p, 1); return v; } /* read 1 byte at p, zero-extended to 64 bits */
+static inline uint64_t _waterr16(const uint8_t *p){ uint16_t v; memcpy(&v, p, 2); return v; } /* read 2 bytes at p in host byte order via memcpy (no alignment needed), zero-extended */
+static inline uint64_t _waterr32(const uint8_t *p){ uint32_t v; memcpy(&v, p, 4); return v; } /* read 4 bytes at p in host byte order via memcpy (no alignment needed), zero-extended */
+static inline uint32_t waterhash(const void* key, uint32_t len, uint64_t seed){ /* 32-bit hash of `len` bytes at `key`; `seed` is reused as the running state */
+    const uint8_t *p = (const uint8_t*)key; /* byte cursor over the input */
+    uint32_t i; /* number of bytes consumed by the block loop */
+    for (i = 0; i + 16 <= len; i += 16, p += 16) { /* full 16-byte blocks; NOTE(review): i + 16 can wrap in 32-bit unsigned arithmetic when len is within 15 of UINT32_MAX - confirm callers never pass such lengths */
+        seed = _watermum(
+			_watermum(_waterr32(p) ^ _waterp1, _waterr32(p + 4) ^ _waterp2) + seed, /* words 0 and 1 of the block, plus the previous state */
+			_watermum(_waterr32(p + 8) ^ _waterp3, _waterr32(p + 12) ^ _waterp4)); /* words 2 and 3 of the block */
+    }
+	seed += _waterp5; /* applied once, before the tail bytes are mixed */
+	switch (len & 15) { /* len & 15 is the number of tail bytes; there is no case 0, so an empty tail skips this step */
+	case 1:  seed = _watermum(_waterp2 ^ seed, _waterr08(p) ^ _waterp1); break;
+	case 2:  seed = _watermum(_waterp3 ^ seed, _waterr16(p) ^ _waterp4); break;
+	case 3:  seed = _watermum(_waterr16(p) ^ seed, _waterr08(p + 2) ^ _waterp2); break;
+	case 4:  seed = _watermum(_waterr16(p) ^ seed, _waterr16(p + 2) ^ _waterp3); break;
+	case 5:  seed = _watermum(_waterr32(p) ^ seed, _waterr08(p + 4) ^ _waterp1); break;
+	case 6:  seed = _watermum(_waterr32(p) ^ seed, _waterr16(p + 4) ^ _waterp1); break;
+	case 7:  seed = _watermum(_waterr32(p) ^ seed, (_waterr16(p + 4) << 8 | _waterr08(p + 6)) ^ _waterp1); break; /* last 3 bytes packed as (16-bit read << 8) | 8-bit read */
+	case 8:  seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp0); break;
+	case 9:  seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed ^ _waterp4, _waterr08(p + 8) ^ _waterp3); break; /* cases 9-15: bytes 0-7 go through one _watermum, the rest through a second, and the two results are XORed */
+	case 10: seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed, _waterr16(p + 8) ^ _waterp3); break;
+	case 11: seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed, ((_waterr16(p + 8) << 8) | _waterr08(p + 10)) ^ _waterp3); break;
+	case 12: seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed ^ _waterr32(p + 8), _waterp4); break;
+	case 13: seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed ^ _waterr32(p + 8), (_waterr08(p + 12)) ^ _waterp4); break;
+	case 14: seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed ^ _waterr32(p + 8), (_waterr16(p + 12)) ^ _waterp4); break;
+	case 15: seed = _watermum(_waterr32(p) ^ seed, _waterr32(p + 4) ^ _waterp2) ^ _watermum(seed ^ _waterr32(p + 8), (_waterr16(p + 12) << 8 | _waterr08(p + 14)) ^ _waterp4); break;
+	}
+	seed = (seed ^ seed << 16) * (len ^ _waterp0); /* final mix: parses as seed ^ (seed << 16); the input length is mixed in here */
+	return (uint32_t)(seed - (seed >> 32)); /* same high-half subtraction as _watermum, truncated to the low 32 bits */
+}
+#endif
+