From 24d340075de9677464ee73c68c6b70182ce1a3cc Mon Sep 17 00:00:00 2001
From: Trevor Vincent <tvincent@cita.utoronto.ca>
Date: Sun, 12 Jan 2020 23:14:11 -0500
Subject: [PATCH 01/10] Add openmp parallelization to repeated hafnian

---
 include/repeated_hafnian.hpp | 499 +++++++++++++++++++++--------------
 1 file changed, 301 insertions(+), 198 deletions(-)
diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index 6fd960ac7..66e582fe2 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -16,203 +16,294 @@
  * \rst
  * Contains functions for computing the hafnian using the algorithm
  * described in *From moments of sum to moments of product*,
- * `doi:10.1016/j.jmva.2007.01.013 <https://dx.doi.org/10.1016/j.jmva.2007.01.013>`__.
- * \endrst
+ * `doi:10.1016/j.jmva.2007.01.013
+ * <https://dx.doi.org/10.1016/j.jmva.2007.01.013>`__. \endrst
  */
 #pragma once
 #include <stdafx.h>
 
 namespace libwalrus {
 
+/**
+ * Converts a linear index to a multi index
+ * e.g. if we wanted the multi-index (i,j) of an element in a 2x2 matrix given
+ * a linear index of 3 in the array storing the matrix, the maxes vector would
+ * be {1,1} and this function would return (1,0)
+ *
+ * @param linear_index the "flattened" index
+ * @param maxes a vector of integers, representing the max index value
+ * of each indice in the multi-index object.
+ *
+ * @return hafnian of the input matrix
+ */
+
+std::vector<int> lin_to_multi(int linear_index, const std::vector<int> &maxes) {
+  std::vector<int> i(maxes.size(), 0);
+  int l = 0;
+  int s0 = maxes[0] + 1;
+  i[l] = linear_index % s0;
+  int prod = s0;
+  int gl = linear_index;
+
+  while (l < i.size() - 1) {
+    if (prod > linear_index) {
+      break;
+    } else {
+      int glp1 = (gl - i[l]) / (maxes[l] + 1);
+      i[l + 1] = glp1 % (maxes[l + 1] + 1);
+      prod *= (maxes[1 + 1] + 1);
+      gl = glp1;
+    }
+    l++;
+  }
+  return i;
+}
+
+/**
+ * Returns the binomial coefficient N!/K!(N-K)!
+ * Adapted from http://blog.plover.com/math/choose.html
+ *
+ * @param N
+ * @param K
+ *
+ * @return N!/K!(N-K)!
+ */
+
+template <typename T> T get_binom_coeff(T N, T K) {
+  T r = 1;
+  T d;
+  if (K > N)
+    return 0;
+  for (d = 1; d <= K; d++) {
+    r *= N--;
+    r /= d;
+  }
+  return r;
+}
+
 /**
  * Returns the hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be significantly more
- * efficient in the cases where the matrix has repeated rows and columns.
+ * Note that this algorithm, while generally slower than others, can be
+ * significantly more efficient in the cases where the matrix has repeated rows
+ * and columns.
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
+ * ones.
  * @return hafnian of the input matrix
  */
 template <typename T>
 inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
-    int n = std::sqrt(static_cast<double>(mat.size()));
-    assert(static_cast<int>(rpt.size()) == n);
 
-    long double p = 2;
-    T y = 0.0, q = 0.0;
+  int n = std::sqrt(static_cast<double>(mat.size()));
+  assert(static_cast<int>(rpt.size()) == n);
+  T y = 0.0;
 
-    std::vector<int> x(n, 0.0);
+#pragma omp parallel
+  {
+    T q = 0.0L;
+
+    long double p = 2;
+    // std::vector<int> x(n, 0.0);
     int s = std::accumulate(rpt.begin(), rpt.end(), 0);
     int s2 = s / 2;
 
     for (int i = 1; i < s2; i++) {
-        p /= i + 1;
-    }
-
-    std::vector<long double> nu2(n);
-    std::transform(rpt.begin(), rpt.end(), nu2.begin(),
-                   std::bind(std::multiplies<long double>(), std::placeholders::_1, 0.5L));
-
-    for (int i = 0; i < n; i++) {
-        for (int j = 0; j < n; j++) {
-            q += 0.5L * nu2[j] * mat[i * n + j] * nu2[i];
-        }
+      p /= i + 1;
     }
 
     unsigned long long int steps = 1;
 
     for (auto i : rpt) {
-        steps *= i + 1;
+      steps *= i + 1;
     }
 
     steps /= 2;
 
-    for (unsigned long long int i = 0; i < steps; i++) {
-        y += static_cast<long double>(p) * pow(q, s2);
-
-        for (int j = 0; j < n; j++) {
-
-            if (x[j] < rpt[j]) {
-                x[j] += 1;
-                p *= -static_cast<long double>(rpt[j] + 1 - x[j]) / x[j];
-
-                for (int k = 0; k < n; k++) {
-                    q -= mat[k * n + j] * (nu2[k] - x[k]);
-                }
-                q -= 0.5L * mat[j * n + j];
-                break;
-            }
-            else {
-                x[j] = 0;
-                if (rpt[j] % 2 == 1) {
-                    p *= -1;
-                }
-                for (int k = 0; k < n; k++) {
-                    q += (1.0L * rpt[j]) * mat[k * n + j] * (nu2[k] - x[k]);
-                }
-                q -= 0.5L * rpt[j] * rpt[j] * mat[j * n + j];
-            }
+    int chunks = omp_get_num_threads();
+    int id = omp_get_thread_num();
+    int chunk_size = steps / chunks;
+    int beg = id * chunk_size;
+    int end = (id == chunks - 1) ? steps : beg + chunk_size;
+    T sum_chunk = 0.L;
+
+    std::vector<int> x = lin_to_multi(beg, rpt);
+    std::vector<long double> nu2(n);
+    for (int i = 0; i < n; i++)
+      nu2[i] = 0.5 * rpt[i] - x[i];
+
+    int x_sum = std::accumulate(x.begin(), x.end(), 0);
+    for (int i = 0; i < n; i++) {
+      p *= get_binom_coeff(rpt[i], x[i]);
+    }
+    p *= (x_sum % 2 == 0) ? 1 : -1;
+
+    for (int i = 0; i < n; i++) {
+      for (int j = 0; j < n; j++) {
+        q += 0.5L * nu2[j] * mat[i * n + j] * nu2[i];
+      }
+    }
+
+    for (unsigned long long int i = beg; i < end; i++) {
+      sum_chunk += static_cast<long double>(p) * pow(q, s2);
+
+      for (int j = 0; j < n; j++) {
+
+        if (x[j] < rpt[j]) {
+          x[j] += 1;
+          p *= -static_cast<long double>(rpt[j] + 1 - x[j]) / x[j];
+
+          for (int k = 0; k < n; k++) {
+            q -= mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+          }
+          q -= 0.5L * mat[j * n + j];
+          break;
+        } else {
+          x[j] = 0;
+          if (rpt[j] % 2 == 1) {
+            p *= -1;
+          }
+          for (int k = 0; k < n; k++) {
+            q += (1.0L * rpt[j]) * mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+          }
+          q -= 0.5L * rpt[j] * rpt[j] * mat[j * n + j];
         }
+      }
     }
+#pragma omp critical
+    y += sum_chunk;
+  }
 
-    return y;
+  return y;
 }
 
-
 /**
  * Returns the loop hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be significantly more
- * efficient in the cases where the matrix has repeated rows and columns.
+ * Note that this algorithm, while generally slower than others, can be
+ * significantly more efficient in the cases where the matrix has repeated rows
+ * and columns.
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
- * @param mu a vector of length \f$n\f$ representing the vector of means/displacement.
+ * @param mu a vector of length \f$n\f$ representing the vector of
+ * means/displacement.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
+ * ones.
  * @return loop hafnian of the input matrix
  */
 template <typename T>
-inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu, std::vector<int> &rpt) {
-    int n = std::sqrt(static_cast<double>(mat.size()));
-    assert(static_cast<int>(rpt.size()) == n);
-
+inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
+                          std::vector<int> &rpt) {
+  int n = std::sqrt(static_cast<double>(mat.size()));
+  assert(static_cast<int>(rpt.size()) == n);
+
+  T y = 0.0L;
+  unsigned long long int steps = 1;
+  for (auto i : rpt) {
+    steps *= i + 1;
+  }
+  steps /= 2;
+
+#pragma omp parallel
+  {
     long long int p = 2;
-    T y = 0.0L, q = 0.0L, q1 = 0.0L;
+    T q = 0.0L, q1 = 0.0L;
+
+    int chunks = omp_get_num_threads();
+    int id = omp_get_thread_num();
+    int chunk_size = steps / chunks;
+    int beg = id * chunk_size;
+    int end = (id == chunks - 1) ? steps : beg + chunk_size;
+    T sum_chunk = 0.L;
 
-    std::vector<int> x(n, 0.0);
+    std::vector<int> x = lin_to_multi(beg, rpt);
     int s = std::accumulate(rpt.begin(), rpt.end(), 0);
     int s1 = std::floor(0.5 * s) + 1;
     std::vector<T> z1(s1, 1.0L);
     std::vector<T> z2(s1, 1.0L);
 
-    // diagonal of matrix mat
-    // std::vector<T> mu(n);
-    // for (int i=0; i<n; i++) {
-    //     mu[i] = mat[i*n+i];
-    // }
-
     std::vector<long double> nu2(n);
-    std::transform(rpt.begin(), rpt.end(), nu2.begin(),
-                   std::bind(std::multiplies<long double>(), std::placeholders::_1, 0.5L));
+    for (int i = 0; i < n; i++)
+      nu2[i] = 0.5 * rpt[i] - x[i];
 
     for (int i = 0; i < n; i++) {
-        q1 += nu2[i] * mu[i];
-        for (int j = 0; j < n; j++) {
-            q += 0.5L * nu2[j] * mat[i * n + j] * nu2[i];
-        }
+      q1 += nu2[i] * mu[i];
+      for (int j = 0; j < n; j++) {
+        q += 0.5L * nu2[j] * mat[i * n + j] * nu2[i];
+      }
     }
 
-    unsigned long long int steps = 1;
-
-    for (auto i : rpt) {
-        steps *= i + 1;
+    int x_sum = std::accumulate(x.begin(), x.end(), 0);
+    for (int i = 0; i < n; i++) {
+      p *= get_binom_coeff(rpt[i], x[i]);
     }
+    p *= (x_sum % 2 == 0) ? 1 : -1;
 
-    steps /= 2;
+    for (unsigned long long int i = beg; i < end; i++) {
+      for (int j = 1; j < s1; j++) {
+        z1[j] = z1[j - 1] * q / (1.0L * j);
+      }
 
-    for (unsigned long long int i = 0; i < steps; i++) {
+      if (s % 2 == 1) {
+        z2[0] = q1;
         for (int j = 1; j < s1; j++) {
-            z1[j] = z1[j - 1] * q / (1.0L * j);
-        }
-
-        if (s % 2 == 1) {
-            z2[0] = q1;
-            for (int j = 1; j < s1; j++) {
-                z2[j] = z2[j - 1] * pow(q1, 2) / (2.0L * j) / (2.0L * j + 1);
-            }
-        }
-        else {
-            for (int j = 1; j < s1; j++) {
-                z2[j] = z2[j - 1] * pow(q1, 2) / (2.0L * j) / (2.0L * j - 1);
-            }
+          z2[j] = z2[j - 1] * pow(q1, 2) / (2.0L * j) / (2.0L * j + 1);
         }
-
-
-        T z1z2prod = 0.0;
-        for (int j = 0; j < s1; j++) {
-            z1z2prod += z1[j] * z2[s1 - 1 - j];
+      } else {
+        for (int j = 1; j < s1; j++) {
+          z2[j] = z2[j - 1] * pow(q1, 2) / (2.0L * j) / (2.0L * j - 1);
         }
-
-        y += static_cast<long double>(p) * z1z2prod;
-
-        for (int j = 0; j < n; j++) {
-
-            if (x[j] < rpt[j]) {
-                x[j] += 1;
-                p = -std::round(p * static_cast<long double>(rpt[j] + 1 - x[j]) / x[j]);
-
-                for (int k = 0; k < n; k++) {
-                    q -= mat[k * n + j] * (nu2[k] - x[k]);
-                }
-                q -= 0.5L * mat[j * n + j];
-                q1 -= mu[j];
-                break;
-            }
-            else {
-                x[j] = 0;
-                if (rpt[j] % 2 == 1) {
-                    p *= -1;
-                }
-                for (int k = 0; k < n; k++) {
-                    q += (1.0L * rpt[j]) * mat[k * n + j] * (nu2[k] - x[k]);
-                }
-                q -= 0.5L * rpt[j] * rpt[j] * mat[j * n + j];
-                q1 += static_cast<long double>(rpt[j]) * mu[j];
-            }
+      }
+
+      T z1z2prod = 0.0;
+      for (int j = 0; j < s1; j++) {
+        z1z2prod += z1[j] * z2[s1 - 1 - j];
+      }
+
+      sum_chunk += static_cast<long double>(p) * z1z2prod;
+
+      for (int j = 0; j < n; j++) {
+        if (x[j] < rpt[j]) {
+          x[j] += 1;
+          p = -std::round(p * static_cast<long double>(rpt[j] + 1 - x[j]) /
+                          x[j]);
+
+          for (int k = 0; k < n; k++) {
+            q -= mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+          }
+          q -= 0.5L * mat[j * n + j];
+          q1 -= mu[j];
+          break;
+        } else {
+          x[j] = 0;
+          if (rpt[j] % 2 == 1) {
+            p *= -1;
+          }
+          for (int k = 0; k < n; k++) {
+            q += (1.0L * rpt[j]) * mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+          }
+          q -= 0.5L * rpt[j] * rpt[j] * mat[j * n + j];
+          q1 += static_cast<long double>(rpt[j]) * mu[j];
         }
+      }
     }
 
-    return y;
+#pragma omp critical
+    y += sum_chunk;
+  }
+
+  return y;
 }
 
 /**
@@ -220,11 +311,12 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu, std::vector<i
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be significantly more
- * efficient in the cases where the matrix has repeated rows and columns.
+ * Note that this algorithm, while generally slower than others, can be
+ * significantly more efficient in the cases where the matrix has repeated rows
+ * and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for Python
- * integration. It accepts and returns complex double numeric types, and
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for
+ * Python integration. It accepts and returns complex double numeric types, and
  * returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
@@ -235,35 +327,37 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu, std::vector<i
  *      \f$n\times n\f$ row-ordered symmetric matrix.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
+ * ones.
  * @return hafnian of the input matrix
  */
-std::complex<double> hafnian_rpt_quad(std::vector<std::complex<double>> &mat, std::vector<int> &rpt) {
-    std::vector<std::complex<long double>> matq(mat.begin(), mat.end());
-    int s = std::accumulate(rpt.begin(), rpt.end(), 0);
-    int n = std::sqrt(static_cast<double>(mat.size()));
-    std::complex<long double> haf;
-
-    if ( s == 0 || n == 0)
-        haf = std::complex<long double>(1.0, 0.0);
-    else
-        haf = hafnian_rpt(matq, rpt);
-
-    return static_cast<std::complex<double>>(haf);
+std::complex<double> hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
+                                      std::vector<int> &rpt) {
+  std::vector<std::complex<long double>> matq(mat.begin(), mat.end());
+  int s = std::accumulate(rpt.begin(), rpt.end(), 0);
+  int n = std::sqrt(static_cast<double>(mat.size()));
+  std::complex<long double> haf;
+
+  if (s == 0 || n == 0)
+    haf = std::complex<long double>(1.0, 0.0);
+  else
+    haf = hafnian_rpt(matq, rpt);
+
+  return static_cast<std::complex<double>>(haf);
 }
 
-
 /**
  * Returns the hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be significantly more
- * efficient in the cases where the matrix has repeated rows and columns.
+ * Note that this algorithm, while generally slower than others, can be
+ * significantly more efficient in the cases where the matrix has repeated rows
+ * and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for Python
- * integration. It accepts and returns double numeric types, and
- * returns sensible values for empty and non-even matrices.
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for
+ * Python integration. It accepts and returns double numeric types, and returns
+ * sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
  * to type `long double`, allowing for greater precision than supported
@@ -273,35 +367,36 @@ std::complex<double> hafnian_rpt_quad(std::vector<std::complex<double>> &mat, st
  *      \f$n\times n\f$ row-ordered symmetric matrix.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
+ * ones.
  * @return hafnian of the input matrix
  */
 double hafnian_rpt_quad(std::vector<double> &mat, std::vector<int> &rpt) {
-    std::vector<long double> matq(mat.begin(), mat.end());
-    int s = std::accumulate(rpt.begin(), rpt.end(), 0);
-    int n = std::sqrt(static_cast<double>(mat.size()));
-    long double haf;
+  std::vector<long double> matq(mat.begin(), mat.end());
+  int s = std::accumulate(rpt.begin(), rpt.end(), 0);
+  int n = std::sqrt(static_cast<double>(mat.size()));
+  long double haf;
 
-    if ( s == 0 || n == 0)
-        haf = 1.0;
-    else
-        haf = hafnian_rpt(matq, rpt);
+  if (s == 0 || n == 0)
+    haf = 1.0;
+  else
+    haf = hafnian_rpt(matq, rpt);
 
-    return static_cast<double>(haf);
+  return static_cast<double>(haf);
 }
 
-
 /**
  * Returns the loop hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be significantly more
- * efficient in the cases where the matrix has repeated rows and columns.
+ * Note that this algorithm, while generally slower than others, can be
+ * significantly more efficient in the cases where the matrix has repeated rows
+ * and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop` for Python
- * integration. It accepts and returns complex double numeric types, and
- * returns sensible values for empty and non-even matrices.
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop`
+ * for Python integration. It accepts and returns complex double numeric types,
+ * and returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
  * to type `complex<long double>`, allowing for greater precision than supported
@@ -309,38 +404,43 @@ double hafnian_rpt_quad(std::vector<double> &mat, std::vector<int> &rpt) {
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
- * @param mu a vector of length \f$n\f$ representing the vector of means/displacement.
+ * @param mu a vector of length \f$n\f$ representing the vector of
+ * means/displacement.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
+ * ones.
  * @return loop hafnian of the input matrix
  */
-std::complex<double> loop_hafnian_rpt_quad(std::vector<std::complex<double>> &mat, std::vector<std::complex<double>> &mu, std::vector<int> &rpt) {
-    std::vector<std::complex<long double>> matq(mat.begin(), mat.end());
-    std::vector<std::complex<long double>> muq(mu.begin(), mu.end());
-    std::complex<long double> haf;
-    int s = std::accumulate(rpt.begin(), rpt.end(), 0);
-    int n = std::sqrt(static_cast<double>(mat.size()));
-
-    if ( s == 0 || n == 0)
-        haf = std::complex<long double>(1.0, 0.0);
-    else
-        haf = loop_hafnian_rpt(matq, muq, rpt);
-
-    return static_cast<std::complex<double>>(haf);
+std::complex<double>
+loop_hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
+                      std::vector<std::complex<double>> &mu,
+                      std::vector<int> &rpt) {
+  std::vector<std::complex<long double>> matq(mat.begin(), mat.end());
+  std::vector<std::complex<long double>> muq(mu.begin(), mu.end());
+  std::complex<long double> haf;
+  int s = std::accumulate(rpt.begin(), rpt.end(), 0);
+  int n = std::sqrt(static_cast<double>(mat.size()));
+
+  if (s == 0 || n == 0)
+    haf = std::complex<long double>(1.0, 0.0);
+  else
+    haf = loop_hafnian_rpt(matq, muq, rpt);
+
+  return static_cast<std::complex<double>>(haf);
 }
 
-
 /**
  * Returns the loop hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be significantly more
- * efficient in the cases where the matrix has repeated rows and columns.
+ * Note that this algorithm, while generally slower than others, can be
+ * significantly more efficient in the cases where the matrix has repeated rows
+ * and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop` for Python
- * integration. It accepts and returns double numeric types, and
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop`
+ * for Python integration. It accepts and returns double numeric types, and
  * returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
@@ -349,25 +449,28 @@ std::complex<double> loop_hafnian_rpt_quad(std::vector<std::complex<double>> &ma
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
- * @param mu a vector of length \f$n\f$ representing the vector of means/displacement.
+ * @param mu a vector of length \f$n\f$ representing the vector of
+ * means/displacement.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
+ * ones.
  * @return loop hafnian of the input matrix
  */
-double loop_hafnian_rpt_quad(std::vector<double> &mat, std::vector<double> &mu, std::vector<int> &rpt) {
-    std::vector<long double> matq(mat.begin(), mat.end());
-    std::vector<long double> muq(mu.begin(), mu.end());
-    long double haf;
-    int s = std::accumulate(rpt.begin(), rpt.end(), 0);
-    int n = std::sqrt(static_cast<double>(mat.size()));
-
-    if ( s == 0 || n == 0)
-        haf = 1.0;
-    else
-        haf = loop_hafnian_rpt(matq, muq, rpt);
-
-    return static_cast<double>(haf);
+double loop_hafnian_rpt_quad(std::vector<double> &mat, std::vector<double> &mu,
+                             std::vector<int> &rpt) {
+  std::vector<long double> matq(mat.begin(), mat.end());
+  std::vector<long double> muq(mu.begin(), mu.end());
+  long double haf;
+  int s = std::accumulate(rpt.begin(), rpt.end(), 0);
+  int n = std::sqrt(static_cast<double>(mat.size()));
+
+  if (s == 0 || n == 0)
+    haf = 1.0;
+  else
+    haf = loop_hafnian_rpt(matq, muq, rpt);
+
+  return static_cast<double>(haf);
 }
 
-}
+} // namespace libwalrus

From 350083d16f7e9f0271695f59afb90fcdcde17b3e Mon Sep 17 00:00:00 2001
From: Trevor Vincent <tvincent@cita.utoronto.ca>
Date: Sun, 12 Jan 2020 23:23:18 -0500
Subject: [PATCH 02/10] Fix compiler warnings

---
 include/repeated_hafnian.hpp | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index 66e582fe2..daa5f3a3a 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -37,19 +37,19 @@ namespace libwalrus {
  * @return hafnian of the input matrix
  */
 
-std::vector<int> lin_to_multi(int linear_index, const std::vector<int> &maxes) {
+std::vector<int> lin_to_multi(unsigned long long int linear_index, const std::vector<int> &maxes) {
   std::vector<int> i(maxes.size(), 0);
-  int l = 0;
+  unsigned int l = 0;
   int s0 = maxes[0] + 1;
   i[l] = linear_index % s0;
-  int prod = s0;
-  int gl = linear_index;
+  unsigned long long int prod = s0;
+  unsigned long long int gl = linear_index;
 
   while (l < i.size() - 1) {
     if (prod > linear_index) {
       break;
     } else {
-      int glp1 = (gl - i[l]) / (maxes[l] + 1);
+      unsigned long long int glp1 = (gl - i[l]) / (maxes[l] + 1);
       i[l + 1] = glp1 % (maxes[l + 1] + 1);
       prod *= (maxes[1 + 1] + 1);
       gl = glp1;
@@ -128,9 +128,9 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
 
     int chunks = omp_get_num_threads();
     int id = omp_get_thread_num();
-    int chunk_size = steps / chunks;
-    int beg = id * chunk_size;
-    int end = (id == chunks - 1) ? steps : beg + chunk_size;
+    unsigned long long int chunk_size = steps / chunks;
+    unsigned long long int beg = id * chunk_size;
+    unsigned long long int end = (id == chunks - 1) ? steps : beg + chunk_size;
     T sum_chunk = 0.L;
 
     std::vector<int> x = lin_to_multi(beg, rpt);
@@ -222,9 +222,9 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
 
     int chunks = omp_get_num_threads();
     int id = omp_get_thread_num();
-    int chunk_size = steps / chunks;
-    int beg = id * chunk_size;
-    int end = (id == chunks - 1) ? steps : beg + chunk_size;
+    unsigned long long int chunk_size = steps / chunks;
+    unsigned long long int beg = id * chunk_size;
+    unsigned long long int end = (id == chunks - 1) ? steps : beg + chunk_size;
     T sum_chunk = 0.L;
 
     std::vector<int> x = lin_to_multi(beg, rpt);

From 6fa8f446950e377369003ea4b1e715f86ff06885 Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Mon, 13 Jan 2020 13:50:08 -0500
Subject: [PATCH 03/10] Fix bug in lin_to_multi

---
 include/repeated_hafnian.hpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index daa5f3a3a..56bf4a013 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -46,14 +46,10 @@ std::vector<int> lin_to_multi(unsigned long long int linear_index, const std::ve
   unsigned long long int gl = linear_index;
 
   while (l < i.size() - 1) {
-    if (prod > linear_index) {
-      break;
-    } else {
-      unsigned long long int glp1 = (gl - i[l]) / (maxes[l] + 1);
-      i[l + 1] = glp1 % (maxes[l + 1] + 1);
-      prod *= (maxes[1 + 1] + 1);
-      gl = glp1;
-    }
+    unsigned long long int glp1 = (gl - i[l]) / (maxes[l] + 1);
+    i[l + 1] = glp1 % (maxes[l + 1] + 1);
+    prod *= (maxes[1 + 1] + 1);
+    gl = glp1;
     l++;
   }
   return i;

From a8cce49dd9d69e80ea43beba65a3b59c20bb81f1 Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Mon, 13 Jan 2020 14:40:55 -0500
Subject: [PATCH 04/10] update changelog

---
 .github/CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 8353d4b49..0e7271906 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -10,6 +10,8 @@
 
 * Updates and improves the speed and accuracy of `thewalrus.quantum.fock_tensor`. [#107](https://github.com/XanaduAI/thewalrus/pull/107)
 
+* Add OpenMP support to the repeated moment hafnian code. [#120](https://github.com/XanaduAI/thewalrus/pull/120)
+
 ### Bug fixes
 
 * Corrects typos in the random number generation in the C++ unit tests. [#118](https://github.com/XanaduAI/thewalrus/pull/118)

From 9055570a87db1dd0d08d5720bae5bfb3bbd5641c Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Mon, 13 Jan 2020 15:57:26 -0500
Subject: [PATCH 05/10] Fix windows issue

---
 include/repeated_hafnian.hpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index 56bf4a013..79b6b305c 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -22,6 +22,12 @@
 #pragma once
 #include <stdafx.h>
 
+#if not defined(_OPENMP)
+typedef int omp_int_t;
+inline omp_int_t omp_get_thread_num() { return 0;}
+inline omp_int_t omp_get_num_threads() { return 1;}
+#endif
+
 namespace libwalrus {
 
 /**

From 1deab6d1f5c8c5476a8833a09351207374d63f71 Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Mon, 13 Jan 2020 16:02:03 -0500
Subject: [PATCH 06/10] Revert clang-format doxygen changes

---
 include/repeated_hafnian.hpp | 78 +++++++++++++++---------------------
 1 file changed, 32 insertions(+), 46 deletions(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index 79b6b305c..7b4990c15 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -88,16 +88,14 @@ template <typename T> T get_binom_coeff(T N, T K) {
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be
- * significantly more efficient in the cases where the matrix has repeated rows
- * and columns.
+ * Note that this algorithm, while generally slower than others, can be significantly more
+ * efficient in the cases where the matrix has repeated rows and columns.
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
- * ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return hafnian of the input matrix
  */
 template <typename T>
@@ -185,23 +183,21 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
   return y;
 }
 
+
 /**
  * Returns the loop hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be
- * significantly more efficient in the cases where the matrix has repeated rows
- * and columns.
+ * Note that this algorithm, while generally slower than others, can be significantly more
+ * efficient in the cases where the matrix has repeated rows and columns.
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
- * @param mu a vector of length \f$n\f$ representing the vector of
- * means/displacement.
+ * @param mu a vector of length \f$n\f$ representing the vector of means/displacement.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
- * ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return loop hafnian of the input matrix
  */
 template <typename T>
@@ -313,12 +309,11 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be
- * significantly more efficient in the cases where the matrix has repeated rows
- * and columns.
+ * Note that this algorithm, while generally slower than others, can be significantly more
+ * efficient in the cases where the matrix has repeated rows and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for
- * Python integration. It accepts and returns complex double numeric types, and
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for Python
+ * integration. It accepts and returns complex double numeric types, and
  * returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
@@ -329,8 +324,7 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
  *      \f$n\times n\f$ row-ordered symmetric matrix.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
- * ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return hafnian of the input matrix
  */
 std::complex<double> hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
@@ -353,13 +347,12 @@ std::complex<double> hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be
- * significantly more efficient in the cases where the matrix has repeated rows
- * and columns.
+ * Note that this algorithm, while generally slower than others, can be significantly more
+ * efficient in the cases where the matrix has repeated rows and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for
- * Python integration. It accepts and returns double numeric types, and returns
- * sensible values for empty and non-even matrices.
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt` for Python
+ * integration. It accepts and returns double numeric types, and
+ * returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
  * to type `long double`, allowing for greater precision than supported
@@ -369,8 +362,7 @@ std::complex<double> hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
  *      \f$n\times n\f$ row-ordered symmetric matrix.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
- * ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return hafnian of the input matrix
  */
 double hafnian_rpt_quad(std::vector<double> &mat, std::vector<int> &rpt) {
@@ -392,13 +384,12 @@ double hafnian_rpt_quad(std::vector<double> &mat, std::vector<int> &rpt) {
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be
- * significantly more efficient in the cases where the matrix has repeated rows
- * and columns.
+ * Note that this algorithm, while generally slower than others, can be significantly more
+ * efficient in the cases where the matrix has repeated rows and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop`
- * for Python integration. It accepts and returns complex double numeric types,
- * and returns sensible values for empty and non-even matrices.
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop` for Python
+ * integration. It accepts and returns complex double numeric types, and
+ * returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
  * to type `complex<long double>`, allowing for greater precision than supported
@@ -406,12 +397,10 @@ double hafnian_rpt_quad(std::vector<double> &mat, std::vector<int> &rpt) {
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
- * @param mu a vector of length \f$n\f$ representing the vector of
- * means/displacement.
+ * @param mu a vector of length \f$n\f$ representing the vector of means/displacement.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
- * ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return loop hafnian of the input matrix
  */
 std::complex<double>
@@ -437,12 +426,11 @@ loop_hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
  * described in *From moments of sum to moments of product*,
  * [doi:10.1016/j.jmva.2007.01.013](https://dx.doi.org/10.1016/j.jmva.2007.01.013>).
  *
- * Note that this algorithm, while generally slower than others, can be
- * significantly more efficient in the cases where the matrix has repeated rows
- * and columns.
+ * Note that this algorithm, while generally slower than others, can be significantly more
+ * efficient in the cases where the matrix has repeated rows and columns.
  *
- * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop`
- * for Python integration. It accepts and returns double numeric types, and
+ * This is a wrapper around the templated function `libwalrus::hafnian_rpt_loop` for Python
+ * integration. It accepts and returns double numeric types, and
  * returns sensible values for empty and non-even matrices.
  *
  * In addition, this wrapper function automatically casts all matrices
@@ -451,12 +439,10 @@ loop_hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
  *
  * @param mat a flattened vector of size \f$n^2\f$, representing an
  *      \f$n\times n\f$ row-ordered symmetric matrix.
- * @param mu a vector of length \f$n\f$ representing the vector of
- * means/displacement.
+ * @param mu a vector of length \f$n\f$ representing the vector of means/displacement.
  * @param rpt a vector of integers, representing the number of
  *      times each row/column in `mat` is repeated. For example,
- *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all
- * ones.
+ *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return loop hafnian of the input matrix
  */
 double loop_hafnian_rpt_quad(std::vector<double> &mat, std::vector<double> &mu,

From d18c141ac598e3aafe6c0a789f02d290bad3630c Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Mon, 13 Jan 2020 16:35:24 -0500
Subject: [PATCH 07/10] Fix windows problem attempt #2

---
 include/repeated_hafnian.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index 7b4990c15..97f6e46d3 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -22,7 +22,7 @@
 #pragma once
 #include <stdafx.h>
 
-#if not defined(_OPENMP)
+#if defined(_MSC_VER)
 typedef int omp_int_t;
 inline omp_int_t omp_get_thread_num() { return 0;}
 inline omp_int_t omp_get_num_threads() { return 1;}

From 1fa6d7241558f7e41b8aea075c6d5641a6f41c36 Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Tue, 14 Jan 2020 11:59:24 -0500
Subject: [PATCH 08/10] Remove lines that are not needed anymore

---
 include/repeated_hafnian.hpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index 97f6e46d3..e7e911f58 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -48,13 +48,11 @@ std::vector<int> lin_to_multi(unsigned long long int linear_index, const std::ve
   unsigned int l = 0;
   int s0 = maxes[0] + 1;
   i[l] = linear_index % s0;
-  unsigned long long int prod = s0;
   unsigned long long int gl = linear_index;
 
   while (l < i.size() - 1) {
     unsigned long long int glp1 = (gl - i[l]) / (maxes[l] + 1);
     i[l + 1] = glp1 % (maxes[l + 1] + 1);
-    prod *= (maxes[1 + 1] + 1);
     gl = glp1;
     l++;
   }

From d99d0f87a6e5e06080077b432c9f560ae746ca48 Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Thu, 16 Jan 2020 17:05:27 -0500
Subject: [PATCH 09/10] Add minor optimizations for one core

---
 include/repeated_hafnian.hpp | 69 ++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 31 deletions(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index e7e911f58..c88417ebb 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -42,8 +42,8 @@ namespace libwalrus {
  *
  * @return hafnian of the input matrix
  */
-
-std::vector<int> lin_to_multi(unsigned long long int linear_index, const std::vector<int> &maxes) {
+std::vector<int> lin_to_multi(unsigned long long int linear_index,
+                              const std::vector<int> &maxes) {
   std::vector<int> i(maxes.size(), 0);
   unsigned int l = 0;
   int s0 = maxes[0] + 1;
@@ -68,7 +68,6 @@ std::vector<int> lin_to_multi(unsigned long long int linear_index, const std::ve
  *
  * @return N!/K!(N-K)!
  */
-
 template <typename T> T get_binom_coeff(T N, T K) {
   T r = 1;
   T d;
@@ -98,7 +97,6 @@ template <typename T> T get_binom_coeff(T N, T K) {
  */
 template <typename T>
 inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
-
   int n = std::sqrt(static_cast<double>(mat.size()));
   assert(static_cast<int>(rpt.size()) == n);
   T y = 0.0;
@@ -131,20 +129,24 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
     unsigned long long int end = (id == chunks - 1) ? steps : beg + chunk_size;
     T sum_chunk = 0.L;
 
-    std::vector<int> x = lin_to_multi(beg, rpt);
+    std::vector<int> x =
+        (chunks == 1) ? std::vector<int>(n, 0) : lin_to_multi(beg, rpt);
     std::vector<long double> nu2(n);
-    for (int i = 0; i < n; i++)
-      nu2[i] = 0.5 * rpt[i] - x[i];
+    for (int i = 0; i < n; i++) nu2[i] = 0.5 * rpt[i];
 
-    int x_sum = std::accumulate(x.begin(), x.end(), 0);
-    for (int i = 0; i < n; i++) {
-      p *= get_binom_coeff(rpt[i], x[i]);
+    if (chunks != 1) {
+      for (int i = 0; i < n; i++) {
+        p *= get_binom_coeff(rpt[i], x[i]);
+      }
+      p *= (std::accumulate(x.begin(), x.end(), 0) % 2 == 0) ? 1 : -1;
     }
-    p *= (x_sum % 2 == 0) ? 1 : -1;
 
     for (int i = 0; i < n; i++) {
-      for (int j = 0; j < n; j++) {
-        q += 0.5L * nu2[j] * mat[i * n + j] * nu2[i];
+      T nu2_min_x = (nu2[i] - static_cast<long double>(x[i]));
+      q += 0.5L * nu2_min_x * nu2_min_x * mat[i * n + i];
+      for (int j = 0; j < i; j++) {
+        q += (nu2[j] - static_cast<long double>(x[j])) * nu2_min_x *
+             mat[i * n + j];
       }
     }
 
@@ -152,13 +154,12 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
       sum_chunk += static_cast<long double>(p) * pow(q, s2);
 
       for (int j = 0; j < n; j++) {
-
         if (x[j] < rpt[j]) {
           x[j] += 1;
           p *= -static_cast<long double>(rpt[j] + 1 - x[j]) / x[j];
 
           for (int k = 0; k < n; k++) {
-            q -= mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+            q -= mat[k * n + j] * (nu2[k] - static_cast<long double>(x[k]));
           }
           q -= 0.5L * mat[j * n + j];
           break;
@@ -168,7 +169,8 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
             p *= -1;
           }
           for (int k = 0; k < n; k++) {
-            q += (1.0L * rpt[j]) * mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+            q += static_cast<long double>(rpt[j]) * mat[k * n + j] *
+                 (nu2[k] - static_cast<long double>(x[k]));
           }
           q -= 0.5L * rpt[j] * rpt[j] * mat[j * n + j];
         }
@@ -181,7 +183,6 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
   return y;
 }
 
-
 /**
  * Returns the loop hafnian of a matrix using the algorithm
  * described in *From moments of sum to moments of product*,
@@ -217,34 +218,39 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
     T q = 0.0L, q1 = 0.0L;
 
     int chunks = omp_get_num_threads();
-    int id = omp_get_thread_num();
+    int id = (chunks == 1) ? 0 : omp_get_thread_num();
     unsigned long long int chunk_size = steps / chunks;
     unsigned long long int beg = id * chunk_size;
     unsigned long long int end = (id == chunks - 1) ? steps : beg + chunk_size;
+
     T sum_chunk = 0.L;
 
-    std::vector<int> x = lin_to_multi(beg, rpt);
+    std::vector<int> x =
+        (chunks == 1) ? std::vector<int>(n, 0) : lin_to_multi(beg, rpt);
     int s = std::accumulate(rpt.begin(), rpt.end(), 0);
     int s1 = std::floor(0.5 * s) + 1;
     std::vector<T> z1(s1, 1.0L);
     std::vector<T> z2(s1, 1.0L);
 
-    std::vector<long double> nu2(n);
-    for (int i = 0; i < n; i++)
-      nu2[i] = 0.5 * rpt[i] - x[i];
+    std::vector<T> nu2(n);
+    for (int i = 0; i < n; i++) nu2[i] = 0.5 * rpt[i];
 
     for (int i = 0; i < n; i++) {
-      q1 += nu2[i] * mu[i];
-      for (int j = 0; j < n; j++) {
-        q += 0.5L * nu2[j] * mat[i * n + j] * nu2[i];
+      T nu2_min_x = (nu2[i] - static_cast<long double>(x[i]));
+      q1 += nu2_min_x * mu[i];
+      q += 0.5L * nu2_min_x * nu2_min_x * mat[i * n + i];
+      for (int j = 0; j < i; j++) {
+        q += (nu2[j] - static_cast<long double>(x[j])) * nu2_min_x *
+             mat[i * n + j];
       }
     }
 
-    int x_sum = std::accumulate(x.begin(), x.end(), 0);
-    for (int i = 0; i < n; i++) {
-      p *= get_binom_coeff(rpt[i], x[i]);
+    if (chunks != 1) {
+      for (int i = 0; i < n; i++) {
+        p *= get_binom_coeff(rpt[i], x[i]);
+      }
+      p *= (std::accumulate(x.begin(), x.end(), 0) % 2 == 0) ? 1 : -1;
     }
-    p *= (x_sum % 2 == 0) ? 1 : -1;
 
     for (unsigned long long int i = beg; i < end; i++) {
       for (int j = 1; j < s1; j++) {
@@ -276,7 +282,7 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
                           x[j]);
 
           for (int k = 0; k < n; k++) {
-            q -= mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+            q -= mat[k * n + j] * (nu2[k] - static_cast<long double>(x[k]));
           }
           q -= 0.5L * mat[j * n + j];
           q1 -= mu[j];
@@ -287,7 +293,8 @@ inline T loop_hafnian_rpt(std::vector<T> &mat, std::vector<T> &mu,
             p *= -1;
           }
           for (int k = 0; k < n; k++) {
-            q += (1.0L * rpt[j]) * mat[k * n + j] * (0.5L * rpt[k] - x[k]);
+            q += static_cast<long double>(rpt[j]) * mat[k * n + j] *
+                 (nu2[k] - static_cast<long double>(x[k]));
           }
           q -= 0.5L * rpt[j] * rpt[j] * mat[j * n + j];
           q1 += static_cast<long double>(rpt[j]) * mu[j];

From ba11956e535670f6f9e5c94b20364de490fa572d Mon Sep 17 00:00:00 2001
From: Trevor Vincent <trevor@xanadu.ai>
Date: Fri, 17 Jan 2020 15:32:12 -0500
Subject: [PATCH 10/10] Add josh suggestions

---
 include/repeated_hafnian.hpp | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/repeated_hafnian.hpp b/include/repeated_hafnian.hpp
index c88417ebb..d36176ce3 100644
--- a/include/repeated_hafnian.hpp
+++ b/include/repeated_hafnian.hpp
@@ -16,8 +16,8 @@
  * \rst
  * Contains functions for computing the hafnian using the algorithm
  * described in *From moments of sum to moments of product*,
- * `doi:10.1016/j.jmva.2007.01.013
- * <https://dx.doi.org/10.1016/j.jmva.2007.01.013>`__. \endrst
+ * `doi:10.1016/j.jmva.2007.01.013 <https://dx.doi.org/10.1016/j.jmva.2007.01.013>`__.
+ * \endrst
  */
 #pragma once
 #include <stdafx.h>
@@ -40,7 +40,7 @@ namespace libwalrus {
  * @param maxes a vector of integers, representing the max index value
  * of each indice in the multi-index object.
  *
- * @return hafnian of the input matrix
+ * @return multi-index corresponding to the linear index
  */
 std::vector<int> lin_to_multi(unsigned long long int linear_index,
                               const std::vector<int> &maxes) {
@@ -60,19 +60,19 @@ std::vector<int> lin_to_multi(unsigned long long int linear_index,
 }
 
 /**
- * Returns the binomial coefficient N!/K!(N-K)!
+ * Returns the binomial coefficient \f$N!/K!(N-K)!\f$
  * Adapted from http://blog.plover.com/math/choose.html
  *
  * @param N
  * @param K
  *
- * @return N!/K!(N-K)!
+ * @return \f$N!/K!(N-K)!\f$
  */
-template <typename T> T get_binom_coeff(T N, T K) {
+template <typename T>
+T get_binom_coeff(T N, T K) {
   T r = 1;
   T d;
-  if (K > N)
-    return 0;
+  if (K > N) return 0;
   for (d = 1; d <= K; d++) {
     r *= N--;
     r /= d;
@@ -106,7 +106,6 @@ inline T hafnian_rpt(std::vector<T> &mat, std::vector<int> &rpt) {
     T q = 0.0L;
 
     long double p = 2;
-    // std::vector<int> x(n, 0.0);
     int s = std::accumulate(rpt.begin(), rpt.end(), 0);
     int s2 = s / 2;
 
@@ -408,10 +407,9 @@ double hafnian_rpt_quad(std::vector<double> &mat, std::vector<int> &rpt) {
  *      `mat = [1]` and `rpt = [6]` represents a \f$6\times 6\f$ matrix of all ones.
  * @return loop hafnian of the input matrix
  */
-std::complex<double>
-loop_hafnian_rpt_quad(std::vector<std::complex<double>> &mat,
-                      std::vector<std::complex<double>> &mu,
-                      std::vector<int> &rpt) {
+std::complex<double> loop_hafnian_rpt_quad(
+    std::vector<std::complex<double>> &mat,
+    std::vector<std::complex<double>> &mu, std::vector<int> &rpt) {
   std::vector<std::complex<long double>> matq(mat.begin(), mat.end());
   std::vector<std::complex<long double>> muq(mu.begin(), mu.end());
   std::complex<long double> haf;