From f859521425f8b63c1d84789480b51013165f7065 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Wed, 3 May 2023 10:57:15 -0400
Subject: [PATCH 01/18] Start making a block coordinate descent decomposition

---
 btas/btas.h           |   1 +
 btas/generic/cp_bcd.h | 394 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 395 insertions(+)
 create mode 100644 btas/generic/cp_bcd.h

diff --git a/btas/btas.h b/btas/btas.h
index 53a9b008..095fd517 100644
--- a/btas/btas.h
+++ b/btas/btas.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include

diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
new file mode 100644
index 00000000..f8844834
--- /dev/null
+++ b/btas/generic/cp_bcd.h
@@ -0,0 +1,394 @@
+//
+// Created by Karl Pierce on 4/29/23.
+//
+
+#ifndef BTAS_GENERIC_CP_BCD_H
+#define BTAS_GENERIC_CP_BCD_H
+
+#include
+
+#include
+#include
+#include
+#include
+
+namespace btas{
+  /** \brief Computes the Canonical Product (CP) decomposition of an order-N
+    tensor using block coordinate descent (BCD).
+
+    This computes the CP decomposition of btas::Tensor objects with row-major
+    storage only, with fixed (compile-time) and variable (run-time)
+    ranks. Also provides Tucker and randomized Tucker-like compressions coupled
+    with CP-BCD decomposition. Does not support strided ranges.
+
+    \warning this code takes a non-const reference \c tensor_ref but does
+    not modify the values. This is a result of the API (reshape needs a non-const tensor)
+
+    Synopsis:
+    \code
+    // Constructors
+    CP_BCD A(tensor)                            // CP_BCD object with empty factor
+                                                // matrices and no symmetries
+    CP_BCD A(tensor, symms)                     // CP_BCD object with empty factor
+                                                // matrices and symmetries
+
+    // Operations
+    A.compute_rank(rank, converge_test)         // Computes the CP_BCD of tensor to
+                                                // rank, rank build and HOSVD options
+
+    A.compute_rank_random(rank, converge_test)  // Computes the CP_BCD of tensor to
+                                                // rank. Factor matrices built at rank
+                                                // with random numbers
+
+    A.compute_error(converge_test, omega)       // Computes the CP_BCD of tensor to
+                                                // 2-norm
+                                                // error < omega.
+
+    // See documentation for full range of options
+
+    // Accessing Factor Matrices
+    A.get_factor_matrices()                     // Returns a vector of factor matrices, if
+                                                // they have been computed
+
+    A.reconstruct()                             // Returns the tensor computed using the
+                                                // CP factor matrices
+    \endcode
+  */
+
+  template >
+  class CP_BCD : public CP_ALS {
+   public:
+    using T = typename Tensor::value_type;
+    using RT = real_type_t;
+    using RTensor = rebind_tensor_t;
+
+    using CP::A;
+    using CP::ndim;
+    using CP::symmetries;
+    using typename CP::ind_t;
+    using typename CP::ord_t;
+    using CP_ALS::tensor_ref;
+    using CP_ALS::size;
+
+    /// Create a CP BCD object, child class of the CP object
+    /// that stores the reference tensor.
+    /// Reference tensor has no symmetries.
+    /// \param[in] tensor the reference tensor to be decomposed.
+    CP_BCD(Tensor &tensor, ind_t block_size = 1) : CP_ALS(tensor), blocksize(block_size){
+    }
+
+    /// Create a CP BCD object, child class of the CP object
+    /// that stores the reference tensor.
+    /// Reference tensor has symmetries.
+    /// Symmetries should be set such that the higher mode indices
+    /// are set equal to lower mode indices (a 4th-order tensor
+    /// where the second & third modes are equal would have
+    /// symmetries of {0,1,1,3})
+    /// \param[in] tensor the reference tensor to be decomposed.
+    /// \param[in] symms the symmetries of the reference tensor.
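// Aside (illustrative example, not from the original patch; the symmetry
// vector's element type is assumed here):
//   std::vector<size_t> symms{0, 1, 1, 3};  // modes 1 and 2 equivalent
//   CP_BCD A(tensor, symms, /* block_size = */ 2);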
+    CP_BCD(Tensor &tensor, std::vector &symms, ind_t block_size = 1)
+        : CP_ALS(tensor, symms), blocksize(block_size) {
+    }
+
+    CP_BCD() = default;
+
+    ~CP_BCD() = default;
+
+   protected:
+    Tensor gradient, block_ref;  // Stores gradient of BCD for fitting.
+    ind_t blocksize;             // Size of block gradient
+    std::vector blockfactors;
+
+    /// computed the CP decomposition using ALS to minimize the loss function for fixed rank \p rank
+    /// \param[in] rank The rank of the CP decomposition.
+    /// \param[in, out] converge_test Test to see if ALS is converged, holds the value of fit.
+    /// \param[in] dir Should the CP decomposition be computed without calculating the
+    /// Khatri-Rao product?
+    /// \param[in] max_als the maximum number of ALS iterations
+    /// to compute before giving up
+    /// on CP-ALS. Default = 1e5.
+    /// \param[in] calculate_epsilon Should the 2-norm
+    /// error be calculated ||T_exact - T_approx|| = epsilon.
+    /// \param[in] tcutALS
+    /// How small the difference in factor matrices must be to consider ALS of a
+    /// single rank converged. Default = 0.1.
+    /// \param[in, out] epsilon The 2-norm
+    /// error between the exact and approximated reference tensor
+    /// \param[in,out] fast_pI Should the pseudo inverse be computed using a fast Cholesky decomposition;
+    /// on return \c fast_pI will be true if use of Cholesky was successful
+    virtual void ALS(ind_t rank, ConvClass &converge_test, bool dir, int max_als, bool calculate_epsilon, double &epsilon,
+                     bool &fast_pI){
+      std::vector order;
+      for(auto i = 0; i < ndim; ++i){
+        order.emplace_back(i);
+      }
+
+      block_ref = tensor_ref;
+
+      // Number of full blocks with rank blocksize
+      // plus one more block with rank rank % blocksize
+      int n_full_blocks = int( rank / blocksize),
+          last_blocksize = rank % blocksize;
+
+      long block_step = 0, z = 0;
+      this->AtA = std::vector(ndim);
+      // copying all of A to block factors.
Then we are going to take slices of A + // that way we can leverage all of `CP_ALS` code without modification + blockfactors = A; + // this stores the factors from rank 0 to #blocks * blocksize + std::vector current_grads(ndim); + // Compute all the BCD of the full blocks + for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) { + // Take the b-th block of the factor matrix also take the + // find the partial grammian for the blocked factors + for (size_t i = 0; i < ndim; ++i){ + auto & a_mat = A[i]; + auto lower = {z, block_step}, upper = {long(a_mat.extent(0)), block_step + blocksize}; + a_mat = make_view(blockfactors[i].range().slice(lower, upper), blockfactors[i].storage()); + contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); + } + + // Do the ALS loop + //CP_ALS::ALS(blocksize, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI); + // First do it manually so we know its right + // Compute ALS of the bth block against the gradient + // computed by subtracting the previous blocks from the reference + size_t count = 0; + bool is_converged = false; + bool matlab = true; + auto one_over_tref = 1.0 / norm(tensor_ref); + auto fit = 1.0, change = 0.0; + do { + ++count; + this->num_ALS++; + + for (size_t i = 0; i < ndim; i++) { + this->direct(i, blocksize, fast_pI, matlab, converge_test, gradient); + // BCD(i, block_step, block_step + blocksize, fast_pI, matlab, converge_test); + auto &ai = A[i]; + contract(this->one, ai, {1, 2}, ai.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); + // Copy the block computed in A to the correct portion in blockfactors + // (this will replace A at the end) + copy_blocks(blockfactors[i], A[i], block_step, block_step + blocksize); + } + // Copy the lambda values into the correct location in blockfactors + size_t c = 0; + for (auto b = block_step; b < block_step + blocksize; ++b, ++c) { + blockfactors[ndim](b) = A[ndim](c); + } + // Test the system to see if converged. Doing the hard way first + // To compute the accuracy fully compute CP approximation using blocks 0 through b. 
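// Aside: at this stage the slices taken below cover only the current
// block's columns, [block_step, block_step + blocksize), even though the
// comment above says "blocks 0 through b"; patch 04 later widens the lower
// bound to column 0 so the cumulative fit really spans every fitted block.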
+ for (size_t i = 0; i < ndim; ++i) { + auto & a_mat = blockfactors[i]; + auto lower = {z, block_step}, upper = {long(a_mat.extent(0)), block_step + blocksize}; + current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + } + + auto temp = reconstruct(current_grads, order, blockfactors[ndim]); + auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; + std::cout << newfit << std::endl; + change = abs(fit - newfit); + fit = newfit; + std::cout << fit << "\t" << change << std::endl; + is_converged = (change < 0.001); + if(is_converged) { + auto grad = gradient - temp; + auto grad_ptr = grad.begin(); + for(auto ptr = tensor_ref.begin(); ptr != tensor_ref.end(); ++ptr, ++grad_ptr) + *(ptr) = *(grad_ptr); + } + }while(count < max_als && !is_converged); + std::cout << tensor_ref << std::endl; + } + // Fix tensor_ref + auto ptr = tensor_ref.begin(); + for(auto g_ptr = gradient.begin(); g_ptr != gradient.end(); ++g_ptr, ++ptr) + *(ptr) = *(g_ptr); + } + + void copy_blocks(Tensor & to, Tensor & from, ind_t block_start, ind_t block_end){ + auto tref_dim = to.extent(0); + auto to_rank = to.extent(1), from_rank = from.extent(1); + + auto to_ptr = to.data(), from_ptr = from.data(); + ind_t from_pos = 0; + for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += to_rank){ + for (auto b = block_start; b < block_end; ++b, ++from_pos){ + *(to_ptr + skip + b) = *(from_ptr + from_pos); + } + } + } + /// Computes an optimized factor matrix holding all others constant. + /// No Khatri-Rao product computed, immediate contraction + + /// \param[in] n The mode being optimized, all other modes held constant + /// \param[in] rank The current rank, column dimension of the factor matrices + /// \param[in,out] fast_pI Should the pseudo inverse be computed using a fast cholesky decomposition + /// return if computing the \c fast_pI was successful. + /// \param[in, out] matlab If \c fast_pI = true then try to solve VA = B instead of taking pseudoinverse + /// in the same manner that matlab would compute the inverse. If this fails, variable will be manually + /// set to false and SVD will be used. + /// return if \c matlab was successful + /// \param[in, out] converge_test Test to see if ALS is converged, holds the value of fit. test to see if the ALS is converged + + void BCD(size_t n, ind_t block_start, ind_t block_end, bool &fast_pI, bool &matlab, ConvClass &converge_test, double lambda = 0.0) { + // Determine if n is the last mode, if it is first contract with first mode + // and transpose the product + bool last_dim = n == ndim - 1; + // product of all dimensions + ord_t LH_size = size; + size_t contract_dim = last_dim ? 0 : ndim - 1; + ind_t offset_dim = tensor_ref.extent(n); + ind_t brank = block_end - block_start; + ind_t pseudo_rank = brank; + + // Store the dimensions which are available to hadamard contract + std::vector dimensions; + for (size_t i = last_dim ? 1 : 0; i < (last_dim ? ndim : ndim - 1); i++) { + dimensions.push_back(tensor_ref.extent(i)); + } + + // Modifying the dimension of tensor_ref so store the range here to resize + Range R = tensor_ref.range(); + + // Resize the tensor which will store the product of tensor_ref and the first factor matrix + Tensor temp = Tensor(size / tensor_ref.extent(contract_dim), brank); + gradient.resize( + Range{Range1{last_dim ? tensor_ref.extent(contract_dim) : size / tensor_ref.extent(contract_dim)}, + Range1{last_dim ? 
size / tensor_ref.extent(contract_dim) : tensor_ref.extent(contract_dim)}}); + + // contract tensor ref and the first factor matrix + gemm((last_dim ? blas::Op::Trans : blas::Op::NoTrans), blas::Op::NoTrans, this->one , (last_dim? gradient.conj():gradient), blockfactors[contract_dim].conj(), this->zero, + temp); + + // Resize tensor_ref + gradient.resize(R); + // Remove the dimension which was just contracted out + LH_size /= tensor_ref.extent(contract_dim); + + // n tells which dimension not to contract, and contract_dim says which dimension I am trying to contract. + // If n == contract_dim then that mode is skipped. + // if n == ndim - 1, my contract_dim = 0. The gemm transposes to make rank = ndim - 1, so I + // move the pointer that preserves the last dimension to n = ndim -2. + // In all cases I want to walk through the orders in tensor_ref backward so contract_dim = ndim - 2 + n = last_dim ? ndim - 2 : n; + contract_dim = ndim - 2; + + while (contract_dim > 0) { + // Now temp is three index object where temp has size + // (size of tensor_ref/product of dimension contracted, dimension to be + // contracted, rank) + ord_t idx2 = dimensions[contract_dim], + idx1 = LH_size / idx2; + temp.resize( + Range{Range1{idx1}, Range1{idx2}, Range1{pseudo_rank}}); + Tensor contract_tensor; + //Tensor contract_tensor(Range{Range1{idx1}, Range1{pseudo_rank}}); + //contract_tensor.fill(0.0); + const auto &a = blockfactors[(last_dim ? contract_dim + 1 : contract_dim)]; + // If the middle dimension is the mode not being contracted, I will move + // it to the right hand side temp((size of tensor_ref/product of + // dimension contracted, rank * mode n dimension) + if (n == contract_dim) { + pseudo_rank *= offset_dim; + } + + // If the code hasn't hit the mode of interest yet, it will contract + // over the middle dimension and sum over the rank. + + else if (contract_dim > n) { + middle_contract(this->one, temp, a.conj(), this->zero, contract_tensor); + temp = contract_tensor; + } + + // If the code has passed the mode of interest, it will contract over + // the middle dimension and sum over rank * mode n dimension + else { + middle_contract_with_pseudorank(this->one, temp, a.conj(), this->zero, contract_tensor); + temp = contract_tensor; + } + + LH_size /= idx2; + contract_dim--; + } + n = last_dim ? n+1 : n; + + // If the mode of interest is the 0th mode, then the while loop above + // contracts over all other dimensions and resulting temp is of the + // correct dimension If the mode of interest isn't 0th mode, must contract + // out the 0th mode here, the above algorithm can't perform this + // contraction because the mode of interest is coupled with the rank + if (n != 0) { + ind_t idx1 = dimensions[0]; + temp.resize(Range{Range1{idx1}, Range1{offset_dim}, Range1{brank}}); + Tensor contract_tensor(Range{Range1{offset_dim}, Range1{brank}}); + contract_tensor.fill(0.0); + + const auto &a = blockfactors[(last_dim ? 
1 : 0)]; + front_contract(this->one, temp, a.conj(), this->zero, contract_tensor); + + temp = contract_tensor; + } + + // Add lambda to factor matrices if RALS + if(lambda !=0){ + auto LamA = blockfactors[n]; + scal(lambda, LamA); + temp += LamA; + } + + // multiply resulting matrix temp by pseudoinverse to calculate optimized + // factor matrix + pseudoinverse_block(n, brank, fast_pI, matlab, temp, lambda); + + // Normalize the columns of the new factor matrix and update + this->normCol(temp, block_start); + auto ptr = temp.data(); + blockfactors[n] = temp; + { + auto tref_dim = tensor_ref.extent(n); + auto A_ptr = A[n].data(); + auto rank = A[n].extent(1); + auto temp_pos = 0; + for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += rank){ + for (auto b = block_start; b < block_end; ++b, ++temp_pos){ + *(A_ptr + skip + b) = *(ptr + temp_pos); + } + } + } + } + + /// Calculates the column norms of a matrix and saves the norm values into + /// lambda tensor (last matrix in the A) + + /// \param[in, out] Mat The matrix whose column will be normalized, return + /// \c Mat with all columns normalized + + void normCol(Tensor &Mat, ord_t block_start) { + if (Mat.rank() > 2) BTAS_EXCEPTION("normCol with rank > 2 not yet supported"); + ind_t rank = Mat.extent(1), + Nsize = Mat.extent(0); + ord_t size = Mat.size(); + + auto Mat_ptr = Mat.data(); + std::vector lambda; + for(auto i = 0; i < rank; ++i) lambda.emplace_back(T(0.0)); + + auto lam_ptr = lambda.data(); + for (ord_t i = 0; i < size; ++i) { + *(lam_ptr + i % rank) += *(Mat_ptr + i) * btas::impl::conj(*(Mat_ptr + i)); + } + + auto A_ptr = A[ndim].data() + block_start; + for (ind_t i = 0; i < rank; ++i) { + auto val = sqrt(*(lam_ptr + i)); + *(A_ptr + i) = val; + val = (abs(val) < 1e-12 ? 0.0 : 1.0 / val); + btas::scal(Nsize, val, (Mat_ptr + i), rank); + } + } + + }; +} + +#endif // BTAS_GENERIC_CP_BCD_H From bc235f03d70c2810ee8b340ec1cda97ee3f0ed08 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 3 May 2023 15:41:19 -0400 Subject: [PATCH 02/18] Use target instead of assuming tensor_ref For BCD --- btas/generic/cp_als.h | 36 ++++++++++++++++++------------------ btas/generic/cp_rals.h | 4 ++-- btas/generic/tuck_cp_als.h | 2 +- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h index 132956ce..12bba3f9 100644 --- a/btas/generic/cp_als.h +++ b/btas/generic/cp_als.h @@ -625,7 +625,7 @@ namespace btas { if (tmp != i) { A[i] = A[tmp]; } else if (dir) { - direct(i, rank, fast_pI, matlab, converge_test); + direct(i, rank, fast_pI, matlab, converge_test, tensor_ref); } else { update_w_KRP(i, rank, fast_pI, matlab, converge_test); } @@ -737,52 +737,52 @@ namespace btas { /// return if \c matlab was successful /// \param[in, out] converge_test Test to see if ALS is converged, holds the value of fit. test to see if the ALS is converged - void direct(size_t n, ind_t rank, bool &fast_pI, bool &matlab, ConvClass &converge_test, double lambda = 0.0) { + void direct(size_t n, ind_t rank, bool &fast_pI, bool &matlab, ConvClass &converge_test, Tensor& target, double lambda = 0.0) { // Determine if n is the last mode, if it is first contract with first mode // and transpose the product bool last_dim = n == ndim - 1; // product of all dimensions ord_t LH_size = size; size_t contract_dim = last_dim ? 
0 : ndim - 1; - ind_t offset_dim = tensor_ref.extent(n); + ind_t offset_dim = target.extent(n); ind_t pseudo_rank = rank; // Store the dimensions which are available to hadamard contract std::vector dimensions; for (size_t i = last_dim ? 1 : 0; i < (last_dim ? ndim : ndim - 1); i++) { - dimensions.push_back(tensor_ref.extent(i)); + dimensions.push_back(target.extent(i)); } - // Modifying the dimension of tensor_ref so store the range here to resize - Range R = tensor_ref.range(); + // Modifying the dimension of target so store the range here to resize + Range R = target.range(); //Tensor an(A[n].range()); - // Resize the tensor which will store the product of tensor_ref and the first factor matrix - Tensor temp = Tensor(size / tensor_ref.extent(contract_dim), rank); - tensor_ref.resize( - Range{Range1{last_dim ? tensor_ref.extent(contract_dim) : size / tensor_ref.extent(contract_dim)}, - Range1{last_dim ? size / tensor_ref.extent(contract_dim) : tensor_ref.extent(contract_dim)}}); + // Resize the tensor which will store the product of target and the first factor matrix + Tensor temp = Tensor(size / target.extent(contract_dim), rank); + target.resize( + Range{Range1{last_dim ? target.extent(contract_dim) : size / target.extent(contract_dim)}, + Range1{last_dim ? size / target.extent(contract_dim) : target.extent(contract_dim)}}); // contract tensor ref and the first factor matrix - gemm((last_dim ? blas::Op::Trans : blas::Op::NoTrans), blas::Op::NoTrans, this->one , (last_dim? tensor_ref.conj():tensor_ref), A[contract_dim].conj(), this->zero, + gemm((last_dim ? blas::Op::Trans : blas::Op::NoTrans), blas::Op::NoTrans, this->one , (last_dim? target.conj():target), A[contract_dim].conj(), this->zero, temp); - // Resize tensor_ref - tensor_ref.resize(R); + // Resize target + target.resize(R); // Remove the dimension which was just contracted out - LH_size /= tensor_ref.extent(contract_dim); + LH_size /= target.extent(contract_dim); // n tells which dimension not to contract, and contract_dim says which dimension I am trying to contract. // If n == contract_dim then that mode is skipped. // if n == ndim - 1, my contract_dim = 0. The gemm transposes to make rank = ndim - 1, so I // move the pointer that preserves the last dimension to n = ndim -2. - // In all cases I want to walk through the orders in tensor_ref backward so contract_dim = ndim - 2 + // In all cases I want to walk through the orders in target backward so contract_dim = ndim - 2 n = last_dim ? ndim - 2 : n; contract_dim = ndim - 2; while (contract_dim > 0) { // Now temp is three index object where temp has size - // (size of tensor_ref/product of dimension contracted, dimension to be + // (size of target/product of dimension contracted, dimension to be // contracted, rank) ord_t idx2 = dimensions[contract_dim], idx1 = LH_size / idx2; @@ -793,7 +793,7 @@ namespace btas { //contract_tensor.fill(0.0); const auto &a = A[(last_dim ? 
contract_dim + 1 : contract_dim)]; // If the middle dimension is the mode not being contracted, I will move - // it to the right hand side temp((size of tensor_ref/product of + // it to the right hand side temp((size of target/product of // dimension contracted, rank * mode n dimension) if (n == contract_dim) { pseudo_rank *= offset_dim; diff --git a/btas/generic/cp_rals.h b/btas/generic/cp_rals.h index 55e612b9..a3d6ef08 100644 --- a/btas/generic/cp_rals.h +++ b/btas/generic/cp_rals.h @@ -172,9 +172,9 @@ namespace btas { A[i] = A[tmp]; lambda[i] = lambda[tmp]; } else if (dir) { - this->direct(i, rank, fast_pI, matlab, converge_test, lambda[i]); + this->direct(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]); } else { - update_w_KRP(i, rank, fast_pI, matlab, converge_test, lambda[i]); + update_w_KRP(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]); } // Compute the value s after normalizing the columns auto & ai = A[i]; diff --git a/btas/generic/tuck_cp_als.h b/btas/generic/tuck_cp_als.h index 25d0db81..469723df 100644 --- a/btas/generic/tuck_cp_als.h +++ b/btas/generic/tuck_cp_als.h @@ -484,7 +484,7 @@ namespace btas{ count++; this->num_ALS++; for (size_t i = 0; i < ndim; i++) { - this->direct(i, rank, fast_pI, matlab, converge_test, lambda[i]); + this->direct(i, rank, fast_pI, matlab, converge_test, tensor_ref, lambda[i]); // Compute the value s after normalizing the columns auto & ai = A[i]; this->s = helper(i, ai); From 4079f1a26c3642a3fabfa9b2d2cb797148f7450e Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Wed, 3 May 2023 15:48:07 -0400 Subject: [PATCH 03/18] Start fixing BCD --- btas/generic/cp_bcd.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index f8844834..49db5953 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -192,13 +192,10 @@ namespace btas{ std::cout << fit << "\t" << change << std::endl; is_converged = (change < 0.001); if(is_converged) { - auto grad = gradient - temp; - auto grad_ptr = grad.begin(); - for(auto ptr = tensor_ref.begin(); ptr != tensor_ref.end(); ++ptr, ++grad_ptr) - *(ptr) = *(grad_ptr); + gradient = tensor_ref - temp; } }while(count < max_als && !is_converged); - std::cout << tensor_ref << std::endl; + std::cout << gradient << std::endl; } // Fix tensor_ref auto ptr = tensor_ref.begin(); From f4a5ab8ca500ee5a0d5411024d77e028a58db87b Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Thu, 4 May 2023 12:28:17 -0400 Subject: [PATCH 04/18] Fix cp_bcd Working for block size of 1 and rank. Working on intermediate block size, specifically if there is a remainder not working --- btas/generic/cp_bcd.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index 49db5953..e7c9dc54 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -135,6 +135,7 @@ namespace btas{ // copying all of A to block factors. 
Then we are going to take slices of A // that way we can leverage all of `CP_ALS` code without modification blockfactors = A; + gradient = tensor_ref; // this stores the factors from rank 0 to #blocks * blocksize std::vector current_grads(ndim); // Compute all the BCD of the full blocks @@ -150,7 +151,7 @@ namespace btas{ // Do the ALS loop //CP_ALS::ALS(blocksize, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI); - // First do it manually so we know its right + // First do it manually, so we know its right // Compute ALS of the bth block against the gradient // computed by subtracting the previous blocks from the reference size_t count = 0; @@ -158,6 +159,7 @@ namespace btas{ bool matlab = true; auto one_over_tref = 1.0 / norm(tensor_ref); auto fit = 1.0, change = 0.0; + detail::set_norm(converge_test, norm(gradient)); do { ++count; this->num_ALS++; @@ -171,6 +173,8 @@ namespace btas{ // (this will replace A at the end) copy_blocks(blockfactors[i], A[i], block_step, block_step + blocksize); } + + auto grad = reconstruct(A, order, A[ndim]); // Copy the lambda values into the correct location in blockfactors size_t c = 0; for (auto b = block_step; b < block_step + blocksize; ++b, ++c) { @@ -180,22 +184,22 @@ namespace btas{ // To compute the accuracy fully compute CP approximation using blocks 0 through b. for (size_t i = 0; i < ndim; ++i) { auto & a_mat = blockfactors[i]; - auto lower = {z, block_step}, upper = {long(a_mat.extent(0)), block_step + blocksize}; + auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize}; current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); } auto temp = reconstruct(current_grads, order, blockfactors[ndim]); auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; - std::cout << newfit << std::endl; change = abs(fit - newfit); fit = newfit; std::cout << fit << "\t" << change << std::endl; is_converged = (change < 0.001); + + is_converged = converge_test(A, this->AtA); if(is_converged) { gradient = tensor_ref - temp; } }while(count < max_als && !is_converged); - std::cout << gradient << std::endl; } // Fix tensor_ref auto ptr = tensor_ref.begin(); From 1f2f05af82946600ec0f09046575b99480b93dc4 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Thu, 4 May 2023 12:28:30 -0400 Subject: [PATCH 05/18] Add function to change norm of converge_test --- btas/generic/cp.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/btas/generic/cp.h b/btas/generic/cp.h index 8b557289..384b6e08 100644 --- a/btas/generic/cp.h +++ b/btas/generic/cp.h @@ -71,6 +71,16 @@ namespace btas { epsilon = t.get_fit(max_iter); return; } + + template + void set_norm(T &t, double norm){ + return; + } + + template + void set_norm(FitCheck &t, double norm){ + t.set_norm(norm); + } } // namespace detail /** \brief Base class to compute the Canonical Product (CP) decomposition of an order-N From 62da0718aeb9d4206f9dd24b78242b53c716c2d6 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 8 May 2023 13:02:51 -0400 Subject: [PATCH 06/18] Update cp_bcd.h --- btas/generic/cp_bcd.h | 279 ++++++++++-------------------------------- 1 file changed, 65 insertions(+), 214 deletions(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index e7c9dc54..d502a5ca 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -98,6 +98,8 @@ namespace btas{ Tensor gradient, block_ref; // Stores gradient of BCD for fitting. 
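// Aside: `gradient` acts as a running residual. Patch 04 initializes it to
// tensor_ref, and later patches subtract each converged block's
// reconstruction from it, so each BCD subproblem is an ordinary CP-ALS fit
// of one column block against what the earlier blocks have not explained.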
ind_t blocksize; // Size of block gradient std::vector blockfactors; + std::vector order; // convenient to write order of tensors for reconstruct + long z = 0; /// computed the CP decomposition using ALS to minimize the loss function for fixed rank \p rank /// \param[in] rank The rank of the CP decomposition. @@ -118,7 +120,6 @@ namespace btas{ /// on return \c fast_pI will be true if use of Cholesky was successful virtual void ALS(ind_t rank, ConvClass &converge_test, bool dir, int max_als, bool calculate_epsilon, double &epsilon, bool &fast_pI){ - std::vector order; for(auto i = 0; i < ndim; ++i){ order.emplace_back(i); } @@ -130,81 +131,18 @@ namespace btas{ int n_full_blocks = int( rank / blocksize), last_blocksize = rank % blocksize; - long block_step = 0, z = 0; + long block_step = 0; this->AtA = std::vector(ndim); // copying all of A to block factors. Then we are going to take slices of A // that way we can leverage all of `CP_ALS` code without modification blockfactors = A; gradient = tensor_ref; // this stores the factors from rank 0 to #blocks * blocksize - std::vector current_grads(ndim); // Compute all the BCD of the full blocks + auto matlab = true; for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) { - // Take the b-th block of the factor matrix also take the - // find the partial grammian for the blocked factors - for (size_t i = 0; i < ndim; ++i){ - auto & a_mat = A[i]; - auto lower = {z, block_step}, upper = {long(a_mat.extent(0)), block_step + blocksize}; - a_mat = make_view(blockfactors[i].range().slice(lower, upper), blockfactors[i].storage()); - contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); - } - - // Do the ALS loop - //CP_ALS::ALS(blocksize, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI); - // First do it manually, so we know its right - // Compute ALS of the bth block against the gradient - // computed by subtracting the previous blocks from the reference - size_t count = 0; - bool is_converged = false; - bool matlab = true; - auto one_over_tref = 1.0 / norm(tensor_ref); - auto fit = 1.0, change = 0.0; - detail::set_norm(converge_test, norm(gradient)); - do { - ++count; - this->num_ALS++; - - for (size_t i = 0; i < ndim; i++) { - this->direct(i, blocksize, fast_pI, matlab, converge_test, gradient); - // BCD(i, block_step, block_step + blocksize, fast_pI, matlab, converge_test); - auto &ai = A[i]; - contract(this->one, ai, {1, 2}, ai.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); - // Copy the block computed in A to the correct portion in blockfactors - // (this will replace A at the end) - copy_blocks(blockfactors[i], A[i], block_step, block_step + blocksize); - } - - auto grad = reconstruct(A, order, A[ndim]); - // Copy the lambda values into the correct location in blockfactors - size_t c = 0; - for (auto b = block_step; b < block_step + blocksize; ++b, ++c) { - blockfactors[ndim](b) = A[ndim](c); - } - // Test the system to see if converged. Doing the hard way first - // To compute the accuracy fully compute CP approximation using blocks 0 through b. 
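// Aside: the fit in the code removed here is the usual CP quality measure,
//   fit = 1 - ||T - T_approx|| / ||T||,
// with the block declared converged once the change in fit fell below
// 0.001; this patch moves that logic out of ALS() into the new BCD() below.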
- for (size_t i = 0; i < ndim; ++i) { - auto & a_mat = blockfactors[i]; - auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize}; - current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); - } - - auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; - change = abs(fit - newfit); - fit = newfit; - std::cout << fit << "\t" << change << std::endl; - is_converged = (change < 0.001); - - is_converged = converge_test(A, this->AtA); - if(is_converged) { - gradient = tensor_ref - temp; - } - }while(count < max_als && !is_converged); + BCD(block_step, block_step + blocksize, max_als, fast_pI, matlab, converge_test); } - // Fix tensor_ref - auto ptr = tensor_ref.begin(); - for(auto g_ptr = gradient.begin(); g_ptr != gradient.end(); ++g_ptr, ++ptr) - *(ptr) = *(g_ptr); } void copy_blocks(Tensor & to, Tensor & from, ind_t block_start, ind_t block_end){ @@ -232,161 +170,74 @@ namespace btas{ /// return if \c matlab was successful /// \param[in, out] converge_test Test to see if ALS is converged, holds the value of fit. test to see if the ALS is converged - void BCD(size_t n, ind_t block_start, ind_t block_end, bool &fast_pI, bool &matlab, ConvClass &converge_test, double lambda = 0.0) { - // Determine if n is the last mode, if it is first contract with first mode - // and transpose the product - bool last_dim = n == ndim - 1; - // product of all dimensions - ord_t LH_size = size; - size_t contract_dim = last_dim ? 0 : ndim - 1; - ind_t offset_dim = tensor_ref.extent(n); - ind_t brank = block_end - block_start; - ind_t pseudo_rank = brank; - - // Store the dimensions which are available to hadamard contract - std::vector dimensions; - for (size_t i = last_dim ? 1 : 0; i < (last_dim ? ndim : ndim - 1); i++) { - dimensions.push_back(tensor_ref.extent(i)); + void BCD(ind_t block_start, ind_t block_end, size_t max_als, + bool &fast_pI, bool &matlab, ConvClass &converge_test, double lambda = 0.0){ + // Take the b-th block of the factor matrix also take the + // find the partial grammian for the blocked factors + auto cur_block_size = block_end - block_start; + for (size_t i = 0; i < ndim; ++i){ + auto & a_mat = A[i]; + auto lower = {z, block_start}, upper = {long(a_mat.extent(0)), block_end}; + a_mat = make_view(blockfactors[i].range().slice(lower, upper), blockfactors[i].storage()); + contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); } - - // Modifying the dimension of tensor_ref so store the range here to resize - Range R = tensor_ref.range(); - - // Resize the tensor which will store the product of tensor_ref and the first factor matrix - Tensor temp = Tensor(size / tensor_ref.extent(contract_dim), brank); - gradient.resize( - Range{Range1{last_dim ? tensor_ref.extent(contract_dim) : size / tensor_ref.extent(contract_dim)}, - Range1{last_dim ? size / tensor_ref.extent(contract_dim) : tensor_ref.extent(contract_dim)}}); - - // contract tensor ref and the first factor matrix - gemm((last_dim ? blas::Op::Trans : blas::Op::NoTrans), blas::Op::NoTrans, this->one , (last_dim? gradient.conj():gradient), blockfactors[contract_dim].conj(), this->zero, - temp); - - // Resize tensor_ref - gradient.resize(R); - // Remove the dimension which was just contracted out - LH_size /= tensor_ref.extent(contract_dim); - - // n tells which dimension not to contract, and contract_dim says which dimension I am trying to contract. 
- // If n == contract_dim then that mode is skipped. - // if n == ndim - 1, my contract_dim = 0. The gemm transposes to make rank = ndim - 1, so I - // move the pointer that preserves the last dimension to n = ndim -2. - // In all cases I want to walk through the orders in tensor_ref backward so contract_dim = ndim - 2 - n = last_dim ? ndim - 2 : n; - contract_dim = ndim - 2; - - while (contract_dim > 0) { - // Now temp is three index object where temp has size - // (size of tensor_ref/product of dimension contracted, dimension to be - // contracted, rank) - ord_t idx2 = dimensions[contract_dim], - idx1 = LH_size / idx2; - temp.resize( - Range{Range1{idx1}, Range1{idx2}, Range1{pseudo_rank}}); - Tensor contract_tensor; - //Tensor contract_tensor(Range{Range1{idx1}, Range1{pseudo_rank}}); - //contract_tensor.fill(0.0); - const auto &a = blockfactors[(last_dim ? contract_dim + 1 : contract_dim)]; - // If the middle dimension is the mode not being contracted, I will move - // it to the right hand side temp((size of tensor_ref/product of - // dimension contracted, rank * mode n dimension) - if (n == contract_dim) { - pseudo_rank *= offset_dim; + A[ndim] = Tensor(cur_block_size); + A[ndim].fill(0.0); + // Do the ALS loop + //CP_ALS::ALS(cur_block_size, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI); + // First do it manually, so we know its right + // Compute ALS of the bth block against the gradient + // computed by subtracting the previous blocks from the reference + size_t count = 0; + bool is_converged = false; + auto one_over_tref = 1.0 / norm(tensor_ref); + auto fit = 1.0, change = 0.0; + detail::set_norm(converge_test, norm(gradient)); + do { + ++count; + this->num_ALS++; + + for (size_t i = 0; i < ndim; i++) { + this->direct(i, cur_block_size, fast_pI, matlab, converge_test, gradient); + auto &ai = A[i]; + contract(this->one, ai, {1, 2}, ai.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); + // Copy the block computed in A to the correct portion in blockfactors + // (this will replace A at the end) + copy_blocks(blockfactors[i], A[i], block_start, block_end); } - // If the code hasn't hit the mode of interest yet, it will contract - // over the middle dimension and sum over the rank. - - else if (contract_dim > n) { - middle_contract(this->one, temp, a.conj(), this->zero, contract_tensor); - temp = contract_tensor; + // Copy the lambda values into the correct location in blockfactors + size_t c = 0; + for (auto b = block_start; b < block_end; ++b, ++c) { + blockfactors[ndim](b) = A[ndim](c); } + // Test the system to see if converged. Doing the hard way first + // To compute the accuracy fully compute CP approximation using blocks 0 through b. 
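// Aside: scoring the model this way costs a full reconstruction of a tensor
// the size of T, so the cumulative-fit branch below is best read as a
// diagnostic; patch 08 later flips compute_full_fit to false and relies on
// the cheaper Gram-matrix based converge_test instead.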
+ bool compute_full_fit = true; + Tensor temp; + if(compute_full_fit) { + std::vector current_grads(ndim); + for (size_t i = 0; i < ndim; ++i) { + auto &a_mat = blockfactors[i]; + auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_end}; + current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + } - // If the code has passed the mode of interest, it will contract over - // the middle dimension and sum over rank * mode n dimension - else { - middle_contract_with_pseudorank(this->one, temp, a.conj(), this->zero, contract_tensor); - temp = contract_tensor; + temp = reconstruct(current_grads, order, blockfactors[ndim]); + auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; + change = abs(fit - newfit); + fit = newfit; + std::cout << block_end << "\t"; + std::cout << fit << "\t" << change << std::endl; } - LH_size /= idx2; - contract_dim--; - } - n = last_dim ? n+1 : n; - - // If the mode of interest is the 0th mode, then the while loop above - // contracts over all other dimensions and resulting temp is of the - // correct dimension If the mode of interest isn't 0th mode, must contract - // out the 0th mode here, the above algorithm can't perform this - // contraction because the mode of interest is coupled with the rank - if (n != 0) { - ind_t idx1 = dimensions[0]; - temp.resize(Range{Range1{idx1}, Range1{offset_dim}, Range1{brank}}); - Tensor contract_tensor(Range{Range1{offset_dim}, Range1{brank}}); - contract_tensor.fill(0.0); - - const auto &a = blockfactors[(last_dim ? 1 : 0)]; - front_contract(this->one, temp, a.conj(), this->zero, contract_tensor); - - temp = contract_tensor; - } - - // Add lambda to factor matrices if RALS - if(lambda !=0){ - auto LamA = blockfactors[n]; - scal(lambda, LamA); - temp += LamA; - } - - // multiply resulting matrix temp by pseudoinverse to calculate optimized - // factor matrix - pseudoinverse_block(n, brank, fast_pI, matlab, temp, lambda); - - // Normalize the columns of the new factor matrix and update - this->normCol(temp, block_start); - auto ptr = temp.data(); - blockfactors[n] = temp; - { - auto tref_dim = tensor_ref.extent(n); - auto A_ptr = A[n].data(); - auto rank = A[n].extent(1); - auto temp_pos = 0; - for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += rank){ - for (auto b = block_start; b < block_end; ++b, ++temp_pos){ - *(A_ptr + skip + b) = *(ptr + temp_pos); - } + is_converged = converge_test(A, this->AtA); + if(1.0 - fit < 1e-8) is_converged = true; + if(is_converged) { + gradient -= reconstruct(A, order, A[ndim]); + std::cout << norm(gradient - (tensor_ref - temp)) << std::endl; } - } - } - - /// Calculates the column norms of a matrix and saves the norm values into - /// lambda tensor (last matrix in the A) - - /// \param[in, out] Mat The matrix whose column will be normalized, return - /// \c Mat with all columns normalized - - void normCol(Tensor &Mat, ord_t block_start) { - if (Mat.rank() > 2) BTAS_EXCEPTION("normCol with rank > 2 not yet supported"); - ind_t rank = Mat.extent(1), - Nsize = Mat.extent(0); - ord_t size = Mat.size(); - - auto Mat_ptr = Mat.data(); - std::vector lambda; - for(auto i = 0; i < rank; ++i) lambda.emplace_back(T(0.0)); - - auto lam_ptr = lambda.data(); - for (ord_t i = 0; i < size; ++i) { - *(lam_ptr + i % rank) += *(Mat_ptr + i) * btas::impl::conj(*(Mat_ptr + i)); - } - - auto A_ptr = A[ndim].data() + block_start; - for (ind_t i = 0; i < rank; ++i) { - auto val = sqrt(*(lam_ptr + i)); - *(A_ptr + i) = val; - val = (abs(val) < 1e-12 ? 
0.0 : 1.0 / val); - btas::scal(Nsize, val, (Mat_ptr + i), rank); - } + }while(count < max_als && !is_converged); } }; From df4938ba8653961aa3cae65a8a83aa03303eda22 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Mon, 8 May 2023 14:41:34 -0400 Subject: [PATCH 07/18] BCD now completely working! --- btas/generic/cp_bcd.h | 64 +++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 23 deletions(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index d502a5ca..f6141439 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -140,9 +140,50 @@ namespace btas{ // this stores the factors from rank 0 to #blocks * blocksize // Compute all the BCD of the full blocks auto matlab = true; + auto one_over_tref = 1.0 / norm(tensor_ref); + auto fit = 1.0, change = 0.0; + bool compute_full_fit = true; for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) { BCD(block_step, block_step + blocksize, max_als, fast_pI, matlab, converge_test); + // Test the system to see if converged. Doing the hard way first + // To compute the accuracy fully compute CP approximation using blocks 0 through b. + if(compute_full_fit) { + std::vector current_grads(ndim); + for (size_t i = 0; i < ndim; ++i) { + auto &a_mat = blockfactors[i]; + auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize}; + current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + } + + auto temp = reconstruct(current_grads, order, blockfactors[ndim]); + auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; + change = abs(fit - newfit); + fit = newfit; + std::cout << block_step + blocksize << "\t"; + std::cout << fit << "\t" << change << std::endl; + } } + if(last_blocksize != 0) { + this->AtA = std::vector(ndim); + block_step = n_full_blocks * blocksize; + BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test); + if(compute_full_fit) { + std::vector current_grads(ndim); + for (size_t i = 0; i < ndim; ++i) { + auto &a_mat = blockfactors[i]; + auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + last_blocksize}; + current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + } + + auto temp = reconstruct(current_grads, order, blockfactors[ndim]); + auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; + change = abs(fit - newfit); + fit = newfit; + std::cout << block_step + blocksize << "\t"; + std::cout << fit << "\t" << change << std::endl; + } + } + A = blockfactors; } void copy_blocks(Tensor & to, Tensor & from, ind_t block_start, ind_t block_end){ @@ -190,8 +231,6 @@ namespace btas{ // computed by subtracting the previous blocks from the reference size_t count = 0; bool is_converged = false; - auto one_over_tref = 1.0 / norm(tensor_ref); - auto fit = 1.0, change = 0.0; detail::set_norm(converge_test, norm(gradient)); do { ++count; @@ -211,31 +250,10 @@ namespace btas{ for (auto b = block_start; b < block_end; ++b, ++c) { blockfactors[ndim](b) = A[ndim](c); } - // Test the system to see if converged. Doing the hard way first - // To compute the accuracy fully compute CP approximation using blocks 0 through b. 
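// Aside: from this patch on, the per-block loop in BCD() is gated only by
// converge_test(A, AtA), where each Gram matrix is formed as
//   AtA[i](r, s) = sum_k A[i](k, r) * conj(A[i](k, s));
// the reconstruct()-based cumulative fit below stays as an optional check.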
- bool compute_full_fit = true; - Tensor temp; - if(compute_full_fit) { - std::vector current_grads(ndim); - for (size_t i = 0; i < ndim; ++i) { - auto &a_mat = blockfactors[i]; - auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_end}; - current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); - } - - temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; - change = abs(fit - newfit); - fit = newfit; - std::cout << block_end << "\t"; - std::cout << fit << "\t" << change << std::endl; - } is_converged = converge_test(A, this->AtA); - if(1.0 - fit < 1e-8) is_converged = true; if(is_converged) { gradient -= reconstruct(A, order, A[ndim]); - std::cout << norm(gradient - (tensor_ref - temp)) << std::endl; } }while(count < max_als && !is_converged); } From 5bb3bc861470f7fa6c0b5701931e433e5149d60b Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Thu, 18 May 2023 15:06:10 -0400 Subject: [PATCH 08/18] BCD wasn't calculating gradient correctly --- btas/generic/cp_bcd.h | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index f6141439..bff88034 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -140,9 +140,9 @@ namespace btas{ // this stores the factors from rank 0 to #blocks * blocksize // Compute all the BCD of the full blocks auto matlab = true; - auto one_over_tref = 1.0 / norm(tensor_ref); + auto one_over_tref = 1.0 / this->norm(tensor_ref); auto fit = 1.0, change = 0.0; - bool compute_full_fit = true; + bool compute_full_fit = false; for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) { BCD(block_step, block_step + blocksize, max_als, fast_pI, matlab, converge_test); // Test the system to see if converged. Doing the hard way first @@ -156,7 +156,7 @@ namespace btas{ } auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; + auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; change = abs(fit - newfit); fit = newfit; std::cout << block_step + blocksize << "\t"; @@ -176,13 +176,16 @@ namespace btas{ } auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - norm(temp - tensor_ref) * one_over_tref; + auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; change = abs(fit - newfit); fit = newfit; - std::cout << block_step + blocksize << "\t"; + std::cout << block_step + last_blocksize << "\t"; std::cout << fit << "\t" << change << std::endl; } } + epsilon = (compute_full_fit == false ? 
+          this->norm(tensor_ref - reconstruct(blockfactors, order, blockfactors[ndim]))
+          : 1.0 - fit);
       A = blockfactors;
     }
@@ -231,7 +234,7 @@ namespace btas{
       // computed by subtracting the previous blocks from the reference
       size_t count = 0;
       bool is_converged = false;
-      detail::set_norm(converge_test, norm(gradient));
+      detail::set_norm(converge_test, this->norm(gradient));
       do {
         ++count;
         this->num_ALS++;

From d66b529ad6d538302a2a875409a061b5af9af952 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 18 May 2023 15:06:38 -0400
Subject: [PATCH 09/18] Add BCD unit tests

---
 unittest/tensor_cp_test.cc | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/unittest/tensor_cp_test.cc b/unittest/tensor_cp_test.cc
index beb097f4..815c521e 100644
--- a/unittest/tensor_cp_test.cc
+++ b/unittest/tensor_cp_test.cc
@@ -20,6 +20,7 @@ TEST_CASE("CP")
   using conv_class = btas::FitCheck;
   using conv_class_coupled = btas::CoupledFitCheck;
   using btas::CP_ALS;
+  using btas::CP_BCD;
   using btas::CP_RALS;
   using btas::CP_DF_ALS;
   using btas::COUPLED_CP_ALS;
@@ -374,7 +375,28 @@ TEST_CASE("CP")
       double diff = 1.0 - A1.compute_error(conv_coupled, 1e-2, 1, 19);
       CHECK(std::abs(diff - results(38,0)) <= epsilon);
     }
-    */
+    */
+  }
+  // Block coordinate descent test
+  {
+    SECTION("BCD MODE = 3, Finite rank"){
+      CP_BCD A1(D3, 9);
+      conv.set_norm(norm3);
+      double diff = A1.compute_rank_random(13, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+    SECTION("BCD MODE = 4, Finite rank"){
+      CP_BCD A1(D4, 37);
+      conv.set_norm(norm4);
+      double diff = A1.compute_rank_random(55, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+    SECTION("BCD MODE = 5, Finite rank"){
+      CP_BCD A1(D5, 26);
+      conv.set_norm(norm5);
+      double diff = A1.compute_rank_random(60, conv, 100);
+      CHECK(std::abs(diff) <= epsilon);
+    }
+  }
 }
 #endif

From d56085e497838c9b2503980852fb07acc641c6e9 Mon Sep 17 00:00:00 2001
From: kmp5VT
Date: Thu, 25 May 2023 14:17:18 -0400
Subject: [PATCH 10/18] Allow more than one sweep of block coordinates; this
 allows each block to be updated with knowledge of the other blocks

---
 btas/generic/cp_bcd.h | 123 +++++++++++++++++++++++-------------------
 1 file changed, 69 insertions(+), 54 deletions(-)

diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
index bff88034..5819916a 100644
--- a/btas/generic/cp_bcd.h
+++ b/btas/generic/cp_bcd.h
@@ -74,7 +74,7 @@ namespace btas{
     /// that stores the reference tensor.
     /// Reference tensor has no symmetries.
     /// \param[in] tensor the reference tensor to be decomposed.
-    CP_BCD(Tensor &tensor, ind_t block_size = 1) : CP_ALS(tensor), blocksize(block_size){
+    CP_BCD(Tensor &tensor, ind_t block_size = 1, size_t sweeps=1) : CP_ALS(tensor), blocksize(block_size), nsweeps(sweeps){
     }

     /// Create a CP BCD object, child class of the CP object
     /// that stores the reference tensor.
     /// Reference tensor has symmetries.
     /// Symmetries should be set such that the higher mode indices
     /// are set equal to lower mode indices (a 4th-order tensor
     /// where the second & third modes are equal would have
     /// symmetries of {0,1,1,3})
     /// \param[in] tensor the reference tensor to be decomposed.
     /// \param[in] symms the symmetries of the reference tensor.
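// Aside: the `sweeps` parameter threaded through both constructors controls
// how many times the block sequence is revisited. On sweep s > 0 a block's
// previous contribution is first added back into the residual `gradient`,
// then the block is refit, so each block is updated with knowledge of the
// blocks fitted after it.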
- CP_BCD(Tensor &tensor, std::vector &symms, ind_t block_size = 1) - : CP_ALS(tensor, symms), blocksize(block_size) { + CP_BCD(Tensor &tensor, std::vector &symms, ind_t block_size = 1, size_t sweeps=1) + : CP_ALS(tensor, symms), blocksize(block_size), nsweeps(sweeps){ } CP_BCD() = default; @@ -100,6 +100,7 @@ namespace btas{ std::vector blockfactors; std::vector order; // convenient to write order of tensors for reconstruct long z = 0; + size_t nsweeps; /// computed the CP decomposition using ALS to minimize the loss function for fixed rank \p rank /// \param[in] rank The rank of the CP decomposition. @@ -131,8 +132,6 @@ namespace btas{ int n_full_blocks = int( rank / blocksize), last_blocksize = rank % blocksize; - long block_step = 0; - this->AtA = std::vector(ndim); // copying all of A to block factors. Then we are going to take slices of A // that way we can leverage all of `CP_ALS` code without modification blockfactors = A; @@ -143,53 +142,57 @@ namespace btas{ auto one_over_tref = 1.0 / this->norm(tensor_ref); auto fit = 1.0, change = 0.0; bool compute_full_fit = false; - for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) { - BCD(block_step, block_step + blocksize, max_als, fast_pI, matlab, converge_test); - // Test the system to see if converged. Doing the hard way first - // To compute the accuracy fully compute CP approximation using blocks 0 through b. - if(compute_full_fit) { - std::vector current_grads(ndim); - for (size_t i = 0; i < ndim; ++i) { - auto &a_mat = blockfactors[i]; - auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize}; - current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + for(auto s = 0; s < nsweeps; ++s){ + long block_step = 0; + this->AtA = std::vector(ndim); + for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) { + BCD(block_step, block_step + blocksize, max_als, fast_pI, matlab, converge_test, s); + // Test the system to see if converged. Doing the hard way first + // To compute the accuracy fully compute CP approximation using blocks 0 through b. 
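// Aside: block bounds come from integer division of the target rank. The
// "BCD MODE = 3" unit test from patch 09, for example, uses blocksize = 9
// with rank = 13, giving n_full_blocks = 13 / 9 = 1 and
// last_blocksize = 13 % 9 = 4, i.e. column blocks [0, 9) and [9, 13).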
+ if(compute_full_fit) { + std::vector current_grads(ndim); + for (size_t i = 0; i < ndim; ++i) { + auto &a_mat = blockfactors[i]; + auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize}; + current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + } + + auto temp = reconstruct(current_grads, order, blockfactors[ndim]); + auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; + change = abs(fit - newfit); + fit = newfit; + std::cout << block_step + blocksize << "\t"; + std::cout << fit << "\t" << change << std::endl; } - - auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; - change = abs(fit - newfit); - fit = newfit; - std::cout << block_step + blocksize << "\t"; - std::cout << fit << "\t" << change << std::endl; } - } - if(last_blocksize != 0) { - this->AtA = std::vector(ndim); - block_step = n_full_blocks * blocksize; - BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test); - if(compute_full_fit) { - std::vector current_grads(ndim); - for (size_t i = 0; i < ndim; ++i) { - auto &a_mat = blockfactors[i]; - auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + last_blocksize}; - current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + if(last_blocksize != 0) { + this->AtA = std::vector(ndim); + block_step = n_full_blocks * blocksize; + BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s); + if(compute_full_fit) { + std::vector current_grads(ndim); + for (size_t i = 0; i < ndim; ++i) { + auto &a_mat = blockfactors[i]; + auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + last_blocksize}; + current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); + } + + auto temp = reconstruct(current_grads, order, blockfactors[ndim]); + auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; + change = abs(fit - newfit); + fit = newfit; + std::cout << block_step + last_blocksize << "\t"; + std::cout << fit << "\t" << change << std::endl; } - - auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; - change = abs(fit - newfit); - fit = newfit; - std::cout << block_step + last_blocksize << "\t"; - std::cout << fit << "\t" << change << std::endl; } + epsilon = (compute_full_fit == false ? + this->norm(tensor_ref - reconstruct(blockfactors, order, blockfactors[ndim])) + : 1.0 - fit); } - epsilon = (compute_full_fit == false ? - this->norm(tensor_ref - reconstruct(blockfactors, order, blockfactors[ndim])) - : 1.0 - fit); A = blockfactors; } - void copy_blocks(Tensor & to, Tensor & from, ind_t block_start, ind_t block_end){ + void copy_blocks(Tensor & to, const Tensor & from, ind_t block_start, ind_t block_end){ auto tref_dim = to.extent(0); auto to_rank = to.extent(1), from_rank = from.extent(1); @@ -215,7 +218,7 @@ namespace btas{ /// \param[in, out] converge_test Test to see if ALS is converged, holds the value of fit. 
test to see if the ALS is converged void BCD(ind_t block_start, ind_t block_end, size_t max_als, - bool &fast_pI, bool &matlab, ConvClass &converge_test, double lambda = 0.0){ + bool &fast_pI, bool &matlab, ConvClass &converge_test, size_t sweep, double lambda = 0.0){ // Take the b-th block of the factor matrix also take the // find the partial grammian for the blocked factors auto cur_block_size = block_end - block_start; @@ -227,6 +230,14 @@ namespace btas{ } A[ndim] = Tensor(cur_block_size); A[ndim].fill(0.0); + if(sweep != 0){ + size_t c = 0; + auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data(); + for(auto b = block_start; b < block_end; ++b, ++c) + *(new_lam_ptr + c) = *(lam_full_ptr + b); + + gradient += reconstruct(A, order, A[ndim]); + } // Do the ALS loop //CP_ALS::ALS(cur_block_size, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI); // First do it manually, so we know its right @@ -243,20 +254,24 @@ namespace btas{ this->direct(i, cur_block_size, fast_pI, matlab, converge_test, gradient); auto &ai = A[i]; contract(this->one, ai, {1, 2}, ai.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3}); - // Copy the block computed in A to the correct portion in blockfactors - // (this will replace A at the end) - copy_blocks(blockfactors[i], A[i], block_start, block_end); - } - - // Copy the lambda values into the correct location in blockfactors - size_t c = 0; - for (auto b = block_start; b < block_end; ++b, ++c) { - blockfactors[ndim](b) = A[ndim](c); } is_converged = converge_test(A, this->AtA); }while(count < max_als && !is_converged); + // Compute new difference gradient -= reconstruct(A, order, A[ndim]); + + // Copy the block computed in A to the correct portion in blockfactors + // (this will replace A at the end) + for(size_t i = 0; i < ndim; ++i){ + copy_blocks(blockfactors[i], A[i], block_start, block_end); + } + // Copy the lambda values into the correct location in blockfactors + size_t c = 0; + auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data(); + for (auto b = block_start; b < block_end; ++b, ++c) { + *(lam_full_ptr + b) = *(new_lam_ptr + c); + } } }; From c79a7cc503caf39ef8b3e65a894c636cd6bc29ab Mon Sep 17 00:00:00 2001 From: kmp5VT Date: Thu, 25 May 2023 14:17:38 -0400 Subject: [PATCH 11/18] Fix for complex tensors --- btas/generic/cp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/btas/generic/cp.h b/btas/generic/cp.h index 384b6e08..e59be5f8 100644 --- a/btas/generic/cp.h +++ b/btas/generic/cp.h @@ -571,7 +571,7 @@ namespace btas { /// \param[in] Mat Calculates the 2-norm of the matrix mat /// \return the 2-norm. 
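// Aside: for complex element types dot(Mat, Mat) sums x * x rather than
// |x|^2 -- e.g. for x = 3 + 4i, x * x = -7 + 24i while x * conj(x) = 25 --
// so the pre-patch norm() was not a true 2-norm for complex tensors;
// conjugating one argument of the dot product, as done below, fixes this.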
- auto norm(const Tensor &Mat) { return sqrt(abs(dot(Mat, Mat))); } + auto norm(const Tensor &Mat) { return sqrt(abs(dot(Mat, btas::impl::conj(Mat)))); } /// SVD referencing code from /// http://www.netlib.org/lapack/explore-html/de/ddd/lapacke_8h_af31b3cb47f7cc3b9f6541303a2968c9f.html From 132eecbf3e3051f0592b0b4a0ef5b17e7d879f41 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Fri, 26 May 2023 09:56:24 -0400 Subject: [PATCH 12/18] one is const, one is not --- btas/generic/cp_bcd.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index 5819916a..6aaf634b 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -196,7 +196,8 @@ namespace btas{ auto tref_dim = to.extent(0); auto to_rank = to.extent(1), from_rank = from.extent(1); - auto to_ptr = to.data(), from_ptr = from.data(); + auto to_ptr = to.data(); + auto from_ptr = from.data(); ind_t from_pos = 0; for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += to_rank){ for (auto b = block_start; b < block_end; ++b, ++from_pos){ From 2b3a4a0077753e7b76b565cca6a16737b6484849 Mon Sep 17 00:00:00 2001 From: Karl Pierce Date: Tue, 18 Jul 2023 11:53:23 -0400 Subject: [PATCH 13/18] create `compute_full` fit function --- btas/generic/cp_bcd.h | 47 ++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h index 6aaf634b..3cd16841 100644 --- a/btas/generic/cp_bcd.h +++ b/btas/generic/cp_bcd.h @@ -150,19 +150,7 @@ namespace btas{ // Test the system to see if converged. Doing the hard way first // To compute the accuracy fully compute CP approximation using blocks 0 through b. if(compute_full_fit) { - std::vector current_grads(ndim); - for (size_t i = 0; i < ndim; ++i) { - auto &a_mat = blockfactors[i]; - auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize}; - current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); - } - - auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; - change = abs(fit - newfit); - fit = newfit; - std::cout << block_step + blocksize << "\t"; - std::cout << fit << "\t" << change << std::endl; + compute_full(long(blockfactors.extent(0)), block_step + blocksize, one_over_tref, change, fit); } } if(last_blocksize != 0) { @@ -170,24 +158,13 @@ namespace btas{ block_step = n_full_blocks * blocksize; BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s); if(compute_full_fit) { - std::vector current_grads(ndim); - for (size_t i = 0; i < ndim; ++i) { - auto &a_mat = blockfactors[i]; - auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + last_blocksize}; - current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage()); - } - - auto temp = reconstruct(current_grads, order, blockfactors[ndim]); - auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref; - change = abs(fit - newfit); - fit = newfit; - std::cout << block_step + last_blocksize << "\t"; - std::cout << fit << "\t" << change << std::endl; + compute_full(long(blockfactors.extent(0)), block_step + last_blocksize, one_over_tref, change, fit); } } epsilon = (compute_full_fit == false ? 
From 132eecbf3e3051f0592b0b4a0ef5b17e7d879f41 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Fri, 26 May 2023 09:56:24 -0400
Subject: [PATCH 12/18] one is const, one is not

---
 btas/generic/cp_bcd.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
index 5819916a..6aaf634b 100644
--- a/btas/generic/cp_bcd.h
+++ b/btas/generic/cp_bcd.h
@@ -196,7 +196,8 @@ namespace btas{
       auto tref_dim = to.extent(0);
       auto to_rank = to.extent(1), from_rank = from.extent(1);

-      auto to_ptr = to.data(), from_ptr = from.data();
+      auto to_ptr = to.data();
+      auto from_ptr = from.data();
       ind_t from_pos = 0;
       for (ind_t i = 0, skip = 0; i < tref_dim; ++i, skip += to_rank){
         for (auto b = block_start; b < block_end; ++b, ++from_pos){

From 2b3a4a0077753e7b76b565cca6a16737b6484849 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Tue, 18 Jul 2023 11:53:23 -0400
Subject: [PATCH 13/18] create `compute_full` fit function

---
 btas/generic/cp_bcd.h | 47 ++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 27 deletions(-)

diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
index 6aaf634b..3cd16841 100644
--- a/btas/generic/cp_bcd.h
+++ b/btas/generic/cp_bcd.h
@@ -150,19 +150,7 @@ namespace btas{
           // Test the system to see if converged. Doing it the hard way first:
           // to compute the accuracy, fully compute the CP approximation using blocks 0 through b.
           if(compute_full_fit) {
-            std::vector<Tensor> current_grads(ndim);
-            for (size_t i = 0; i < ndim; ++i) {
-              auto &a_mat = blockfactors[i];
-              auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + blocksize};
-              current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage());
-            }
-
-            auto temp = reconstruct(current_grads, order, blockfactors[ndim]);
-            auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref;
-            change = abs(fit - newfit);
-            fit = newfit;
-            std::cout << block_step + blocksize << "\t";
-            std::cout << fit << "\t" << change << std::endl;
+            compute_full(long(blockfactors.extent(0)), block_step + blocksize, one_over_tref, change, fit);
           }
         }
         if(last_blocksize != 0) {
@@ -170,24 +158,13 @@ namespace btas{
           this->AtA = std::vector<Tensor>(ndim);
           block_step = n_full_blocks * blocksize;
           BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s);
           if(compute_full_fit) {
-            std::vector<Tensor> current_grads(ndim);
-            for (size_t i = 0; i < ndim; ++i) {
-              auto &a_mat = blockfactors[i];
-              auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_step + last_blocksize};
-              current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage());
-            }
-
-            auto temp = reconstruct(current_grads, order, blockfactors[ndim]);
-            auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref;
-            change = abs(fit - newfit);
-            fit = newfit;
-            std::cout << block_step + last_blocksize << "\t";
-            std::cout << fit << "\t" << change << std::endl;
+            compute_full(long(blockfactors.extent(0)), block_step + last_blocksize, one_over_tref, change, fit);
          }
        }
        epsilon = (compute_full_fit == false ?
-                      this->norm(tensor_ref - reconstruct(blockfactors, order, blockfactors[ndim]))
+                      this->norm(gradient) * one_over_tref
                       : 1.0 - fit);
+       std::cout << epsilon << std::endl;
     }
     A = blockfactors;
   }
@@ -205,6 +182,22 @@ namespace btas{
         }
       }
     }
+
+    void compute_full(long extent, long block_end, T one_over_tref, T & change, T& fit){
+      std::vector<Tensor> current_grads(ndim);
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = blockfactors[i];
+        auto lower = {z, z}, upper = {extent, block_end};
+        current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage());
+      }
+
+      auto temp = reconstruct(current_grads, order, blockfactors[ndim]);
+      auto newfit = 1.0 - this->norm(temp - tensor_ref) * one_over_tref;
+      change = abs(fit - newfit);
+      fit = newfit;
+      std::cout << block_end << "\t";
+      std::cout << fit << "\t" << change << std::endl;
+    }

     /// Computes an optimized factor matrix holding all others constant.
     /// No Khatri-Rao product computed, immediate contraction
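Two notes on the pair of patches above. First, the one-line split in PATCH 12 is forced by C++ deduction rules: a single `auto` declarator list must deduce one common type, but `to.data()` on a non-const reference yields `T*` while `from.data()` on a `const` reference yields `const T*`. A minimal standalone reproduction, with `std::vector` standing in for `btas::Tensor` and hypothetical names:

```cpp
#include <vector>

void copy_example(std::vector<double> &to, const std::vector<double> &from) {
  // Ill-formed: one auto declarator list must deduce a single type,
  // but to.data() is double* while from.data() is const double*.
  // auto to_ptr = to.data(), from_ptr = from.data();

  auto to_ptr = to.data();      // deduces double*
  auto from_ptr = from.data();  // deduces const double*
  to_ptr[0] = from_ptr[0];      // writes only through the non-const side
}
```

Second, `compute_full` amounts to "reconstruct from the leading `block_end` columns of every factor and measure the fit". An order-2 sketch of that bookkeeping with plain arrays instead of btas views (hypothetical helper, row-major layout assumed):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// Toy analogue of compute_full for a matrix T ~ A diag(lambda) B^T:
// only the first block_end of the R rank-one terms participate, exactly
// like slicing each factor matrix to columns [0, block_end).
double partial_fit(const std::vector<double> &T, std::size_t I, std::size_t J,
                   const std::vector<double> &A, const std::vector<double> &B,
                   const std::vector<double> &lambda, std::size_t R,
                   std::size_t block_end) {
  double err2 = 0.0, ref2 = 0.0;
  for (std::size_t i = 0; i < I; ++i)
    for (std::size_t j = 0; j < J; ++j) {
      double model = 0.0;
      for (std::size_t r = 0; r < block_end; ++r)
        model += lambda[r] * A[i * R + r] * B[j * R + r];
      double t = T[i * J + j];
      err2 += (t - model) * (t - model);  // ||T - T_approx||^2, accumulated
      ref2 += t * t;                      // ||T||^2
    }
  return 1.0 - std::sqrt(err2) / std::sqrt(ref2);  // the "fit" being printed
}
```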
From dbd76aa9a46ac734564ab1a951cf632d510320eb Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Thu, 17 Aug 2023 12:31:42 -0400
Subject: [PATCH 14/18] Add ability to use different block sizes

---
 btas/generic/cp_bcd.h | 77 ++++++++++++++++++++++++++++++++-----------
 1 file changed, 58 insertions(+), 19 deletions(-)

diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
index 3cd16841..b2826c4d 100644
--- a/btas/generic/cp_bcd.h
+++ b/btas/generic/cp_bcd.h
@@ -74,7 +74,10 @@ namespace btas{
     /// that stores the reference tensor.
     /// Reference tensor has no symmetries.
     /// \param[in] tensor the reference tensor to be decomposed.
-    CP_BCD(Tensor &tensor, ind_t block_size = 1, size_t sweeps=1) : CP_ALS(tensor), blocksize(block_size), nsweeps(sweeps){
+    CP_BCD(Tensor &tensor, ind_t block_size = 1, size_t sweeps=1, std::vector<size_t> bs = std::vector<size_t>()) : CP_ALS(tensor), blocksize(block_size), nsweeps(sweeps){
+      if(!bs.empty()){
+        block_sizes = bs;
+      }
     }

     /// Create a CP BCD object, child class of the CP object
@@ -94,6 +97,11 @@ namespace btas{

     ~CP_BCD() = default;

+    void set_block_sizes(size_t rank, std::vector<size_t> bs){
+      BTAS_ASSERT(bs[bs.size() - 1] == rank);
+      block_sizes = bs;
+      return;
+    }
   protected:
     Tensor gradient, block_ref;        // Stores gradient of BCD for fitting.
     ind_t blocksize;                   // Size of block gradient
     std::vector<Tensor> blockfactors;
     std::vector<size_t> order;         // convenient to write order of tensors for reconstruct
     long z = 0;
     size_t nsweeps;
+    std::vector<size_t> block_sizes;
@@ -128,10 +137,19 @@ namespace btas{
       block_ref = tensor_ref;

       // Number of full blocks with rank blocksize
-      // plus one more block with rank rank % blocksize
+      // plus one more block with rank: rank % blocksize
       int n_full_blocks = int( rank / blocksize),
          last_blocksize = rank % blocksize;

+      if(block_sizes.empty()) {
+        size_t index = 0;
+        block_sizes.emplace_back(index);
+        for(size_t i = 0; i < n_full_blocks; ++i) {
+          index += blocksize;
+          block_sizes.emplace_back(index);
+        }
+        if(last_blocksize != 0) block_sizes.emplace_back(index + last_blocksize);
+      }
       // copying all of A to block factors. Then we are going to take slices of A
       // that way we can leverage all of `CP_ALS` code without modification
       blockfactors = A;
@@ -142,31 +160,49 @@ namespace btas{
       auto one_over_tref = 1.0 / this->norm(tensor_ref);
       auto fit = 1.0, change = 0.0;
       bool compute_full_fit = false;
+      size_t n_blocks = block_sizes.size();
       for(auto s = 0; s < nsweeps; ++s){
         long block_step = 0;
         this->AtA = std::vector<Tensor>(ndim);
-        for (long b = 0; b < n_full_blocks; ++b, block_step += blocksize) {
-          BCD(block_step, block_step + blocksize, max_als, fast_pI, matlab, converge_test, s);
+
+        // E - A1 - A2 - ... = gradient
+        // initial : gradient = E
+        // gradient = E - A1
+        for (long b = 1; b < n_blocks; ++b) {
+          std::cout << "\tnorm (remainder) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+          this->AtA = std::vector<Tensor>(ndim);
+          BCD(block_sizes[b - 1], block_sizes[b], max_als, fast_pI, matlab, converge_test, s);
           // Test the system to see if converged. Doing it the hard way first:
           // to compute the accuracy, fully compute the CP approximation using blocks 0 through b.
           if(compute_full_fit) {
-            compute_full(long(blockfactors.extent(0)), block_step + blocksize, one_over_tref, change, fit);
+            compute_full(block_step + blocksize, one_over_tref, change, fit);
           }
         }
-        if(last_blocksize != 0) {
-          this->AtA = std::vector<Tensor>(ndim);
-          block_step = n_full_blocks * blocksize;
-          BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s);
-          if(compute_full_fit) {
-            compute_full(block_step + last_blocksize, one_over_tref, change, fit);
-          }
-        }
+//        if(last_blocksize != 0) {
+//          std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+//          this->AtA = std::vector<Tensor>(ndim);
+//          block_step = n_full_blocks * blocksize;
+//          BCD(block_step, block_step + last_blocksize, max_als, fast_pI, matlab, converge_test, s);
+//          if(compute_full_fit) {
+//            compute_full(block_step + last_blocksize, one_over_tref, change, fit);
+//          }
+//        }
+        std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
         epsilon = (compute_full_fit == false ?
-                       this->norm(gradient) * one_over_tref
-                       : 1.0 - fit);
-        std::cout << epsilon << std::endl;
+                       1.0 - this->norm(gradient) * one_over_tref
+                       : fit);
+        std::cout << epsilon << "\n" << std::endl;
       }
+      detail::set_norm(converge_test, this->norm(tensor_ref));
+      converge_test.verbose(true);
       A = blockfactors;
+      this->AtA = std::vector<Tensor>(ndim);
+      for (size_t i = 0; i < ndim; ++i) {
+        auto &a_mat = A[i];
+        contract(this->one, a_mat, {1, 2}, a_mat.conj(), {1, 3}, this->zero, this->AtA[i], {2, 3});
+      }
+      gradient = reconstruct(A, order, A[ndim]);
+      std::cout << norm(tensor_ref - gradient) * one_over_tref << std::endl;
     }

     void copy_blocks(Tensor & to, const Tensor & from, ind_t block_start, ind_t block_end){
@@ -219,11 +219,11 @@ namespace btas{
       }
     }

-    void compute_full(long extent, long block_end, T one_over_tref, T & change, T& fit){
+    void compute_full(long block_end, T one_over_tref, T & change, T& fit){
       std::vector<Tensor> current_grads(ndim);
       for (size_t i = 0; i < ndim; ++i) {
         auto &a_mat = blockfactors[i];
-        auto lower = {z, z}, upper = {extent, block_end};
+        auto lower = {z, z}, upper = {long(a_mat.extent(0)), block_end};
         current_grads[i] = make_view(a_mat.range().slice(lower, upper), a_mat.storage());
       }
@@ -229,9 +265,11 @@ namespace btas{
           auto lam_full_ptr = blockfactors[ndim].data(), new_lam_ptr = A[ndim].data();
           for(auto b = block_start; b < block_end; ++b, ++c)
             *(new_lam_ptr + c) = *(lam_full_ptr + b);
-
+
           gradient += reconstruct(A, order, A[ndim]);
         }
+        std::cout << "\t\tnorm(remainder + hat{T}_block) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
         // Do the ALS loop
         //CP_ALS::ALS(cur_block_size, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI);
         // First do it manually, so we know it's right
@@ -278,6 +280,7 @@ namespace btas{
         size_t count = 0;
         bool is_converged = false;
         detail::set_norm(converge_test, this->norm(gradient));
+        converge_test.verbose(true);
         do {
           ++count;
           this->num_ALS++;
From 4a1304cea6677255dc6f70705469b8c387713ccc Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Tue, 22 Aug 2023 12:10:50 -0400
Subject: [PATCH 15/18] update BCD printing

---
 btas/generic/cp_bcd.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/btas/generic/cp_bcd.h b/btas/generic/cp_bcd.h
index b2826c4d..afa69ed0 100644
--- a/btas/generic/cp_bcd.h
+++ b/btas/generic/cp_bcd.h
@@ -169,9 +169,9 @@ namespace btas{
         // initial : gradient = E
         // gradient = E - A1
         for (long b = 1; b < n_blocks; ++b) {
-          std::cout << "\tnorm (remainder) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
           this->AtA = std::vector<Tensor>(ndim);
           BCD(block_sizes[b - 1], block_sizes[b], max_als, fast_pI, matlab, converge_test, s);
+          std::cout << "\tnorm (remainder) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
           // Test the system to see if converged. Doing it the hard way first:
           // to compute the accuracy, fully compute the CP approximation using blocks 0 through b.
           if(compute_full_fit) {
@@ -187,7 +187,7 @@ namespace btas{
 //          compute_full(block_step + last_blocksize, one_over_tref, change, fit);
 //        }
 //      }
-        std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+        //std::cout << "\t\tnorm grad / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
         epsilon = (compute_full_fit == false ?
                        1.0 - this->norm(gradient) * one_over_tref
                        : fit);
@@ -269,7 +269,7 @@ namespace btas{
           gradient += reconstruct(A, order, A[ndim]);
         }
-        std::cout << "\t\tnorm(remainder + hat{T}_block) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
+        //std::cout << "\t\tnorm(remainder + hat{T}_block) / tref: " << norm(gradient) / norm(tensor_ref) << std::endl;
         // Do the ALS loop
         //CP_ALS::ALS(cur_block_size, converge_test, dir, max_als, calculate_epsilon, epsilon, fast_pI);
         // First do it manually, so we know it's right
@@ -278,7 +278,7 @@ namespace btas{
         size_t count = 0;
         bool is_converged = false;
         detail::set_norm(converge_test, this->norm(gradient));
-        converge_test.verbose(true);
+        converge_test.verbose(false);
         do {
           ++count;
           this->num_ALS++;
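The sweep loop above is driven by `block_sizes`, the cumulative boundary list introduced in PATCH 14: entries `b-1` and `b` delimit the factor-matrix columns handed to `BCD`. A sketch of the default schedule the constructor builds when none is supplied, written as a hypothetical free function that mirrors the patch's own loop:

```cpp
#include <cstddef>
#include <vector>

// block_sizes stores cumulative column boundaries, not widths:
// {0, 4, 8, 10} means block 1 spans columns [0,4), block 2 [4,8),
// and block 3 [8,10) of every factor matrix, for a CP rank of 10.
std::vector<std::size_t> default_block_bounds(std::size_t rank, std::size_t blocksize) {
  std::vector<std::size_t> bounds{0};
  std::size_t index = 0;
  for (std::size_t i = 0; i < rank / blocksize; ++i) {
    index += blocksize;
    bounds.push_back(index);  // one boundary per full block
  }
  if (rank % blocksize != 0)
    bounds.push_back(index + rank % blocksize);  // trailing partial block
  return bounds;  // last entry equals rank, matching the set_block_sizes assert
}
```

With `rank = 10` and `blocksize = 4` this yields `{0, 4, 8, 10}`. A hand-built schedule such as `{0, 1, 2, 4, 8, 10}` grows the block width geometrically instead; `set_block_sizes` only checks that the final boundary equals the CP rank.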
From 8bcee70b4cf1bf3d41c3d62122ddc9188014f879 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Tue, 9 Jan 2024 20:04:32 -0500
Subject: [PATCH 16/18] Goal here is to try to make a new CP using
 interpolative decomposition

---
 btas/btas.h                      |   1 +
 btas/generic/cp_als.h            |  82 ++++++++++++++++++++++++++++++
 btas/generic/lapack_extensions.h |  26 ++++++++++
 3 files changed, 109 insertions(+)

diff --git a/btas/btas.h b/btas/btas.h
index 095fd517..9c4fac5f 100644
--- a/btas/btas.h
+++ b/btas/btas.h
@@ -9,6 +9,7 @@
 #include
 #include
+//#include
 #include
 #include
 #include

diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index 12bba3f9..4bc0fbc7 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -339,6 +339,7 @@ namespace btas {
     Tensor &tensor_ref;        // Tensor to be decomposed
     ord_t size;                // Total number of elements
     bool factors_set = false;  // Are the factors preset (not implemented yet).
+    std::vector<std::vector<int64_t>> pivs;

     /// Creates an initial guess by computing the SVD of each mode
     /// If the rank of the mode is smaller than the CP rank requested
@@ -659,6 +660,87 @@ namespace btas {
       Tensor temp(A[n].extent(0), rank);
       Tensor an(A[n].range());

+      // Testing the code to see if pivoted QR can help
+      if(false){
+        // First create a Pivot matrix from the flattened tensor_ref
+        auto f = flatten(tensor_ref, n);
+        auto square_dim = f.extent(0), full = f.extent(1);
+        auto scale_factor = double(full) / double(square_dim);
+        auto extended = (scale_factor > 2.0 ? square_dim * (scale_factor - 1.0) : full);
+        std::vector<int64_t> piv;
+        if(pivs.size() < (n + 1)) {
+          piv = std::vector<int64_t>(f.extent(1));
+          std::vector<T> tau(f.extent(1));
+          btas::geqp3_pivot(blas::Layout::RowMajor, f.extent(0), f.extent(1), f.data(), f.extent(1), piv.data(),
+                            tau.data());
+          Tensor R(full, square_dim);
+          R.fill(0.0);
+          for(auto i = 0; i < square_dim; ++i) {
+            for (auto j = i; j < square_dim; ++j) {
+
+            }
+          }
+          f = flatten(tensor_ref, n);
+          pivs.emplace_back(piv);
+        } else {
+          piv = pivs[n];
+        }
+
+        auto K = this->generate_KRP(n, rank, true);
+        // For reference, compute the matricized tensor times the Khatri-Rao product
+        gemm(blas::Op::NoTrans, blas::Op::NoTrans, this->one, f, K, this->zero, temp);
+        detail::set_MtKRP(converge_test, temp);
+
+        Tensor Fp(square_dim, square_dim);
+        {
+          Tensor t(square_dim, full);
+          for (auto i = 0; i < square_dim; ++i) {
+            for (auto j = 0; j < full; ++j) {
+              int v = pivs[n][j];
+              t(i, j) = f(i, v);
+            }
+          }
+
+          for (auto i = 0; i < square_dim; ++i) {
+            for (auto j = 0; j < square_dim; ++j) {
+              Fp(i, j) = t(i, j);
+            }
+          }
+        }
+
+        Tensor Kp(square_dim, rank);
+        {
+          Tensor t(full, rank);
+          for(auto j = 0; j < full; ++j) {
+            int v = pivs[n][j];
+            for (auto r = 0; r < rank; ++r) {
+              t(j, r) = K(v, r);
+            }
+          }
+
+          for(auto j = 0; j < square_dim; ++j) {
+            for (auto r = 0; r < rank; ++r) {
+              Kp(j, r) = t(j, r);
+            }
+          }
+        }
+
+        // contract the product from above with the pseudoinverse of the sampled
+        // KRP to produce an optimized factor matrix
+        fast_pI = false;
+        auto pInv = pseudoInverse(Kp, fast_pI);
+
+        Tensor t;
+        gemm(blas::Op::NoTrans, blas::Op::NoTrans, this->one, Fp, pInv, this->zero, temp);
+
+        // compute the difference between this new factor matrix and the previous
+        // iteration
+        this->normCol(temp);
+
+        // Replace the old factor matrix with the new optimized result
+        A[n] = temp;
+        return;
+      }
 #ifdef BTAS_HAS_INTEL_MKL

       // Computes the Khatri-Rao product intermediate

diff --git a/btas/generic/lapack_extensions.h b/btas/generic/lapack_extensions.h
index cb1560e6..bc4ecea0 100644
--- a/btas/generic/lapack_extensions.h
+++ b/btas/generic/lapack_extensions.h
@@ -40,6 +40,32 @@ int64_t getrf( blas::Layout order, int64_t M, int64_t N, T* A, int64_t LDA,

 }

+template <typename T, typename Alloc = std::allocator<T>>
+int64_t geqp3_pivot( blas::Layout order, int64_t M, int64_t N, T* A, int64_t LDA,
+                     int64_t* IPIV, T* tau, Alloc alloc = Alloc() ) {
+
+  //std::cout << "IN GEQP3 IMPL" << std::endl;
+  if( order == blas::Layout::ColMajor ) {
+    return lapack::geqp3( M, N, A, LDA, IPIV, tau);
+  } else {
+
+    // Transpose input
+    auto* A_transpose = alloc.allocate(M*N);
+    transpose( N, M, A, LDA, A_transpose, M );
+
+    // A -> QR (column-pivoted)
+    auto info = lapack::geqp3( M, N, A_transpose, M, IPIV, tau);
+
+    // Transpose output + cleanup
+    if(!info)
+      transpose( M, N, A_transpose, M, A, LDA );
+    alloc.deallocate( A_transpose, M*N );
+
+    return info;
+  }
+
+}
+
 template <typename T, typename Alloc = std::allocator<T>,
           typename IntAlloc = std::allocator<int64_t>>
 int64_t gesv( blas::Layout order, int64_t N, int64_t NRHS, T* A, int64_t LDA,
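The testing block in PATCH 16 uses column-pivoted QR (`lapack::geqp3`, reached through the new `geqp3_pivot` wrapper) to rank the columns of the flattened tensor by how much independent information they carry, which is the core of an interpolative decomposition. The LAPACK routine does the real work; the toy below sketches only the selection rule it embodies (greedily take the column with the largest remaining norm, then orthogonalize the rest against it). It is a conceptual stand-in with hypothetical names, not a replacement for `geqp3`:

```cpp
#include <cmath>
#include <cstddef>
#include <numeric>
#include <utility>
#include <vector>

// Greedy sketch of geqp3's column selection. M is m x n, row-major.
std::vector<std::size_t> pivot_order(std::vector<double> M, std::size_t m, std::size_t n) {
  std::vector<std::size_t> piv(n);
  std::iota(piv.begin(), piv.end(), 0);
  auto at = [&](std::size_t i, std::size_t j) -> double & { return M[i * n + j]; };
  for (std::size_t k = 0; k < n && k < m; ++k) {
    // choose the remaining column with the largest 2-norm
    std::size_t best = k;
    double best2 = -1.0;
    for (std::size_t j = k; j < n; ++j) {
      double s = 0.0;
      for (std::size_t i = 0; i < m; ++i) s += at(i, j) * at(i, j);
      if (s > best2) { best2 = s; best = j; }
    }
    std::swap(piv[k], piv[best]);
    for (std::size_t i = 0; i < m; ++i) std::swap(at(i, k), at(i, best));
    if (best2 <= 0.0) break;  // remaining columns are (numerically) zero
    // remove the chosen direction from the trailing columns (Gram-Schmidt step)
    for (std::size_t j = k + 1; j < n; ++j) {
      double proj = 0.0;
      for (std::size_t i = 0; i < m; ++i) proj += at(i, j) * at(i, k);
      proj /= best2;  // <a_j, a_k> / <a_k, a_k>
      for (std::size_t i = 0; i < m; ++i) at(i, j) -= proj * at(i, k);
    }
  }
  return piv;
}
```

`geqp3` computes A P = Q R with the diagonal of R non-increasing in magnitude, so the leading pivots index the columns that best span the matrix; the greedy rule above mirrors that ordering on small, well-conditioned inputs. In the patch those pivots select which columns of the flattened tensor (`Fp`) and which rows of the Khatri-Rao product (`Kp`) enter the reduced least-squares solve.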
From fdb1b8d8f694dd75a24100ecc50373fe15a61250 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Wed, 24 Jan 2024 12:21:19 -0500
Subject: [PATCH 17/18] Fix testing area

---
 btas/generic/cp_als.h | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/btas/generic/cp_als.h b/btas/generic/cp_als.h
index f66dd324..b10fd6af 100644
--- a/btas/generic/cp_als.h
+++ b/btas/generic/cp_als.h
@@ -661,25 +661,20 @@ namespace btas {
       Tensor an(A[n].range());

       // Testing the code to see if pivoted QR can help
-      if(false){
+      if (false) {
         // First create a Pivot matrix from the flattened tensor_ref
         auto f = flatten(tensor_ref, n);
         auto square_dim = f.extent(0), full = f.extent(1);
         auto scale_factor = double(full) / double(square_dim);
         auto extended = (scale_factor > 2.0 ? square_dim * (scale_factor - 1.0) : full);
         std::vector<int64_t> piv;
-        if(pivs.size() < (n + 1)) {
+        if (pivs.size() < (n + 1)) {
           piv = std::vector<int64_t>(f.extent(1));
           std::vector<T> tau(f.extent(1));
           btas::geqp3_pivot(blas::Layout::RowMajor, f.extent(0), f.extent(1), f.data(), f.extent(1), piv.data(),
                             tau.data());
           Tensor R(full, square_dim);
           R.fill(0.0);
-          for(auto i = 0; i < square_dim; ++i) {
-            for (auto j = i; j < square_dim; ++j) {
-
-            }
-          }
           f = flatten(tensor_ref, n);
           pivs.emplace_back(piv);
         } else {
@@ -711,14 +706,14 @@ namespace btas {
       Tensor Kp(square_dim, rank);
       {
         Tensor t(full, rank);
-        for(auto j = 0; j < full; ++j) {
+        for (auto j = 0; j < full; ++j) {
           int v = pivs[n][j];
           for (auto r = 0; r < rank; ++r) {
             t(j, r) = K(v, r);
           }
         }

-        for(auto j = 0; j < square_dim; ++j) {
+        for (auto j = 0; j < square_dim; ++j) {
           for (auto r = 0; r < rank; ++r) {
             Kp(j, r) = t(j, r);
           }

From 8007360c90eb28193a46cc03af64b88844172216 Mon Sep 17 00:00:00 2001
From: Karl Pierce
Date: Tue, 30 Jan 2024 14:42:53 -0500
Subject: [PATCH 18/18] Bump VG tag

---
 external/versions.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/external/versions.cmake b/external/versions.cmake
index e738e91b..caa3e428 100644
--- a/external/versions.cmake
+++ b/external/versions.cmake
@@ -1,4 +1,4 @@
-set(BTAS_TRACKED_VGCMAKEKIT_TAG e0d04e91a84b7e71d9b87682c46c518e9966bd78)
+set(BTAS_TRACKED_VGCMAKEKIT_TAG 9b541a5f3708a58dce59d00f3c47ac030ef4d8b4)

 # likely can use earlier, but
 # - as of oct 2023 tested with 1.71 and up only