From 324411a3e033c7f05b77e3bff829b55d674438da Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Fri, 27 Mar 2020 20:15:30 +0100 Subject: [PATCH 01/10] Optimized TNV routine (10x performance, 1/3 memcory consumption) --- CMakeLists.txt | 1 + cmake/FindGLIB2.cmake | 54 + src/Core/CMakeLists.txt | 19 +- src/Core/regularisers_CPU/TNV_core.c | 948 +++++++++++------- .../regularisers_CPU/TNV_core_backtrack.c | 695 +++++++++++++ src/Core/regularisers_CPU/hw_sched.c | 396 ++++++++ src/Core/regularisers_CPU/hw_sched.h | 151 +++ src/Core/regularisers_CPU/hw_thread.c | 141 +++ src/Core/regularisers_CPU/hw_thread.h | 76 ++ 9 files changed, 2097 insertions(+), 384 deletions(-) create mode 100644 cmake/FindGLIB2.cmake create mode 100755 src/Core/regularisers_CPU/TNV_core_backtrack.c create mode 100644 src/Core/regularisers_CPU/hw_sched.c create mode 100644 src/Core/regularisers_CPU/hw_sched.h create mode 100644 src/Core/regularisers_CPU/hw_thread.c create mode 100644 src/Core/regularisers_CPU/hw_thread.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 043f13c5..afbff2da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ # limitations under the License. cmake_minimum_required (VERSION 3.0) +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) project(RGL) #https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake new file mode 100644 index 00000000..2940d9f2 --- /dev/null +++ b/cmake/FindGLIB2.cmake @@ -0,0 +1,54 @@ +# - Try to find the GLIB2 libraries +# Once done this will define +# +# GLIB2_FOUND - system has glib2 +# GLIB2_DIR - path to the glib2 base directory +# GLIB2_INCLUDE_DIR - the glib2 include directory +# GLIB2_LIBRARIES - glib2 library + +set(GLIB2_DIR GLIB2_DIR-NOTFOUND CACHE PATH "Location of GLIB2 package") + +if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + # Already in cache, be silent + set(GLIB2_FIND_QUIETLY TRUE) +endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + +if (GLIB2_DIR) + set(PKG_GLIB_LIBRARY_DIRS ${GLIB2_DIR}/lib${CMAKE_BUILD_ARCH} ${GLIB2_DIR}/lib) + set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/) +else (GLIB2_DIR) + if (NOT WIN32) + find_package(PkgConfig REQUIRED) + pkg_check_modules(PKG_GLIB REQUIRED glib-2.0) + endif(NOT WIN32) +endif (GLIB2_DIR) + +find_path(GLIB2_MAIN_INCLUDE_DIR glib.h + PATH_SUFFIXES glib-2.0 + PATHS ${PKG_GLIB_INCLUDE_DIRS} ) + +# search the glibconfig.h include dir under the same root where the library is found +find_library(GLIB2_LIBRARIES + NAMES glib-2.0 + PATHS ${PKG_GLIB_LIBRARY_DIRS} ) + +find_library(GTHREAD2_LIBRARIES + NAMES gthread-2.0 + PATHS ${PKG_GLIB_LIBRARY_DIRS} ) + +find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h + PATH_SUFFIXES glib-2.0/include + PATHS ${PKG_GLIB_INCLUDE_DIRS} ${PKG_GLIB_LIBRARY_DIRS} ${CMAKE_SYSTEM_LIBRARY_PATH}) + +set(GLIB2_INCLUDE_DIR ${GLIB2_MAIN_INCLUDE_DIR}) + +# not sure if this include dir is optional or required +# for now it is optional +if(GLIB2_INTERNAL_INCLUDE_DIR) + set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} ${GLIB2_INTERNAL_INCLUDE_DIR}) +endif(GLIB2_INTERNAL_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GLIB2 DEFAULT_MSG GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR) + +mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_INTERNAL_INCLUDE_DIR GLIB2_MAIN_INCLUDE_DIR) diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 4e43b5e6..10b16f30 100644 --- a/src/Core/CMakeLists.txt +++ 
b/src/Core/CMakeLists.txt @@ -15,6 +15,7 @@ message("CIL_VERSION ${CIL_VERSION}") ## Build the regularisers package as a library message("Creating Regularisers as a shared library") +find_package(GLIB2 REQUIRED) find_package(OpenMP REQUIRED) if (OPENMP_FOUND) set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") @@ -24,8 +25,8 @@ if (OPENMP_FOUND) set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") endif() -message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") -message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer") message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}") message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}") message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") @@ -58,10 +59,17 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") ## Build the regularisers package as a library message("Adding regularisers as a shared library") -#set(CMAKE_C_COMPILER /apps/pgi/linux86-64/17.4/bin/pgcc) +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") +set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") #set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC") add_library(cilreg SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c @@ -78,8 +86,9 @@ add_library(cilreg SHARED ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c ) -target_link_libraries(cilreg ${EXTRA_LIBRARIES} ) +target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} ) include_directories(cilreg PUBLIC + ${GLIB2_INCLUDE_DIR} ${LIBRARY_INC}/include ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ @@ -105,7 +114,7 @@ endif() # GPU Regularisers -if (BUILD_CUDA) +if (BUILD_CUDA1) find_package(CUDA) if (CUDA_FOUND) set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index 753cc5f3..be7fdef7 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,6 +5,12 @@ * * Copyright 2017 Daniil Kazantsev * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/3 of memory consumption and ~10x performance + * This version is not able to perform back-track except during first iterations + * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c) + * could be executed instead. 
It still better than original with 1/2 of memory consumption and 4x performance gain * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +25,467 @@ #include "TNV_core.h" +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *qx, *qy, *gradx, *grady, *div; + float *div0, *udiff0, *udiff; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + free(ctx->udiff); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i 
%i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*(stepY+1)*padZ); + long DimRow = (long)(dimX * padZ); + + // Auxiliar vectors + ctx->Input = malloc(Dim1Total * sizeof(float)); + ctx->u = malloc(Dim1Total * sizeof(float)); + ctx->qx = malloc(DimTotal * sizeof(float)); + ctx->qy = malloc(DimTotal * sizeof(float)); + ctx->gradx = malloc(DimTotal * sizeof(float)); + ctx->grady = malloc(DimTotal * sizeof(float)); + ctx->div = malloc(Dim1Total * sizeof(float)); + + ctx->div0 = malloc(DimRow * sizeof(float)); + ctx->udiff0 = malloc(DimRow * sizeof(float)); + ctx->udiff = malloc(DimRow * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { + fprintf(stderr, "Error allocating memory\n"); + exit(-1); + } + + return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + +// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(float)); + memset(ctx->qx, 0, DimTotal * sizeof(float)); + memset(ctx->qy, 0, DimTotal * sizeof(float)); + memset(ctx->gradx, 0, DimTotal * sizeof(float)); + memset(ctx->grady, 0, DimTotal * sizeof(float)); + memset(ctx->div, 0, Dim1Total * sizeof(float)); + + for(k=0; kInput[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; + ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; + } + } + } + + return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + for(k=0; kuT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; + } + } + } + + return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(float)); + memset(ctx->qx, 0, DimTotal * sizeof(float)); + memset(ctx->qy, 0, DimTotal * sizeof(float)); + memset(ctx->gradx, 0, DimTotal * sizeof(float)); + memset(ctx->grady, 0, DimTotal * sizeof(float)); + memset(ctx->div, 0, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int 
dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + float ubarx[dimZ]; + float ubary[dimZ]; + float udiff_next[dimZ]; + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + float udiff = u[l] - u_upd; + ctx->udiff[l] = udiff; + ctx->udiff0[l] = udiff; + ctx->div0[l] = div[l]; + u[l] = u_upd; + } + } + + for(j = 0; j < stepY; j++) { + for(i = 0; i < dimX; i++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + udiff_next[k] = u[l + k + m] - u_upd; + u[l + k + m] = u_upd; + + float gradx_upd = (i == (dimX - 1))?0:(u[l + k + padZ] - u[l + k]); + float grady_upd = (j == (copY - 1))?0:(u[l + k + m] - u[l + k]); + gradxdiff[k] = gradx[l + k] - gradx_upd; + gradydiff[k] = grady[l + k] - grady_upd; + gradx[l + k] = gradx_upd; + grady[l + k] = grady_upd; + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; + ubary[k] = grady_upd - theta * gradydiff[k]; + + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx[l + k] += qxdiff; + qy[l + k] += qydiff; + + float udiff = ctx->udiff[i * padZ + k]; + ctx->udiff[i * padZ + k] = udiff_next[k]; + unorm += (udiff * udiff); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + div_upd -= (i > 0)?qx[l + k - padZ]:0; + div_upd -= (j > 0)?qy[l + k - m]:0; + div_upd += (i < (dimX-1))?qx[l + k]:0; + div_upd += (j < (copY-1))?qy[l + k]:0; + divdiff = div[l + k] - div_upd; + div[l + k] = div_upd; + + resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff + divdiff):0; + resdual += fabs(divsigma * qxdiff + gradxdiff[k]); + resdual += fabs(divsigma * qydiff + gradydiff[k]); + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + + } // i + } // j + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = 
dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + tnv_ctx.padZ = dimZ; + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + /* * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] * The code is modified from the implementation by Joan Duran see @@ -32,50 +499,25 @@ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * * Output: - * 1. Filtered/regularized image + * 1. Filtered/regularized image (u) * * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. */ -float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) { - long k, p, q, r, DimTotal; - float taulambda; - float *u_upd, *gx, *gy, *gx_upd, *gy_upd, *qx, *qy, *qx_upd, *qy_upd, *v, *vx, *vy, *gradx, *grady, *gradx_upd, *grady_upd, *gradx_ubar, *grady_ubar, *div, *div_upd; - - p = 1l; - q = 1l; - r = 0l; - + int err; + int iter; + int i,j,k,l,m; + lambda = 1.0f/(2.0f*lambda); - DimTotal = (long)(dimX*dimY*dimZ); - /* PDHG algorithm parameters*/ + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters float tau = 0.5f; float sigma = 0.5f; float theta = 1.0f; - - // Auxiliar vectors - u_upd = calloc(DimTotal, sizeof(float)); - gx = calloc(DimTotal, sizeof(float)); - gy = calloc(DimTotal, sizeof(float)); - gx_upd = calloc(DimTotal, sizeof(float)); - gy_upd = calloc(DimTotal, sizeof(float)); - qx = calloc(DimTotal, sizeof(float)); - qy = calloc(DimTotal, sizeof(float)); - qx_upd = calloc(DimTotal, sizeof(float)); - qy_upd = calloc(DimTotal, sizeof(float)); - v = calloc(DimTotal, sizeof(float)); - vx = calloc(DimTotal, sizeof(float)); - vy = calloc(DimTotal, sizeof(float)); - gradx = calloc(DimTotal, sizeof(float)); - grady = calloc(DimTotal, sizeof(float)); - gradx_upd = calloc(DimTotal, sizeof(float)); - grady_upd = calloc(DimTotal, sizeof(float)); - gradx_ubar = calloc(DimTotal, sizeof(float)); - grady_ubar = calloc(DimTotal, sizeof(float)); - div = calloc(DimTotal, sizeof(float)); - div_upd = calloc(DimTotal, sizeof(float)); - + // Backtracking parameters float s = 1.0f; float gamma = 0.75f; @@ -84,369 +526,117 @@ float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, float alpha = alpha0; float delta = 1.5f; float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; - // PDHG algorithm parameters - taulambda = tau * lambda; - float divtau = 
1.0f / tau; - float divsigma = 1.0f / sigma; - float theta1 = 1.0f + theta; - - /*allocate memory for taulambda */ - //taulambda = (float*) calloc(dimZ, sizeof(float)); - //for(k=0; k < dimZ; k++) {taulambda[k] = tau*lambda[k];} - + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + // Apply Primal-Dual Hybrid Gradient scheme - int iter = 0; float residual = fLarge; - float ubarx, ubary; - + int started = 0; for(iter = 0; iter < maxIter; iter++) { - // Argument of proximal mapping of fidelity term -#pragma omp parallel for shared(v, u) private(k) - for(k=0; kstepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; -if(b > 1) -{ - // Decrease step-sizes to fit balancing principle - tau = (beta * tau) / b; - sigma = (beta * sigma) / b; - alpha = alpha0; - - copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(gx, gx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(gy, gy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - -} else if(resprimal > dual_dot_delta) -{ - // Increase primal step-size and decrease dual step-size - tau = tau / (1.0f - alpha); - sigma = sigma * (1.0f - alpha); - alpha = alpha * eta; - -} else if(resprimal < dual_div_delta) -{ - // Decrease primal step-size and increase dual step-size - tau = tau * (1.0f - alpha); - sigma = sigma / (1.0f - alpha); - alpha = alpha * eta; -} + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } -// Update variables -taulambda = tau * lambda; -//for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k]; + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } -divsigma = 1.0f / sigma; -divtau = 1.0f / tau; + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); -copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gx_upd, gx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gy_upd, gy, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ)); -// Compute residual at current iteration -residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + if(b > 1) { + + 
// Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; -// printf("%f \n", residual); -if (residual < tol) { - printf("Iterations stopped at %i with the residual %f \n", iter, residual); - break; } + if (started) { + fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } + } else { + started = 1; + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - alpha); + alpha = alpha * eta; + } else if(resprimal < dual_div_delta) { + // Decrease primal step-size and increase dual step-size + tau = tau * (1.0f - alpha); + sigma = sigma / (1.0f - alpha); + alpha = alpha * eta; + } + } + if (residual < tol) break; } - printf("Iterations stopped at %i with the residual %f \n", iter, residual); - free (u_upd); free(gx); free(gy); free(gx_upd); free(gy_upd); - free(qx); free(qy); free(qx_upd); free(qy_upd); free(v); free(vx); free(vy); - free(gradx); free(grady); free(gradx_upd); free(grady_upd); free(gradx_ubar); - free(grady_ubar); free(div); free(div_upd); - return *u; -} -float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ) -{ - float constant; - long k; - constant = 1.0f + taulambda; -#pragma omp parallel for shared(v, f, u_upd) private(k) - for(k=0; k fTiny) - { - V1 = v1 / mu1; - V3 = v0 / mu1; - } - - if(mu2 > fTiny) - { - V2 = v2 / mu2; - V4 = v0 / mu2; - } - - } else - { - if(M1 > M3) - { - V1 = V4 = 1.0f; - V2 = V3 = 0.0f; - - } else - { - V1 = V4 = 0.0f; - V2 = V3 = 1.0f; - } - } - - // Compute prox_p of the diagonal entries - sig1_upd = sig2_upd = 0.0f; - - if(p == 1) - { - sig1_upd = MAX(sig1 - divsigma, 0.0f); - sig2_upd = MAX(sig2 - divsigma, 0.0f); - - } else if(p == INFNORM) - { - proj[0] = sigma * fabs(sig1); - proj[1] = sigma * fabs(sig2); - - /*l1 projection part */ - sum = fLarge; - num = 0l; - shrinkfactor = 0.0f; - while(sum > 1.0f) - { - sum = 0.0f; - num = 0; - - for(ii = 0; ii < 2; ii++) - { - proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); - - sum += fabs(proj[ii]); - if(proj[ii]!= 0.0f) - num++; - } - - if(num > 0) - shrinkfactor = (sum - 1.0f) / num; - else - break; - } - /*l1 proj ends*/ - - sig1_upd = sig1 - divsigma * proj[0]; - sig2_upd = sig2 - divsigma * proj[1]; - } - - // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ - if(sig1 > fTiny) - sig1_upd /= sig1; - - if(sig2 > fTiny) - sig2_upd /= sig2; - - // Compute solution - t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; - t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; - t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; - - for(k = 0; k < dimZ; k++) - { - gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2; - gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3; - } - - // Delete allocated memory - free(proj); - }} - - return 1; -} + printf("Iterations stopped at %i with the residual %f \n", iter, residual); + printf("Return: %f\n", *uT); -float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ) -{ - long i, j, k, l; 
-#pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l) - for(k = 0; k < dimZ; k++) { - for(j = 0; j < dimY; j++) { - l = j * dimX; - for(i = 0; i < dimX; i++) { - if(i != dimX-1) - { - // ux[k][i+l] = u[k][i+1+l] - u[k][i+l] - div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l]; - div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l]; - } - - if(j != dimY-1) - { - // uy[k][i+l] = u[k][i+width+l] - u[k][i+l] - //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l]; - div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l]; - div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l]; - } - } - } - } - return *div_upd; +// exit(-1); + return *uT; } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c new file mode 100755 index 00000000..d771db52 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -0,0 +1,695 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/2 of memory consumption and ~4x performance + * This version is algorithmicly comptable with the original, but slight change in results + * is expected due to different order of floating-point operations. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long 
Dim1Total = (long)(dimX*(stepY+1)*padZ); + + // Auxiliar vectors + ctx->Input = malloc(Dim1Total * sizeof(float)); + ctx->u = malloc(Dim1Total * sizeof(float)); + ctx->u_upd = malloc(Dim1Total * sizeof(float)); + ctx->qx = malloc(DimTotal * sizeof(float)); + ctx->qy = malloc(DimTotal * sizeof(float)); + ctx->qx_upd = malloc(DimTotal * sizeof(float)); + ctx->qy_upd = malloc(DimTotal * sizeof(float)); + ctx->gradx = malloc(DimTotal * sizeof(float)); + ctx->grady = malloc(DimTotal * sizeof(float)); + ctx->gradx_upd = malloc(DimTotal * sizeof(float)); + ctx->grady_upd = malloc(DimTotal * sizeof(float)); + ctx->div = malloc(Dim1Total * sizeof(float)); + ctx->div_upd = malloc(Dim1Total * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { + fprintf(stderr, "Error allocating memory\n"); + exit(-1); + } + + return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + +// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(float)); + memset(ctx->qx, 0, DimTotal * sizeof(float)); + memset(ctx->qy, 0, DimTotal * sizeof(float)); + memset(ctx->gradx, 0, DimTotal * sizeof(float)); + memset(ctx->grady, 0, DimTotal * sizeof(float)); + memset(ctx->div, 0, Dim1Total * sizeof(float)); + + for(k=0; kInput[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; + ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; + } + } + } + + return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + for(k=0; kuT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; + } + } + } + + return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); + memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); + memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); + memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); + memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); + memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + + return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int 
i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ]; + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + div_upd[l] = 0; + } + } + + for(j = 0; j < stepY; j++) { +/* m = j * dimX * dimZ + (dimX - 1) * dimZ; + for(k = 0; k < dimZ; k++) { + u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; + }*/ + + for(i = 0; i < dimX/* - 1*/; i++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + + gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter + + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); +// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + 
float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { + div_upd[l + k] += qx_upd[l + k]; + div_upd[l + k + padZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { + div_upd[l + k] += qy_upd[l + k]; + div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter +} + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? + resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + tnv_ctx.padZ = dimZ; + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. 
eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float div_upd_add = ctx0->div_upd[m + l]; + ctx->div_upd[l] += div_upd_add; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + //ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history? 
+ float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } + + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - alpha); + alpha = alpha * eta; + } else if(resprimal < dual_div_delta) { + // Decrease primal step-size and increase dual step-size + tau = tau * (1.0f - alpha); + sigma = sigma / (1.0f - alpha); + alpha = alpha * eta; + } + } + + if (residual < tol) break; + } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + + printf("Iterations stopped at %i with the residual %f \n", iter, residual); + printf("Return: %f\n", *uT); + + return *uT; +} diff --git a/src/Core/regularisers_CPU/hw_sched.c b/src/Core/regularisers_CPU/hw_sched.c new file mode 100644 index 00000000..be895cc7 --- /dev/null +++ b/src/Core/regularisers_CPU/hw_sched.c @@ -0,0 +1,396 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * hst is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . 
+ */ + +#define _GNU_SOURCE +#include +#include +#include + +#ifdef HW_HAVE_SCHED_HEADERS +# include +# include +# include +#endif /* HW_HAVE_SCHED_HEADERS */ + +#include "hw_sched.h" + + +#ifdef HW_USE_THREADS +# define MUTEX_INIT(ctx, name) \ + if (!err) { \ + ctx->name##_mutex = g_mutex_new(); \ + if (!ctx->name##_mutex) err = 1; \ + } + +# define MUTEX_FREE(ctx, name) \ + if (ctx->name##_mutex) g_mutex_free(ctx->name##_mutex); + +# define COND_INIT(ctx, name) \ + MUTEX_INIT(ctx, name##_cond) \ + if (!err) { \ + ctx->name##_cond = g_cond_new(); \ + if (!ctx->name##_cond) { \ + err = 1; \ + MUTEX_FREE(ctx, name##_cond) \ + } \ + } + +# define COND_FREE(ctx, name) \ + if (ctx->name##_cond) g_cond_free(ctx->name##_cond); \ + MUTEX_FREE(ctx, name##_cond) +#else /* HW_USE_THREADS */ +# define MUTEX_INIT(ctx, name) +# define MUTEX_FREE(ctx, name) +# define COND_INIT(ctx, name) +# define COND_FREE(ctx, name) +#endif /* HW_USE_THREADS */ + + +HWRunFunction ppu_run[] = { + (HWRunFunction)NULL +}; + +static int hw_sched_initialized = 0; + +int hw_sched_init(void) { + if (!hw_sched_initialized) { +#ifdef HW_USE_THREADS + g_thread_init(NULL); +#endif /* HW_USE_THREADS */ + hw_sched_initialized = 1; + } + + return 0; +} + + +int hw_sched_get_cpu_count(void) { +#ifdef HW_HAVE_SCHED_HEADERS + int err; + + int cpu_count; + cpu_set_t mask; + + err = sched_getaffinity(getpid(), sizeof(mask), &mask); + if (err) return 1; + +# ifdef CPU_COUNT + cpu_count = CPU_COUNT(&mask); +# else + for (cpu_count = 0; cpu_count < CPU_SETSIZE; cpu_count++) { + if (!CPU_ISSET(cpu_count, &mask)) break; + } +# endif + + if (!cpu_count) cpu_count = 1; + return cpu_count; +#else /* HW_HAVE_SCHED_HEADERS */ + return 1; +#endif /* HW_HAVE_SCHED_HEADERS */ +} + + +HWSched hw_sched_create(int cpu_count) { + int i; + int err = 0; + + HWSched ctx; + + //hw_sched_init(); + + ctx = (HWSched)malloc(sizeof(HWSchedS)); + if (!ctx) return NULL; + + memset(ctx, 0, sizeof(HWSchedS)); + + ctx->status = 1; + + MUTEX_INIT(ctx, sync); + MUTEX_INIT(ctx, data); + COND_INIT(ctx, compl); + COND_INIT(ctx, job); + + if (err) { + hw_sched_destroy(ctx); + return NULL; + } + + if (!cpu_count) cpu_count = hw_sched_get_cpu_count(); + if (cpu_count > HW_MAX_THREADS) cpu_count = HW_MAX_THREADS; + + ctx->n_threads = 0; + for (i = 0; i < cpu_count; i++) { + ctx->thread[ctx->n_threads] = hw_thread_create(ctx, ctx->n_threads, NULL, ppu_run, NULL); + if (ctx->thread[ctx->n_threads]) { +#ifndef HW_USE_THREADS + ctx->thread[ctx->n_threads]->status = HW_THREAD_STATUS_STARTING; +#endif /* HW_USE_THREADS */ + ++ctx->n_threads; + } + } + + if (!ctx->n_threads) { + hw_sched_destroy(ctx); + return NULL; + } + + return ctx; +} + +static int hw_sched_wait_threads(HWSched ctx) { +#ifdef HW_USE_THREADS + int i = 0; + + hw_sched_lock(ctx, compl_cond); + while (i < ctx->n_threads) { + for (; i < ctx->n_threads; i++) { + if (ctx->thread[i]->status == HW_THREAD_STATUS_INIT) { + hw_sched_wait(ctx, compl); + break; + } + } + + } + hw_sched_unlock(ctx, compl_cond); +#endif /* HW_USE_THREADS */ + + ctx->started = 1; + + return 0; +} + +void hw_sched_destroy(HWSched ctx) { + int i; + + if (ctx->n_threads > 0) { + if (!ctx->started) { + hw_sched_wait_threads(ctx); + } + + ctx->status = 0; + hw_sched_lock(ctx, job_cond); + hw_sched_broadcast(ctx, job); + hw_sched_unlock(ctx, job_cond); + + for (i = 0; i < ctx->n_threads; i++) { + hw_thread_destroy(ctx->thread[i]); + } + } + + COND_FREE(ctx, job); + COND_FREE(ctx, compl); + MUTEX_FREE(ctx, data); + MUTEX_FREE(ctx, sync); + + free(ctx); 
+} + +int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags) { + ctx->mode = HW_SCHED_MODE_SEQUENTIAL; + ctx->n_blocks = n_blocks; + ctx->cur_block = cur_block; + ctx->flags = flags; + + return 0; +} + +int hw_sched_get_chunk(HWSched ctx, int thread_id) { + int block; + + switch (ctx->mode) { + case HW_SCHED_MODE_PREALLOCATED: + if (ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING) { +#ifndef HW_USE_THREADS + ctx->thread[thread_id]->status = HW_THREAD_STATUS_DONE; +#endif /* HW_USE_THREADS */ + return thread_id; + } else { + return HW_SCHED_CHUNK_INVALID; + } + case HW_SCHED_MODE_SEQUENTIAL: + if ((ctx->flags&HW_SCHED_FLAG_INIT_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING)) { + return HW_SCHED_CHUNK_INIT; + } + hw_sched_lock(ctx, data); + block = *ctx->cur_block; + if (block < *ctx->n_blocks) { + *ctx->cur_block = *ctx->cur_block + 1; + } else { + block = HW_SCHED_CHUNK_INVALID; + } + hw_sched_unlock(ctx, data); + if (block == HW_SCHED_CHUNK_INVALID) { + if (((ctx->flags&HW_SCHED_FLAG_FREE_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING))) { + ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING; + return HW_SCHED_CHUNK_FREE; + } + if ((ctx->flags&HW_SCHED_FLAG_TERMINATOR_CALL)&&((ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING)||(ctx->thread[thread_id]->status == HW_THREAD_STATUS_FINISHING))) { + int i; + hw_sched_lock(ctx, data); + for (i = 0; i < ctx->n_threads; i++) { + if (thread_id == i) continue; + if ((ctx->thread[i]->status != HW_THREAD_STATUS_DONE)&&(ctx->thread[i]->status != HW_THREAD_STATUS_FINISHING2)&&(ctx->thread[i]->status != HW_THREAD_STATUS_IDLE)) { + break; + } + } + ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING2; + hw_sched_unlock(ctx, data); + if (i == ctx->n_threads) { + return HW_SCHED_CHUNK_TERMINATOR; + } + } + } + return block; + default: + return HW_SCHED_CHUNK_INVALID; + } + + return -1; +} + + +int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry) { +#ifdef HW_USE_THREADS + if (!ctx->started) { + hw_sched_wait_threads(ctx); + } +#else /* HW_USE_THREADS */ + int err; + int i, chunk_id, n_threads; + HWRunFunction run; + HWThread thrctx; +#endif /* HW_USE_THREADS */ + + ctx->ctx = appctx; + ctx->entry = entry; + + switch (ctx->mode) { + case HW_SCHED_MODE_SEQUENTIAL: + *ctx->cur_block = 0; + break; + default: + ; + } + +#ifdef HW_USE_THREADS + hw_sched_lock(ctx, compl_cond); + + hw_sched_lock(ctx, job_cond); + hw_sched_broadcast(ctx, job); + hw_sched_unlock(ctx, job_cond); +#else /* HW_USE_THREADS */ + n_threads = ctx->n_threads; + + for (i = 0; i < n_threads; i++) { + thrctx = ctx->thread[i]; + thrctx->err = 0; + } + + i = 0; + thrctx = ctx->thread[i]; + chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id); + + while (chunk_id >= 0) { + run = hw_run_entry(thrctx->runs, entry); + err = run(thrctx, thrctx->hwctx, chunk_id, appctx); + if (err) { + thrctx->err = err; + break; + } + + if ((++i) == n_threads) i = 0; + thrctx = ctx->thread[i]; + chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id); + } +#endif /* HW_USE_THREADS */ + + return 0; +} + +int hw_sched_wait_task(HWSched ctx) { + int err = 0; + int i = 0, n_threads = ctx->n_threads; + +#ifdef HW_USE_THREADS + while (i < ctx->n_threads) { + for (; i < ctx->n_threads; i++) { + if (ctx->thread[i]->status == HW_THREAD_STATUS_DONE) { + ctx->thread[i]->status = HW_THREAD_STATUS_IDLE; + } else { + hw_sched_wait(ctx, compl); + break; + } + } + + } + + hw_sched_unlock(ctx, 
compl_cond);
+#endif /* HW_USE_THREADS */
+
+    for (i = 0; i < n_threads; i++) {
+        HWThread thrctx = ctx->thread[i];
+        if (thrctx->err) return err = thrctx->err;
+
+#ifndef HW_USE_THREADS
+        thrctx->status = HW_THREAD_STATUS_IDLE;
+#endif /* HW_USE_THREADS */
+    }
+
+    return err;
+}
+
+int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry) {
+    int err;
+
+    err = hw_sched_schedule_task(ctx, appctx, entry);
+    if (err) return err;
+
+    return hw_sched_wait_task(ctx);
+}
+
+int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry) {
+    int err;
+
+    ctx->saved_mode = ctx->mode;
+    ctx->mode = HW_SCHED_MODE_PREALLOCATED;
+    err = hw_sched_schedule_task(ctx, appctx, entry);
+
+    return err;
+}
+
+
+int hw_sched_wait_thread_task(HWSched ctx) {
+    int err;
+
+    err = hw_sched_wait_task(ctx);
+    ctx->mode = ctx->saved_mode;
+
+    return err;
+}
+
+int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry) {
+    int err;
+    int saved_mode = ctx->mode;
+
+    ctx->mode = HW_SCHED_MODE_PREALLOCATED;
+    err = hw_sched_execute_task(ctx, appctx, entry);
+    ctx->mode = saved_mode;
+
+    return err;
+}
diff --git a/src/Core/regularisers_CPU/hw_sched.h b/src/Core/regularisers_CPU/hw_sched.h
new file mode 100644
index 00000000..c5d1285f
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_sched.h
@@ -0,0 +1,151 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HW_SCHED_H
+#define _HW_SCHED_H
+
+#include <glib.h>
+
+    // enable threading
+#define HW_HAVE_SCHED_HEADERS
+#define HW_USE_THREADS
+
+
+typedef struct HWSchedT *HWSched;
+#ifdef HW_USE_THREADS
+typedef GMutex *HWMutex;
+#else /* HW_USE_THREADS */
+typedef void *HWMutex;
+#endif /* HW_USE_THREADS */
+
+
+#include "hw_thread.h"
+
+enum HWSchedModeT {
+    HW_SCHED_MODE_PREALLOCATED = 0,
+    HW_SCHED_MODE_SEQUENTIAL
+};
+typedef enum HWSchedModeT HWSchedMode;
+
+enum HWSchedChunkT {
+    HW_SCHED_CHUNK_INVALID = -1,
+    HW_SCHED_CHUNK_INIT = -2,
+    HW_SCHED_CHUNK_FREE = -3,
+    HW_SCHED_CHUNK_TERMINATOR = -4
+};
+typedef enum HWSchedChunkT HWSchedChunk;
+
+enum HWSchedFlagsT {
+    HW_SCHED_FLAG_INIT_CALL = 1,        //! Executed in each thread before real chunks
+    HW_SCHED_FLAG_FREE_CALL = 2,        //! Executed in each thread after real chunks
+    HW_SCHED_FLAG_TERMINATOR_CALL = 4   //!
Executed in one of the threads after all threads are done +}; +typedef enum HWSchedFlagsT HWSchedFlags; + + +#define HW_SINGLE_MODE +//#define HW_DETECT_CPU_CORES +#define HW_MAX_THREADS 128 + +#ifdef HW_SINGLE_MODE + typedef HWRunFunction HWEntry; +# define hw_run_entry(runs, entry) entry +#else /* HW_SINGLE_MODE */ + typedef int HWEntry; +# define hw_run_entry(runs, entry) runs[entry] +#endif /* HW_SINGLE_MODE */ + +#ifndef HW_HIDE_DETAILS +struct HWSchedT { + int status; + int started; + + int n_threads; + HWThread thread[HW_MAX_THREADS]; + +#ifdef HW_USE_THREADS + GCond *job_cond, *compl_cond; + GMutex *job_cond_mutex, *compl_cond_mutex, *data_mutex; + GMutex *sync_mutex; +#endif /* HW_USE_THREADS */ + + HWSchedMode mode; + HWSchedMode saved_mode; + HWSchedFlags flags; + int *n_blocks; + int *cur_block; + + HWEntry entry; + void *ctx; +}; +typedef struct HWSchedT HWSchedS; +#endif /* HW_HIDE_DETAILS */ + +# ifdef __cplusplus +extern "C" { +# endif + +HWSched hw_sched_create(int ppu_count); +int hw_sched_init(void); +void hw_sched_destroy(HWSched ctx); +int hw_sched_get_cpu_count(void); + +int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags); +int hw_sched_get_chunk(HWSched ctx, int thread_id); +int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry); +int hw_sched_wait_task(HWSched ctx); +int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry); + +int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry); +int hw_sched_wait_thread_task(HWSched ctx); +int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry); + +HWMutex hw_sched_create_mutex(void); +void hw_sched_destroy_mutex(HWMutex ctx); + +#ifdef HW_USE_THREADS +# define hw_sched_lock(ctx, type) g_mutex_lock(ctx->type##_mutex) +# define hw_sched_unlock(ctx, type) g_mutex_unlock(ctx->type##_mutex) +# define hw_sched_broadcast(ctx, type) g_cond_broadcast(ctx->type##_cond) +# define hw_sched_signal(ctx, type) g_cond_signal(ctx->type##_cond) +# define hw_sched_wait(ctx, type) g_cond_wait(ctx->type##_cond, ctx->type##_cond_mutex) + +#define hw_sched_create_mutex(void) g_mutex_new() +#define hw_sched_destroy_mutex(ctx) g_mutex_free(ctx) +#define hw_sched_lock_mutex(ctx) g_mutex_lock(ctx) +#define hw_sched_unlock_mutex(ctx) g_mutex_unlock(ctx) +#else /* HW_USE_THREADS */ +# define hw_sched_lock(ctx, type) +# define hw_sched_unlock(ctx, type) +# define hw_sched_broadcast(ctx, type) +# define hw_sched_signal(ctx, type) +# define hw_sched_wait(ctx, type) + +#define hw_sched_create_mutex(void) NULL +#define hw_sched_destroy_mutex(ctx) +#define hw_sched_lock_mutex(ctx) +#define hw_sched_unlock_mutex(ctx) +#endif /* HW_USE_THREADS */ + +# ifdef __cplusplus +} +# endif + +#endif /* _HW_SCHED_H */ + diff --git a/src/Core/regularisers_CPU/hw_thread.c b/src/Core/regularisers_CPU/hw_thread.c new file mode 100644 index 00000000..41b18006 --- /dev/null +++ b/src/Core/regularisers_CPU/hw_thread.c @@ -0,0 +1,141 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hw_sched.h"
+#include "hw_thread.h"
+
+#ifdef HW_USE_THREADS
+static void *hw_thread_function(HWThread ctx) {
+    int err;
+    int chunk_id;
+
+    HWRunFunction *runs;
+    HWRunFunction run;
+    HWSched sched;
+    void *hwctx;
+
+    sched = ctx->sched;
+    runs = ctx->run;
+    hwctx = ctx->hwctx;
+
+    hw_sched_lock(sched, job_cond);
+
+    hw_sched_lock(sched, compl_cond);
+    ctx->status = HW_THREAD_STATUS_IDLE;
+    hw_sched_broadcast(sched, compl);
+    hw_sched_unlock(sched, compl_cond);
+
+    while (sched->status) {
+        hw_sched_wait(sched, job);
+        if (!sched->status) break;
+
+        ctx->err = 0;
+        ctx->status = HW_THREAD_STATUS_STARTING;
+        hw_sched_unlock(sched, job_cond);
+
+        run = hw_run_entry(runs, sched->entry);
+#if 0
+        // Offset to interleave transfers if the GPUBox is used
+        // Just check the to-GPU time with CUDA_LAUNCH_BLOCKING and put it here
+        // It should still be significantly less than the BP time
+        // We could do calibration during initialization in the future
+
+        usleep(12000 * ctx->thread_id);
+#endif
+        chunk_id = hw_sched_get_chunk(sched, ctx->thread_id);
+
+        /* Should be after get_chunk, since we can check if it's the first time */
+        ctx->status = HW_THREAD_STATUS_RUNNING;
+        while (chunk_id != HW_SCHED_CHUNK_INVALID) {
+            //printf("Thread %i processing slice %i\n", ctx->thread_id, chunk_id);
+            err = run(ctx, hwctx, chunk_id, sched->ctx);
+            if (err) {
+                ctx->err = err;
+                break;
+            }
+            chunk_id = hw_sched_get_chunk(sched, ctx->thread_id);
+        }
+
+        hw_sched_lock(sched, job_cond);
+
+        hw_sched_lock(sched, compl_cond);
+        ctx->status = HW_THREAD_STATUS_DONE;
+        hw_sched_broadcast(sched, compl);
+        hw_sched_unlock(sched, compl_cond);
+    }
+
+    hw_sched_unlock(sched, job_cond);
+
+    g_thread_exit(NULL);
+    return NULL; /* TODO: check this */
+}
+#endif /* HW_USE_THREADS */
+
+
+HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func) {
+    GError *err = NULL;
+
+    HWThread ctx;
+
+    ctx = (HWThread)malloc(sizeof(HWThreadS));
+    if (!ctx) return ctx;
+
+    memset(ctx, 0, sizeof(HWThreadS));
+
+    ctx->sched = sched;
+    ctx->hwctx = hwctx;
+    ctx->run = run_func;
+    ctx->free = free_func;
+    ctx->thread_id = thread_id;
+    ctx->status = HW_THREAD_STATUS_INIT;
+
+#ifdef HW_USE_THREADS
+    ctx->thread = g_thread_create((GThreadFunc)hw_thread_function, ctx, 1, &err);
+    if (!ctx->thread) {
+        g_error_free(err);
+
+        hw_thread_destroy(ctx);
+        return NULL;
+    }
+#endif /* HW_USE_THREADS */
+
+    return ctx;
+}
+
+void hw_thread_destroy(HWThread ctx) {
+#ifdef HW_USE_THREADS
+    if (ctx->thread) {
+        g_thread_join(ctx->thread);
+    }
+#endif /* HW_USE_THREADS */
+
+    if (ctx->data) {
+        free(ctx->data);
+    }
+
+    if (ctx->free) {
+        ctx->free(ctx->hwctx);
+    }
+
+    free(ctx);
+}
diff --git a/src/Core/regularisers_CPU/hw_thread.h b/src/Core/regularisers_CPU/hw_thread.h
new file mode 100644
index 00000000..de7f60f3
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_thread.h
@@ -0,0 +1,76 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HW_THREAD_H
+#define _HW_THREAD_H
+
+#include <glib.h>
+
+typedef struct HWThreadT *HWThread;
+typedef int (*HWRunFunction)(HWThread thread, void *ctx, int block, void *attr);
+typedef int (*HWFreeFunction)(void *ctx);
+
+#include "hw_sched.h"
+
+enum HWThreadStatusT {
+    HW_THREAD_STATUS_IDLE = 0,
+    HW_THREAD_STATUS_STARTING = 1,
+    HW_THREAD_STATUS_RUNNING = 2,
+    HW_THREAD_STATUS_FINISHING = 3,
+    HW_THREAD_STATUS_FINISHING2 = 4,
+    HW_THREAD_STATUS_DONE = 5,
+    HW_THREAD_STATUS_INIT = 6
+};
+typedef enum HWThreadStatusT HWThreadStatus;
+
+
+#ifndef HW_HIDE_DETAILS
+struct HWThreadT {
+    int thread_id;
+    HWSched sched;
+
+#ifdef HW_USE_THREADS
+    GThread *thread;
+#endif /* HW_USE_THREADS */
+
+    void *hwctx;
+    HWRunFunction *run;
+    HWFreeFunction free;
+
+    int err;
+    HWThreadStatus status;
+
+    void *data;    /**< Per-thread data storage, will be free'd if set */
+};
+typedef struct HWThreadT HWThreadS;
+#endif /* HW_HIDE_DETAILS */
+
+# ifdef __cplusplus
extern "C" {
+# endif
+
+HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func);
+void hw_thread_destroy(HWThread ctx);
+
+# ifdef __cplusplus
+}
+# endif
+
+
+#endif /* _HW_THREAD_H */
From ec31378b47fdf0eae06732e4e91c3b1bb02c454a Mon Sep 17 00:00:00 2001
From: "Suren A.
Chilingaryan" Date: Sun, 29 Mar 2020 16:04:44 +0200 Subject: [PATCH 02/10] Optimize cache usage with blocking (icc is faster and gcc is slightly slower than previous version) --- src/Core/regularisers_CPU/TNV_core.c | 94 +++++++++++++++------------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index be7fdef7..dce414ac 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -25,6 +25,7 @@ #include "TNV_core.h" +#define BLOCK 32 #define min(a,b) (((a)<(b))?(a):(b)) inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { @@ -141,7 +142,7 @@ inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, in typedef struct { int offY, stepY, copY; float *Input, *u, *qx, *qy, *gradx, *grady, *div; - float *div0, *udiff0, *udiff; + float *div0, *udiff0; float resprimal, resdual; float unorm, qnorm, product; } tnv_thread_t; @@ -173,7 +174,6 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { free(ctx->div0); free(ctx->udiff0); - free(ctx->udiff); return 0; } @@ -196,19 +196,18 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long DimRow = (long)(dimX * padZ); // Auxiliar vectors - ctx->Input = malloc(Dim1Total * sizeof(float)); - ctx->u = malloc(Dim1Total * sizeof(float)); - ctx->qx = malloc(DimTotal * sizeof(float)); - ctx->qy = malloc(DimTotal * sizeof(float)); - ctx->gradx = malloc(DimTotal * sizeof(float)); - ctx->grady = malloc(DimTotal * sizeof(float)); - ctx->div = malloc(Dim1Total * sizeof(float)); - - ctx->div0 = malloc(DimRow * sizeof(float)); - ctx->udiff0 = malloc(DimRow * sizeof(float)); - ctx->udiff = malloc(DimRow * sizeof(float)); - - if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { + ctx->Input = memalign(64, Dim1Total * sizeof(float)); + ctx->u = memalign(64, Dim1Total * sizeof(float)); + ctx->qx = memalign(64, DimTotal * sizeof(float)); + ctx->qy = memalign(64, DimTotal * sizeof(float)); + ctx->gradx = memalign(64, DimTotal * sizeof(float)); + ctx->grady = memalign(64, DimTotal * sizeof(float)); + ctx->div = memalign(64, Dim1Total * sizeof(float)); + + ctx->div0 = memalign(64, DimRow * sizeof(float)); + ctx->udiff0 = memalign(64, DimRow * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { fprintf(stderr, "Error allocating memory\n"); exit(-1); } @@ -304,7 +303,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l, m; + long i, j, k, l; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -349,39 +348,45 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float qxdiff; float qydiff; float divdiff; - float gradxdiff[dimZ]; - float gradydiff[dimZ]; - float ubarx[dimZ]; - float ubary[dimZ]; - float udiff_next[dimZ]; + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + float ubarx[dimZ] __attribute__((aligned(64))); + float ubary[dimZ] __attribute__((aligned(64))); + float udiff[dimZ] __attribute__((aligned(64))); + for(i=0; i < dimX; i++) { for(k = 0; k < dimZ; k++) { int l = i * padZ + k; float u_upd = 
(u[l] + tau * div[l] + taulambda * Input[l])/constant; float udiff = u[l] - u_upd; - ctx->udiff[l] = udiff; ctx->udiff0[l] = udiff; ctx->div0[l] = div[l]; - u[l] = u_upd; } } - for(j = 0; j < stepY; j++) { - for(i = 0; i < dimX; i++) { + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { float t[3]; float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } l = (j * dimX + i) * padZ; - m = dimX * padZ; - -//#pragma unroll 64 + +#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { - float u_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; - udiff_next[k] = u[l + k + m] - u_upd; - u[l + k + m] = u_upd; + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd; + u[l + k] = u_upd; - float gradx_upd = (i == (dimX - 1))?0:(u[l + k + padZ] - u[l + k]); - float grady_upd = (j == (copY - 1))?0:(u[l + k + m] - u[l + k]); + float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); + float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); gradxdiff[k] = gradx[l + k] - gradx_upd; gradydiff[k] = grady[l + k] - grady_upd; gradx[l + k] = gradx_upd; @@ -398,7 +403,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma unroll 64 +#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { float vx = ubarx[k] + divsigma * qx[l + k]; float vy = ubary[k] + divsigma * qy[l + k]; @@ -411,26 +417,26 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { qx[l + k] += qxdiff; qy[l + k] += qydiff; - float udiff = ctx->udiff[i * padZ + k]; - ctx->udiff[i * padZ + k] = udiff_next[k]; - unorm += (udiff * udiff); + unorm += (udiff[k] * udiff[k]); qnorm += (qxdiff * qxdiff + qydiff * qydiff); float div_upd = 0; div_upd -= (i > 0)?qx[l + k - padZ]:0; - div_upd -= (j > 0)?qy[l + k - m]:0; + div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; div_upd += (i < (dimX-1))?qx[l + k]:0; div_upd += (j < (copY-1))?qy[l + k]:0; divdiff = div[l + k] - div_upd; div[l + k] = div_upd; - resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff + divdiff):0; + resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; resdual += fabs(divsigma * qxdiff + gradxdiff[k]); resdual += fabs(divsigma * qydiff + gradydiff[k]); product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); } } // i + } // j + } // i } // j @@ -452,8 +458,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); - tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -573,8 +580,9 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to float udiff = ctx->udiff0[l]; ctx->div[l] -= ctx0->qy[l + m]; - ctx0->div[m + l + dimX*padZ] = ctx->div[l]; - + ctx0->div[m + l + dimX * padZ] = ctx->div[l]; + ctx0->u[m + l + dimX * padZ] = ctx->u[l]; + divdiff += ctx0->qy[l + m]; resprimal += 
fabs(divtau * udiff + divdiff); } From 9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 20:22:23 +0200 Subject: [PATCH 03/10] Optimize cache usage also for the version with full back-tracking --- .../regularisers_CPU/TNV_core_backtrack.c | 123 +++++++++--------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index d771db52..71924751 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -22,8 +22,10 @@ * limitations under the License. */ +#include #include "TNV_core.h" +#define BLOCK 32 #define min(a,b) (((a)<(b))?(a):(b)) inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { @@ -196,18 +198,18 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long Dim1Total = (long)(dimX*(stepY+1)*padZ); // Auxiliar vectors - ctx->Input = malloc(Dim1Total * sizeof(float)); - ctx->u = malloc(Dim1Total * sizeof(float)); - ctx->u_upd = malloc(Dim1Total * sizeof(float)); - ctx->qx = malloc(DimTotal * sizeof(float)); - ctx->qy = malloc(DimTotal * sizeof(float)); - ctx->qx_upd = malloc(DimTotal * sizeof(float)); - ctx->qy_upd = malloc(DimTotal * sizeof(float)); - ctx->gradx = malloc(DimTotal * sizeof(float)); - ctx->grady = malloc(DimTotal * sizeof(float)); - ctx->gradx_upd = malloc(DimTotal * sizeof(float)); - ctx->grady_upd = malloc(DimTotal * sizeof(float)); - ctx->div = malloc(Dim1Total * sizeof(float)); + ctx->Input = memalign(64, Dim1Total * sizeof(float)); + ctx->u = memalign(64, Dim1Total * sizeof(float)); + ctx->u_upd = memalign(64, Dim1Total * sizeof(float)); + ctx->qx = memalign(64, DimTotal * sizeof(float)); + ctx->qy = memalign(64, DimTotal * sizeof(float)); + ctx->qx_upd = memalign(64, DimTotal * sizeof(float)); + ctx->qy_upd = memalign(64, DimTotal * sizeof(float)); + ctx->gradx = memalign(64, DimTotal * sizeof(float)); + ctx->grady = memalign(64, DimTotal * sizeof(float)); + ctx->gradx_upd = memalign(64, DimTotal * sizeof(float)); + ctx->grady_upd = memalign(64, DimTotal * sizeof(float)); + ctx->div = memalign(64, Dim1Total * sizeof(float)); ctx->div_upd = malloc(Dim1Total * sizeof(float)); if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { @@ -332,7 +334,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l, m; + long i, j, k, l; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -374,50 +376,41 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float theta1 = 1.0f + theta; float constant = 1.0f + taulambda; - float resprimal = 0.0f; - float resdual = 0.0f; - float product = 0.0f; - float unorm = 0.0f; - float qnorm = 0.0f; - - float udiff[dimZ]; - float qxdiff; - float qydiff; - float divdiff; - float gradxdiff[dimZ]; - float gradydiff[dimZ]; - - for(i=0; i < dimX; i++) { - for(k = 0; k < dimZ; k++) { - int l = i * padZ + k; - u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; - div_upd[l] = 0; - } - } - - for(j = 0; j < stepY; j++) { -/* m = j * dimX * dimZ + (dimX - 1) * dimZ; - for(k = 0; k < dimZ; k++) { - u_upd[k + m] = 
(u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; - }*/ - - for(i = 0; i < dimX/* - 1*/; i++) { + float resprimal = 0.0; + float resdual = 0.0; + float product = 0.0; + float unorm = 0.0; + float qnorm = 0.0; + + float udiff[dimZ] __attribute__((aligned(64))); + float qxdiff __attribute__((aligned(64))); + float qydiff __attribute__((aligned(64))); + float divdiff __attribute__((aligned(64))); + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { float t[3]; float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } l = (j * dimX + i) * padZ; - m = dimX * padZ; -//#pragma unroll 64 +//#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { - u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; - - gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); - grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter - + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; udiff[k] = u[l + k] - u_upd[l + k]; unorm += (udiff[k] * udiff[k]); -// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; gradydiff[k] = grady[l + k] - grady_upd[l + k]; @@ -440,7 +433,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma unroll 64 +//#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { #ifdef TNV_NEW_STYLE float vx = divsigma * qx_upd[l + k]; @@ -464,14 +458,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); #endif -if(i != (dimX-1)) { - div_upd[l + k] += qx_upd[l + k]; - div_upd[l + k + padZ] -= qx_upd[l + k]; -} -if(j != (copY-1)) { - div_upd[l + k] += qy_upd[l + k]; - div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter -} + float div_upd_val = 0; + div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; + div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; + div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; + div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; + div_upd[l + k] = div_upd_val; qxdiff = qx[l + k] - qx_upd[l + k]; qydiff = qy[l + k] - qy_upd[l + k]; @@ -488,7 +480,9 @@ if(j != (copY-1)) { } } // i - } + } // j + } // i + } // j ctx->resprimal = resprimal; @@ -509,8 +503,8 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -620,17 +614,16 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to tnv_thread_t *ctx0 = 
tnv_ctx.thr_ctx + (j - 1); tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; - m = ctx0->stepY * dimX * padZ; + m = ctx0->stepY * dimX * padZ; for(i = 0; i < dimX; i++) { for(k = 0; k < dimZ; k++) { int l = i * padZ + k; - float div_upd_add = ctx0->div_upd[m + l]; - ctx->div_upd[l] += div_upd_add; + ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; ctx0->div_upd[m + l] = ctx->div_upd[l]; - //ctx0->u_upd[m + l] = ctx->u_upd[l]; + ctx0->u_upd[m + l] = ctx->u_upd[l]; - float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history? + float divdiff = ctx->div[l] - ctx->div_upd[l]; float udiff = ctx->u[l] - ctx->u_upd[l]; resprimal += fabs(divtau * udiff + divdiff); } From da30ce619bce168fc24ed0c17f04e411626ba18b Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 20:34:12 +0200 Subject: [PATCH 04/10] Eliminate conditionals in the innermost loop to help gcc auto-vectorizer --- src/Core/CMakeLists.txt | 13 +- src/Core/regularisers_CPU/TNV_core.c | 216 ++++++++++-------- .../regularisers_CPU/TNV_core_backtrack.c | 189 ++++++++------- .../TNV_core_backtrack_loop.h | 100 ++++++++ src/Core/regularisers_CPU/TNV_core_loop.h | 107 +++++++++ 5 files changed, 423 insertions(+), 202 deletions(-) create mode 100644 src/Core/regularisers_CPU/TNV_core_backtrack_loop.h create mode 100644 src/Core/regularisers_CPU/TNV_core_loop.h diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 10b16f30..76b0f3e5 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -60,9 +60,16 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message("Adding regularisers as a shared library") #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") -set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") -#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") diff --git a/src/Core/regularisers_CPU/TNV_core.c 
b/src/Core/regularisers_CPU/TNV_core.c index dce414ac..415c6448 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,12 +5,6 @@ * * Copyright 2017 Daniil Kazantsev * Copyright 2017 Srikanth Nagella, Edoardo Pasca - * - * Copyriht 2020 Suren A. Chlingaryan - * Optimized version with 1/3 of memory consumption and ~10x performance - * This version is not able to perform back-track except during first iterations - * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c) - * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +17,7 @@ * limitations under the License. */ +#include #include "TNV_core.h" #define BLOCK 32 @@ -143,6 +138,7 @@ typedef struct { int offY, stepY, copY; float *Input, *u, *qx, *qy, *gradx, *grady, *div; float *div0, *udiff0; + float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; float resprimal, resdual; float unorm, qnorm, product; } tnv_thread_t; @@ -174,7 +170,13 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { free(ctx->div0); free(ctx->udiff0); - + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + return 0; } @@ -194,6 +196,7 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long DimTotal = (long)(dimX*stepY*padZ); long Dim1Total = (long)(dimX*(stepY+1)*padZ); long DimRow = (long)(dimX * padZ); + long DimCell = (long)(padZ); // Auxiliar vectors ctx->Input = memalign(64, Dim1Total * sizeof(float)); @@ -207,7 +210,13 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { ctx->div0 = memalign(64, DimRow * sizeof(float)); ctx->udiff0 = memalign(64, DimRow * sizeof(float)); - if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { + ctx->gradxdiff = memalign(64, DimCell * sizeof(float)); + ctx->gradydiff = memalign(64, DimCell * sizeof(float)); + ctx->ubarx = memalign(64, DimCell * sizeof(float)); + ctx->ubary = memalign(64, DimCell * sizeof(float)); + ctx->udiff = memalign(64, DimCell * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { fprintf(stderr, "Error allocating memory\n"); exit(-1); } @@ -303,7 +312,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l; + long i, j, k, l, m; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -340,7 +349,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float constant = 1.0f + taulambda; float resprimal = 0.0f; - float resdual = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; float product = 0.0f; float unorm = 0.0f; float qnorm = 0.0f; @@ -348,100 +358,92 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float qxdiff; float qydiff; float divdiff; - float gradxdiff[dimZ] __attribute__((aligned(64))); - float gradydiff[dimZ] __attribute__((aligned(64))); - float ubarx[dimZ] __attribute__((aligned(64))); - float ubary[dimZ] __attribute__((aligned(64))); - float udiff[dimZ] 
__attribute__((aligned(64))); - - - for(i=0; i < dimX; i++) { - for(k = 0; k < dimZ; k++) { - int l = i * padZ + k; - float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; - float udiff = u[l] - u_upd; - ctx->udiff0[l] = udiff; - ctx->div0[l] = div[l]; + float *gradxdiff = ctx->gradxdiff; + float *gradydiff = ctx->gradydiff; + float *ubarx = ctx->ubarx; + float *ubary = ctx->ubary; + float *udiff = ctx->udiff; + + float *udiff0 = ctx->udiff0; + float *div0 = ctx->div0; + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J } - for(int j1 = 0; j1 < stepY; j1 += BLOCK) { - for(int i1 = 0; i1 < dimX; i1 += BLOCK) { - for(int j2 = 0; j2 < BLOCK; j2++) { - j = j1 + j2; - for(int i2 = 0; i2 < BLOCK; i2++) { - float t[3]; - float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - - i = i1 + i2; - if (i == dimX) break; - if (j == stepY) { j2 = BLOCK; break; } - l = (j * dimX + i) * padZ; - -#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; - udiff[k] = u[l + k] - u_upd; - u[l + k] = u_upd; - - float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); - float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); - gradxdiff[k] = gradx[l + k] - gradx_upd; - gradydiff[k] = grady[l + k] - grady_upd; - gradx[l + k] = gradx_upd; - grady[l + k] = grady_upd; - - ubarx[k] = gradx_upd - theta * gradxdiff[k]; - ubary[k] = grady_upd - theta * gradydiff[k]; - - float vx = ubarx[k] + divsigma * qx[l + k]; - float vy = ubary[k] + divsigma * qy[l + k]; - - M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); - } - coefF(t, M1, M2, M3, sigma, p, q, r); - -#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - float vx = ubarx[k] + divsigma * qx[l + k]; - float vy = ubary[k] + divsigma * qy[l + k]; - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qxdiff = sigma * (ubarx[k] - gx_upd); - qydiff = sigma * (ubary[k] - gy_upd); - - qx[l + k] += qxdiff; - qy[l + k] += qydiff; - - unorm += (udiff[k] * udiff[k]); - qnorm += (qxdiff * qxdiff + qydiff * qydiff); - - float div_upd = 0; - div_upd -= (i > 0)?qx[l + k - padZ]:0; - div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; - div_upd += (i < (dimX-1))?qx[l + k]:0; - div_upd += (j < (copY-1))?qy[l + k]:0; - divdiff = div[l + k] - div_upd; - div[l + k] = div_upd; - - resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; - resdual += fabs(divsigma * qxdiff + gradxdiff[k]); - resdual += fabs(divsigma * qydiff + gradydiff[k]); - product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } } - } // i - } // j - } 
// i - } // j + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + ctx->resprimal = resprimal; - ctx->resdual = resdual; + ctx->resdual = resdual1 + resdual2; ctx->product = product; ctx->unorm = unorm; ctx->qnorm = qnorm; @@ -458,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = dimZ; + tnv_ctx.padZ = dimZ; // tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); - tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -580,15 +582,28 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to float udiff = ctx->udiff0[l]; ctx->div[l] -= ctx0->qy[l + m]; - ctx0->div[m + l + dimX * padZ] = ctx->div[l]; - ctx0->u[m + l + dimX * padZ] = ctx->u[l]; - + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + divdiff += ctx0->qy[l + m]; resprimal += fabs(divtau * udiff + divdiff); } } } + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + for (j = 0; j < tnv_ctx.threads; j++) { tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; resprimal += ctx->resprimal; @@ -645,6 +660,5 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to printf("Iterations stopped at %i with the residual %f \n", iter, residual); printf("Return: %f\n", *uT); -// exit(-1); return *uT; } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index 71924751..9b19ed5c 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -335,6 +335,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { long i, j, k, l; + long i1, i2, j1, j2; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -376,11 +377,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float theta1 = 1.0f + theta; float constant = 1.0f + taulambda; - float resprimal = 0.0; - float resdual = 0.0; - float product = 0.0; - float unorm = 0.0; - float qnorm = 0.0; + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; float udiff[dimZ] __attribute__((aligned(64))); float qxdiff __attribute__((aligned(64))); @@ -389,104 +391,82 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float gradxdiff[dimZ] __attribute__((aligned(64))); float gradydiff[dimZ] __attribute__((aligned(64))); - for(int j1 = 0; j1 < stepY; j1 += BLOCK) { - for(int i1 = 0; i1 < dimX; i1 += BLOCK) { - for(int j2 = 0; j2 < BLOCK; j2++) { - j = 
j1 + j2; - for(int i2 = 0; i2 < BLOCK; i2++) { - float t[3]; - float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - - i = i1 + i2; - if (i == dimX) break; - if (j == stepY) { j2 = BLOCK; break; } - l = (j * dimX + i) * padZ; - -//#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; - udiff[k] = u[l + k] - u_upd[l + k]; - unorm += (udiff[k] * udiff[k]); - - gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); - grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); - gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; - gradydiff[k] = grady[l + k] - grady_upd[l + k]; - - float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; - float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; -//#define TNV_NEW_STYLE -#ifdef TNV_NEW_STYLE - qx_upd[l + k] = qx[l + k] + sigma * ubarx; - qy_upd[l + k] = qy[l + k] + sigma * ubary; - - float vx = divsigma * qx_upd[l + k]; //+ ubarx - float vy = divsigma * qy_upd[l + k]; //+ ubary -#else - float vx = ubarx + divsigma * qx[l + k]; - float vy = ubary + divsigma * qy[l + k]; -#endif - - M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); - } - coefF(t, M1, M2, M3, sigma, p, q, r); - -//#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { -#ifdef TNV_NEW_STYLE - float vx = divsigma * qx_upd[l + k]; - float vy = divsigma * qy_upd[l + k]; - - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qx_upd[l + k] -= sigma * gx_upd; - qy_upd[l + k] -= sigma * gy_upd; -#else - float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; - float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; - float vx = ubarx + divsigma * qx[l + k]; - float vy = ubary + divsigma * qy[l + k]; - - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); - qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); -#endif - - float div_upd_val = 0; - div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; - div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; - div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; - div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; - div_upd[l + k] = div_upd_val; - - qxdiff = qx[l + k] - qx_upd[l + k]; - qydiff = qy[l + k] - qy_upd[l + k]; - qnorm += (qxdiff * qxdiff + qydiff * qydiff); - - resdual += fabs(divsigma * qxdiff - gradxdiff[k]); - resdual += fabs(divsigma * qydiff - gradydiff[k]); - product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); - - if ((offY == 0)||(j > 0)) { - divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
- resprimal += fabs(divtau * udiff[k] + divdiff); - } + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_backtrack_loop.h" + } } - } // i - } // j - } // i - } // j - + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + ctx->resprimal = resprimal; - ctx->resdual = resdual; + ctx->resdual = resdual1 + resdual2; ctx->product = product; ctx->unorm = unorm; ctx->qnorm = qnorm; @@ -630,6 +610,19 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to } } + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + for (j = 0; j < tnv_ctx.threads; j++) { tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; resprimal += ctx->resprimal; diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h new file mode 100644 index 00000000..3ec42509 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h @@ -0,0 +1,100 @@ + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + l = (j * dimX + i) * padZ; + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); + +#ifdef TNV_LOOP_LAST_I + gradx_upd[l + k] = 0; +#else + gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); +#endif + +#ifdef TNV_LOOP_LAST_J + grady_upd[l + k] = 0; +#else + grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l 
+ k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + + float div_upd_val = 0; +#ifndef TNV_LOOP_FIRST_I + div_upd_val -= qx_upd[l + k - padZ]; +#endif + +#ifndef TNV_LOOP_FIRST_J + div_upd_val -= qy_upd[l + k - dimX * padZ]; +#endif +#ifndef TNV_LOOP_LAST_I + div_upd_val += qx_upd[l + k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd_val += qy_upd[l + k]; +#endif + div_upd[l + k] = div_upd_val; + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual2 += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +#ifndef TNV_LOOP_FIRST_J + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
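+        // Primal residual |divtau * udiff + divdiff|. Under TNV_LOOP_FIRST_J it
+        // is skipped: on the first row of a thread's sub-block div_upd still
+        // depends on qy_upd from the neighbouring thread, so TNV_CPU_main adds
+        // the contribution of these border rows after the threads synchronise.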
+ resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + } diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h new file mode 100644 index 00000000..34e71398 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_loop.h @@ -0,0 +1,107 @@ + { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + + float *__restrict u_next = u + l + padZ; + float *__restrict u_current = u + l; + float *__restrict u_next_row = u + l + m; + + + float *__restrict qx_current = qx + l; + float *__restrict qy_current = qy + l; + float *__restrict qx_prev = qx + l - padZ; + float *__restrict qy_prev = qy + l - m; + + +// __assume(padZ%16==0); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads + udiff[k] = u[l + k] - u_upd; // cache 1w + u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J + udiff0[l + k] = udiff[k]; + div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I + float gradx_upd = 0; +#else + float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads + float gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J + float grady_upd = 0; +#else + float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads + float grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w + gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w + gradx[l + k] = gradx_upd; // 1 write + grady[l + k] = grady_upd; // 1 write + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w + ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + + float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read + float vy = ubary[k] + divsigma * qy[l + k]; // 1 read + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < padZ; k++) { + float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r + float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx_current[k] += qxdiff; // write 1 + qy_current[k] += qydiff; // write 1 + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I + div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J + div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I + div_upd += qx_current[k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd += qy_current[k]; +#endif + + divdiff = div[l + k] - div_upd; // 1 read + div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + + // We need to have two independent accumulators to allow gcc-autovectorization + resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r + resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + } + \ No newline at end of file From febfe9a6490052d4b8789fd8f7a0342115bfd55e Mon Sep 17 00:00:00 2001 From: "Suren A. 
Chilingaryan" Date: Sun, 29 Mar 2020 23:13:21 +0200 Subject: [PATCH 05/10] Enable padding over Z dimmension --- src/Core/CMakeLists.txt | 10 ++++++---- src/Core/regularisers_CPU/TNV_core.c | 4 ++-- src/Core/regularisers_CPU/TNV_core_backtrack.c | 4 ++-- src/Core/regularisers_CPU/TNV_core_backtrack_loop.h | 4 ++-- src/Core/regularisers_CPU/TNV_core_loop.h | 4 ++-- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 76b0f3e5..9a53b676 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -60,15 +60,17 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message("Adding regularisers as a shared library") #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") #set(CMAKE_C_COMPILER clang) #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") -set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index 415c6448..7b8023bc 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -460,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower - tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = dimZ; // tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); -// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index 9b19ed5c..7eb367e3 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c 
@@ -483,8 +483,8 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower - tnv_ctx.padZ = dimZ; -// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +// tnv_ctx.padZ = dimZ; + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h index 3ec42509..2605d223 100644 --- a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h +++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h @@ -3,7 +3,7 @@ l = (j * dimX + i) * padZ; -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < dimZ; k++) { u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; @@ -44,7 +44,7 @@ coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < dimZ; k++) { #ifdef TNV_NEW_STYLE diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h index 34e71398..3f6d9bc2 100644 --- a/src/Core/regularisers_CPU/TNV_core_loop.h +++ b/src/Core/regularisers_CPU/TNV_core_loop.h @@ -17,7 +17,7 @@ // __assume(padZ%16==0); -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < dimZ; k++) { float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads @@ -59,7 +59,7 @@ coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < padZ; k++) { float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r From cd0dd76a48aada43c3066642a8eb327a0374d113 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 23:13:45 +0200 Subject: [PATCH 06/10] Add optimization steps for reference --- src/Core/performance_CPU/README | 25 + src/Core/performance_CPU/TNV_core.c.v15 | 731 ++++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v17 | 690 +++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v18 | 688 +++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v19 | 681 ++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v27 | 650 ++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v32 | 676 ++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v4.stdver | 629 +++++++++++++++ .../TNV_core_backtrack_loop.h.v19 | 100 +++ src/Core/performance_CPU/TNV_core_loop.h.v32 | 119 +++ 10 files changed, 4989 insertions(+) create mode 100644 src/Core/performance_CPU/README create mode 100755 src/Core/performance_CPU/TNV_core.c.v15 create mode 100755 src/Core/performance_CPU/TNV_core.c.v17 create mode 100755 src/Core/performance_CPU/TNV_core.c.v18 create mode 100755 src/Core/performance_CPU/TNV_core.c.v19 create mode 100755 src/Core/performance_CPU/TNV_core.c.v27 create mode 100755 src/Core/performance_CPU/TNV_core.c.v32 create mode 100755 src/Core/performance_CPU/TNV_core.c.v4.stdver create mode 100644 src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 create mode 100644 src/Core/performance_CPU/TNV_core_loop.h.v32 diff --git a/src/Core/performance_CPU/README b/src/Core/performance_CPU/README new file mode 100644 index 00000000..948c544a --- /dev/null +++ b/src/Core/performance_CPU/README @@ -0,0 +1,25 @@ +TNV_core.c.v4.stdver: Fully equivalent (signle-threads). There is two potential breakage points. 
+ - If OpenMP is enabled, the access to div_upd will not be serialized and the results will break
+ - The results will slightly differ due to a different order of summation if the loop summing resprimal/resdual is organized in a logical way
+TNV_core.c.v15: Multi-threaded. Matches exactly only in single-threaded mode (and only if TNV_NEW_STYLE is disabled). With multiple threads the results slightly differ due to a changed order of operations
+ - TNV_NEW_STYLE slightly disturbs results in both single- and multi-threaded modes
+ - Resprimal/resdual are summed in groups (not sequentially) when multiple threads are used, but this should actually improve precision. Use TNV_CHECK_RES to check conformance
+ - Afterwards, in multi-threaded mode there is still a minor discrepancy which first occurs in resprimal (after a few iterations). This is caused by a changed order of operations while computing
+   div_upd (only on the first rows of each sub-block). Normally, we first compute the vertical component and then add the horizontal one; on the border rows we instead add the horizontal first...
+   To verify: with div/div_upd changed to double, there is no difference.
+TNV_core.c.v17: Computationally compatible with v15.
+ - Padding actually harms performance
+ - The Intel compiler gives about a 10% speed-up
+TNV_core.c.v18: Blocking boosts performance further, but only with the Intel compiler; gcc/clang are slightly slower here.
+ - Padding doesn't harm performance here, but is not helpful either
+ - The difference between icc and gcc is probably due to auto-vectorization
+ - Results slightly changed due to a different order of operations
+TNV_core.c.v19: Eliminates conditionals in the inner loops to help gcc auto-vectorization
+ - Last version implementing the full algorithm with backtracking in the middle of iterations
+ - Again, results slightly diverge from v18 due to a different order of operations
+TNV_core.c.v27: v18 with backtracking only during the first iterations (otherwise a warning is reported)
+TNV_core.c.v32: v19 with backtracking only during the first iterations (otherwise a warning is reported)
+
+
+Repo:
+ - Padding seems to have an effect on newer AVX2 systems. Re-enabled.
diff --git a/src/Core/performance_CPU/TNV_core.c.v15 b/src/Core/performance_CPU/TNV_core.c.v15
new file mode 100755
index 00000000..3161cbfb
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v15
@@ -0,0 +1,731 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +/*inline*/ void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, realY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + double *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int realY = ctx->realY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*realY*dimZ); + 
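// Dim1Total reserves one halo row beyond this worker's stepY rows: the
+    // first row of the next worker is mirrored locally so grady and div can
+    // be formed without reading another thread's buffers (copY = stepY + 1
+    // on all but the last thread).  Hypothetical layout for stepY = 3:
+    //   local rows 0..2 -> global rows offY..offY+2 (owned)
+    //   local row  3    -> halo copy of global row offY+3
+    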
long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    // Auxiliary vectors
+    ctx->Input = calloc(Dim1Total, sizeof(float));
+    ctx->u = calloc(Dim1Total, sizeof(float));
+    ctx->u_upd = calloc(Dim1Total, sizeof(float));
+    ctx->qx = calloc(DimTotal, sizeof(float));
+    ctx->qy = calloc(DimTotal, sizeof(float));
+    ctx->qx_upd = calloc(DimTotal, sizeof(float));
+    ctx->qy_upd = calloc(DimTotal, sizeof(float));
+    ctx->gradx = calloc(DimTotal, sizeof(float));
+    ctx->grady = calloc(DimTotal, sizeof(float));
+    ctx->gradx_upd = calloc(DimTotal, sizeof(float));
+    ctx->grady_upd = calloc(DimTotal, sizeof(float));
+    ctx->div = calloc(Dim1Total, sizeof(double));
+    ctx->div_upd = calloc(Dim1Total, sizeof(double));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int realY = ctx->realY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*realY*dimZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(double));
+    memset(ctx->u_upd, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->qy_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->grady_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->div_upd, 0, Dim1Total * sizeof(double));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * dimZ + i * dimZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * dimZ + i * dimZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int realY = ctx->realY;
+    int copY = ctx->copY;
+
+    long DimTotal = (long)(dimX*realY*dimZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * dimZ + i * dimZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int realY = ctx->realY;
+    long DimTotal = (long)(dimX*realY*dimZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    // Auxiliary vectors
+    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+    memcpy(ctx->qx, ctx->qx_upd,
DimTotal * sizeof(float)); + memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); + memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); + memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); + memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(double)); + + return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int realY = ctx->realY; + long DimTotal = (long)(dimX*realY*dimZ); + long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(double)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + double *div = ctx->div; + double *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ]; + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + + for(k=0; k < dimZ * dimX; k++) { + u_upd[k] = (u[k] + tau * div[k] + taulambda * Input[k])/constant; + div_upd[k] = 0; + } + + for(j = 0; j < stepY; j++) { +/* m = j * dimX * dimZ + (dimX - 1) * dimZ; + for(k = 0; k < dimZ; k++) { + u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; + }*/ + + for(i=0; i < (dimX /*- 1*/); i++) { + l = (j * dimX + i) * dimZ; + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + m = dimX * dimZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + + gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + dimZ] - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + dimX * dimZ] - u_upd[l + k]); // We need div from the next thread on last iter + + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); +// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + 
gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { + div_upd[l + k] += qx_upd[l + k]; + div_upd[l + k + dimZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { + div_upd[l + k] += qy_upd[l + k]; + div_upd[l + k + dimX * dimZ] = -qy_upd[l + k]; // We need to update div in the next thread on last iter +} + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
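+                    // Per-voxel primal residual of the PDHG iterate:
+                    //   resprimal += |(u - u_upd)/tau + (div - div_upd)|
+                    // It is accumulated only where div_upd is already complete;
+                    // the skipped border rows are re-scored in TNV_CPU_main
+                    // after the per-thread halves of div_upd are merged.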
+ resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + ctx->realY = ctx->stepY; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. 
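+ *
+ * A hypothetical call (sizes, lambda and tolerance below are illustrative
+ * only, not prescribed by this implementation):
+ *
+ *     int dimX = 256, dimY = 256, dimZ = 8;   // 8 channels
+ *     float *in  = calloc((size_t)dimX * dimY * dimZ, sizeof(float));
+ *     float *out = calloc((size_t)dimX * dimY * dimZ, sizeof(float));
+ *     // fill `in` with the noisy volume; `out` supplies the initial
+ *     // iterate and receives the regularised result
+ *     TNV_CPU_main(in, out, 1.0f, 1000, 1e-5f, dimX, dimY, dimZ);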
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (i = 1; i < tnv_ctx.threads; i++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (i - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + + l = ctx0->stepY * dimX * dimZ; + for(k=0; k < dimZ * (dimX /*- 1*/); k++) { + double div_upd_add = ctx0->div_upd[l + k]; + ctx->div_upd[k] += div_upd_add; + ctx0->div_upd[l + k] = ctx->div_upd[k]; + +// ctx0->u_upd[l + k] = ctx->u_upd[k]; + + float divdiff = ctx->div[k] - ctx->div_upd[k]; // Multiple steps... How we compute without history? 
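+                // Each border row was updated by two workers: ctx0 holds it
+                // as its halo row, ctx as its row 0.  The sum above combines
+                // the two partial divergences and mirrors the total back, so
+                // the residual below is evaluated on the completed div_upd.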
+                float udiff = ctx->u[k] - ctx->u_upd[k];
+                resprimal += fabs(divtau * udiff + divdiff);
+            }
+        }
+
+#define TNV_CHECK_RES
+#ifndef TNV_CHECK_RES
+        for (i = 0; i < tnv_ctx.threads; i++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        }
+#else
+        resprimal = 0;
+        float divsigma = 1.0f / sigma;
+        for(j=0; j<dimY; j++) for(i=0; i<dimX; i++) for(l=0; l<dimZ; l++) {
+            int thr, subj = j;
+            for (thr = 0; thr < tnv_ctx.threads; thr++) {
+                int size = tnv_ctx.thr_ctx[thr].stepY;
+                if (subj >= size) subj -= size;
+                else break;
+            }
+
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + thr;
+
+            int k = subj * dimX * dimZ + i * dimZ + l;
+
+            float udiff = ctx->u[k] - ctx->u_upd[k];
+            float qxdiff = ctx->qx[k] - ctx->qx_upd[k];
+            float qydiff = ctx->qy[k] - ctx->qy_upd[k];
+            float divdiff = ctx->div[k] - ctx->div_upd[k];
+            float gradxdiff = ctx->gradx[k] - ctx->gradx_upd[k];
+            float gradydiff = ctx->grady[k] - ctx->grady_upd[k];
+
+            resprimal += fabs(divtau*udiff + divdiff);
+            resdual += fabs(divsigma*qxdiff - gradxdiff);
+            resdual += fabs(divsigma*qydiff - gradydiff);
+
+            unorm += (udiff * udiff);
+            qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+            product += (gradxdiff * qxdiff + gradydiff * qydiff);
+        }
+#endif
+
+
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+        } else {
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v17 b/src/Core/performance_CPU/TNV_core.c.v17
new file mode 100755
index 00000000..60739cd6
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v17
@@ -0,0 +1,690 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void 
*data) {
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+
+//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+    // Auxiliary vectors
+    ctx->Input = malloc(Dim1Total * sizeof(float));
+    ctx->u = malloc(Dim1Total * sizeof(float));
+    ctx->u_upd = malloc(Dim1Total * sizeof(float));
+    ctx->qx = malloc(DimTotal * sizeof(float));
+    ctx->qy = malloc(DimTotal * sizeof(float));
+    ctx->qx_upd = malloc(DimTotal * sizeof(float));
+    ctx->qy_upd = malloc(DimTotal * sizeof(float));
+    ctx->gradx = malloc(DimTotal * sizeof(float));
+    ctx->grady = malloc(DimTotal * sizeof(float));
+    ctx->gradx_upd = malloc(DimTotal * sizeof(float));
+    ctx->grady_upd = malloc(DimTotal * sizeof(float));
+    ctx->div = malloc(Dim1Total * sizeof(float));
+    ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    // Auxiliary vectors
+    memcpy(ctx->u, ctx->u_upd, Dim1Total *
sizeof(float)); + memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); + memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); + memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); + memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); + memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + + return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ]; + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + div_upd[l] = 0; + } + } + + for(j = 0; j < stepY; j++) { +/* m = j * dimX * dimZ + (dimX - 1) * dimZ; + for(k = 0; k < dimZ; k++) { + u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; + }*/ + + for(i = 0; i < dimX/* - 1*/; i++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + + gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter + + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); +// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY 
+ j, u[l + k], u_upd[l + k], udiff[k], unorm); + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { + div_upd[l + k] += qx_upd[l + k]; + div_upd[l + k + padZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { + div_upd[l + k] += qy_upd[l + k]; + div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter +} + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
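+                    // The guard above excludes row 0 of every worker except
+                    // the first: its div_upd still lacks the qy contribution
+                    // of the previous worker's halo row, so its residual is
+                    // added by the host loop once the borders are merged.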
+ resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + tnv_ctx.padZ = dimZ; + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. 
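+ *
+ * Layout note: volumes arrive channel-major, InputT[k*dimX*dimY + j*dimX + i],
+ * and tnv_start repacks them per worker into voxel-major rows with padded
+ * channels, Input[(j*dimX + i)*padZ + k], so the innermost k-loops traverse
+ * contiguous (and, with padding, alignable) memory.  Indicative helpers, not
+ * part of this file:
+ *
+ *     #define IDX_CHANNEL_MAJOR(k, j, i) ((k)*dimX*dimY + (j)*dimX + (i))
+ *     #define IDX_VOXEL_MAJOR(j, i, k)   (((j)*dimX + (i))*padZ + (k))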
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float div_upd_add = ctx0->div_upd[m + l]; + ctx->div_upd[l] += div_upd_add; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + //ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history? 
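+                    // v17 still reconciles thread borders by summing the two
+                    // partial divergences and mirroring the result back; the
+                    // v18 kernel below instead rebuilds div_upd locally from
+                    // its qx_upd/qy_upd neighbours, which reduces this merge
+                    // to copying the one missing qy term across the boundary.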
+                    float udiff = ctx->u[l] - ctx->u_upd[l];
+                    resprimal += fabs(divtau * udiff + divdiff);
+                }
+            }
+        }
+
+        for (j = 0; j < tnv_ctx.threads; j++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        }
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+        } else {
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v18 b/src/Core/performance_CPU/TNV_core.c.v18
new file mode 100755
index 00000000..71924751
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v18
@@ -0,0 +1,688 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/2 of the memory consumption and ~4x performance
+ * This version is algorithmically compatible with the original, but a slight change in results
+ * is expected due to a different order of floating-point operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = 
(long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+    // Auxiliary vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+    ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    // Auxiliary vectors
+    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
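+    // All buffers except div_upd (a plain malloc) come from memalign(64, ...);
+    // if padZ is additionally rounded up to a multiple of 16 floats (64 bytes),
+    // every (j*dimX + i)*padZ row start stays 64-byte aligned, the
+    // precondition for the `#pragma vector aligned` hints that appear
+    // (commented out) in the update loops below.  Round-up idiom used in
+    // TNV_CPU_init:
+    //   padZ = 16 * ((dimZ / 16) + ((dimZ % 16) ? 1 : 0));
+    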
return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0; + float resdual = 0.0; + float product = 0.0; + float unorm = 0.0; + float qnorm = 0.0; + + float udiff[dimZ] __attribute__((aligned(64))); + float qxdiff __attribute__((aligned(64))); + float qydiff __attribute__((aligned(64))); + float divdiff __attribute__((aligned(64))); + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } + l = (j * dimX + i) * padZ; + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); + + gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef 
TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + + float div_upd_val = 0; + div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; + div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; + div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; + div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; + div_upd[l + k] = div_upd_val; + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? + resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } // j + } // i + } // j + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower + tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. 
Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); + if (!err) err = hw_sched_wait_task(sched); + 
if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v19 b/src/Core/performance_CPU/TNV_core.c.v19
new file mode 100755
index 00000000..9b19ed5c
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v19
@@ -0,0 +1,681 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/2 of the memory consumption and ~4x performance
+ * This version is algorithmically compatible with the original, but a slight
+ * change in results is expected due to a different order of floating-point
+ * operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
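+ *
+ * For reference, the primal-dual (PDHG) iteration implemented below is, in
+ * the notation of the code (a sketch, not a formal statement):
+ *   u_new = (u + tau*div(q) + tau*lambda*Input) / (1 + tau*lambda)
+ *   ubar  = (1 + theta)*u_new - theta*u
+ *   q_new = proj(q + sigma*grad(ubar))    (projection computed via coefF)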
+ */ + +#include +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = 
(long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+    // Auxiliary vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+    ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    // Double-buffer swap: promote the updated iterate to the current one
+    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
+    
return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l; + long i1, i2, j1, j2; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ] __attribute__((aligned(64))); + float qxdiff __attribute__((aligned(64))); + float qydiff __attribute__((aligned(64))); + float divdiff __attribute__((aligned(64))); + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_backtrack_loop.h" + } + } + } // i + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { 
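+        /* As in the blocks above, the first/last rows and columns are peeled
+         * off: defining TNV_LOOP_FIRST_I / TNV_LOOP_LAST_I (and the _J
+         * variants) before including TNV_core_backtrack_loop.h selects the
+         * boundary handling at compile time, so the hot interior loop stays
+         * free of per-pixel branches. */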
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_backtrack_loop.h"
+        }
+        i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+        }
+# undef TNV_LOOP_LAST_J
+    }
+
+
+    ctx->resprimal = resprimal;
+    ctx->resdual = resdual1 + resdual2;
+    ctx->product = product;
+    ctx->unorm = unorm;
+    ctx->qnorm = qnorm;
+
+    return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+    int i, off, size, err;
+
+    if (sched) return;
+
+    tnv_ctx.dimX = dimX;
+    tnv_ctx.dimY = dimY;
+    tnv_ctx.dimZ = dimZ;
+    // Padding seems actually slower
+    tnv_ctx.padZ = dimZ;
+//    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+    hw_sched_init();
+
+    int threads = hw_sched_get_cpu_count();
+    if (threads > dimY) threads = dimY/2;
+
+    int step = dimY / threads;
+    int extra = dimY % threads;
+
+    tnv_ctx.threads = threads;
+    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+    for (i = 0, off = 0; i < threads; i++, off += size) {
+        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+        size = step + ((i < extra)?1:0);
+
+        ctx->offY = off;
+        ctx->stepY = size;
+
+        if (i == (threads-1)) ctx->copY = ctx->stepY;
+        else ctx->copY = ctx->stepY + 1;
+    }
+
+    sched = hw_sched_create(threads);
+    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran, see
+ * "denoisingPDHG_ipol.cpp" in Joan's Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
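+ *
+ * Example call (hypothetical values, for illustration only):
+ *   float res = TNV_CPU_main(noisy, out, 1.0f, 1000, 1e-5f, 256, 256, 8);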
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } + + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - 
alpha);
+            alpha = alpha * eta;
+        } else if(resprimal < dual_div_delta) {
+            // Decrease primal step-size and increase dual step-size
+            tau = tau * (1.0f - alpha);
+            sigma = sigma / (1.0f - alpha);
+            alpha = alpha * eta;
+        }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v27 b/src/Core/performance_CPU/TNV_core.c.v27
new file mode 100755
index 00000000..dce414ac
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v27
@@ -0,0 +1,650 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/3 of the memory consumption and ~10x performance
+ * This version cannot perform back-tracking except during the first iterations.
+ * A warning is printed if back-tracking is required, and the slower version
+ * (TNV_core_backtrack.c) can be executed instead. It is still better than the
+ * original, with 1/2 of the memory consumption and a 4x performance gain.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
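+ *
+ * The memory saving comes from dropping the *_upd shadow buffers of the
+ * back-tracking variant: tnv_step below updates u, qx, qy, gradx, grady and
+ * div in place, keeping only a single row of history (div0, udiff0) to
+ * reconstruct the residuals at the thread boundary.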
+ */ + +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *qx, *qy, *gradx, *grady, *div; + float *div0, *udiff0; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*(stepY+1)*padZ); + long DimRow = (long)(dimX * padZ); + + // Auxiliar vectors + ctx->Input = 
memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+
+    ctx->div0 = memalign(64, DimRow * sizeof(float));
+    ctx->udiff0 = memalign(64, DimRow * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+    long i, j, k, l;
+
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    float *Input = ctx->Input;
+    float *u = ctx->u;
+    float *qx = ctx->qx;
+    float *qy = ctx->qy;
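+
+    /* Note for the residual accumulation below: because the buffers are
+     * updated in place, qxdiff/qydiff hold new - old (they are added to
+     * qx/qy), the opposite sign convention to the back-tracking variant.
+     * That is why resdual sums fabs(divsigma * qdiff + graddiff) and why
+     * product is accumulated with a negated sign. */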
+ float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + float ubarx[dimZ] __attribute__((aligned(64))); + float ubary[dimZ] __attribute__((aligned(64))); + float udiff[dimZ] __attribute__((aligned(64))); + + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + float udiff = u[l] - u_upd; + ctx->udiff0[l] = udiff; + ctx->div0[l] = div[l]; + } + } + + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } + l = (j * dimX + i) * padZ; + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd; + u[l + k] = u_upd; + + float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); + float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); + gradxdiff[k] = gradx[l + k] - gradx_upd; + gradydiff[k] = grady[l + k] - grady_upd; + gradx[l + k] = gradx_upd; + grady[l + k] = grady_upd; + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; + ubary[k] = grady_upd - theta * gradydiff[k]; + + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx[l + k] += qxdiff; + qy[l + k] += qydiff; + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + div_upd -= (i > 0)?qx[l + k - padZ]:0; + div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; + div_upd += (i < (dimX-1))?qx[l + k]:0; + div_upd += (j < (copY-1))?qy[l + k]:0; + divdiff = div[l + k] - div_upd; + div[l + k] = div_upd; + + resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; + resdual += fabs(divsigma * qxdiff + gradxdiff[k]); + resdual += fabs(divsigma * qydiff + gradydiff[k]); + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + + } // i + } // j + } // i + } // j + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void 
TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+    int i, off, size, err;
+
+    if (sched) return;
+
+    tnv_ctx.dimX = dimX;
+    tnv_ctx.dimY = dimY;
+    tnv_ctx.dimZ = dimZ;
+    // Padding seems actually slower
+//    tnv_ctx.padZ = dimZ;
+//    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
+    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+    hw_sched_init();
+
+    int threads = hw_sched_get_cpu_count();
+    if (threads > dimY) threads = dimY/2;
+
+    int step = dimY / threads;
+    int extra = dimY % threads;
+
+    tnv_ctx.threads = threads;
+    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+    for (i = 0, off = 0; i < threads; i++, off += size) {
+        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+        size = step + ((i < extra)?1:0);
+
+        ctx->offY = off;
+        ctx->stepY = size;
+
+        if (i == (threads-1)) ctx->copY = ctx->stepY;
+        else ctx->copY = ctx->stepY + 1;
+    }
+
+    sched = hw_sched_create(threads);
+    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran, see
+ * "denoisingPDHG_ipol.cpp" in Joan's Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
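+ *
+ * Step-size adaptation (balancing principle, as implemented in TNV_CPU_main):
+ *   b = (2 tau sigma <dgrad, dq>) / (gamma (sigma ||du||^2 + tau ||dq||^2))
+ * If b > 1 the step fails: tau and sigma are scaled by beta/b and the state
+ * is rolled back (only possible before the first accepted step in this
+ * version).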
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + int started = 0; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = (ctx0->stepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX * padZ] = ctx->div[l]; + ctx0->u[m + l + dimX * padZ] = ctx->u[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + if (started) { + fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. 
The standard TNV implementation is recommended instead.\n\n\n");
+            } else {
+                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+                if (!err) err = hw_sched_wait_task(sched);
+                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+            }
+        } else {
+            started = 1;
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+//    exit(-1);
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v32 b/src/Core/performance_CPU/TNV_core.c.v32
new file mode 100755
index 00000000..fcb2f878
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v32
@@ -0,0 +1,676 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
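+ *
+ * This variant corresponds to TNV_core.c.v27 with the per-pixel inner loop
+ * factored out into TNV_core_loop.h and the per-cell scratch arrays
+ * (gradxdiff, gradydiff, ubarx, ubary, udiff) hoisted into the thread context.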
+ */ + +#include +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *qx, *qy, *gradx, *grady, *div; + float *div0, *udiff0; + float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long 
DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+    long DimRow = (long)(dimX * padZ);
+    long DimCell = (long)(padZ);
+
+    // Auxiliary vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+
+    ctx->div0 = memalign(64, DimRow * sizeof(float));
+    ctx->udiff0 = memalign(64, DimRow * sizeof(float));
+
+    ctx->gradxdiff = memalign(64, DimCell * sizeof(float));
+    ctx->gradydiff = memalign(64, DimCell * sizeof(float));
+    ctx->ubarx = memalign(64, DimCell * sizeof(float));
+    ctx->ubary = memalign(64, DimCell * sizeof(float));
+    ctx->udiff = memalign(64, DimCell * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)||(!ctx->gradxdiff)||(!ctx->gradydiff)||(!ctx->ubarx)||(!ctx->ubary)||(!ctx->udiff)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    return 0;
+}
+
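+/* Illustrative note (not used by the patch): TNV_CPU_init below sets
+ * tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)), i.e. it rounds the
+ * channel count up to a multiple of 16 floats so that every (x,y) cell starts
+ * on a 64-byte boundary. An equivalent round-up, as a sketch:
+ *
+ *   static inline int round_up16(int n) { return 16 * ((n + 15) / 16); }
+ */
+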
+ +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float qxdiff; + float qydiff; + float divdiff; + float *gradxdiff = ctx->gradxdiff; + float *gradydiff = ctx->gradydiff; + float *ubarx = ctx->ubarx; + float *ubary = ctx->ubary; + float *udiff = ctx->udiff; + + float *udiff0 = ctx->udiff0; + float *div0 = ctx->div0; + +/* + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + float udiff_val = u[l] - u_upd; + udiff0[l] = udiff_val; + div0[l] = div[l]; + } + } +*/ + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + +#define BLOCK 32 + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } + } + } // i + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + + + + ctx->resprimal = resprimal; + ctx->resdual = resdual1 + resdual2; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if 
(threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + int started = 0; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = (ctx0->stepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + 
}
+
+        {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+            for(i = 0; i < dimX; i++) {
+                for(k = 0; k < dimZ; k++) {
+                    int l = i * padZ + k;
+
+                    float divdiff = ctx->div0[l] - ctx->div[l];
+                    float udiff = ctx->udiff0[l];
+                    resprimal += fabs(divtau * udiff + divdiff);
+                }
+            }
+        }
+
+        for (j = 0; j < tnv_ctx.threads; j++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        }
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            if (started) {
+                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of the iterative optimization! The fast version cannot do this; the standard TNV implementation is recommended instead.\n\n\n");
+            } else {
+                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+                if (!err) err = hw_sched_wait_task(sched);
+                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+            }
+        } else {
+            started = 1;
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v4.stdver b/src/Core/performance_CPU/TNV_core.c.v4.stdver
new file mode 100755
index 00000000..6be60ae6
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v4.stdver
@@ -0,0 +1,629 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
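+ *
+ * This file appears to be the reference implementation without the hw_sched
+ * thread decomposition, kept for comparison with the optimized variants above.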
+ */
+
+#include "TNV_core.h"
+
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+    int ii, num;
+    float divsigma = 1.0f / sigma;
+    float sum, shrinkfactor;
+    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+    float proj[2] = {0};
+
+    // Compute eigenvalues of M
+    T = M1 + M3;
+    D = M1 * M3 - M2 * M2;
+    det = sqrt(MAX((T * T / 4.0f) - D, 0.0f));
+    eig1 = MAX((T / 2.0f) + det, 0.0f);
+    eig2 = MAX((T / 2.0f) - det, 0.0f);
+    sig1 = sqrt(eig1);
+    sig2 = sqrt(eig2);
+
+    // Compute normalized eigenvectors
+    V1 = V2 = V3 = V4 = 0.0f;
+
+    if(M2 != 0.0f)
+    {
+        v0 = M2;
+        v1 = eig1 - M3;
+        v2 = eig2 - M3;
+
+        mu1 = sqrtf(v0 * v0 + v1 * v1);
+        mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+        if(mu1 > fTiny)
+        {
+            V1 = v1 / mu1;
+            V3 = v0 / mu1;
+        }
+
+        if(mu2 > fTiny)
+        {
+            V2 = v2 / mu2;
+            V4 = v0 / mu2;
+        }
+
+    } else
+    {
+        if(M1 > M3)
+        {
+            V1 = V4 = 1.0f;
+            V2 = V3 = 0.0f;
+
+        } else
+        {
+            V1 = V4 = 0.0f;
+            V2 = V3 = 1.0f;
+        }
+    }
+
+    // Compute prox_p of the diagonal entries
+    sig1_upd = sig2_upd = 0.0f;
+
+    if(p == 1)
+    {
+        sig1_upd = MAX(sig1 - divsigma, 0.0f);
+        sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+    } else if(p == INFNORM)
+    {
+        proj[0] = sigma * fabs(sig1);
+        proj[1] = sigma * fabs(sig2);
+
+        /*l1 projection part */
+        sum = fLarge;
+        num = 0l;
+        shrinkfactor = 0.0f;
+        while(sum > 1.0f)
+        {
+            sum = 0.0f;
+            num = 0;
+
+            for(ii = 0; ii < 2; ii++)
+            {
+                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+                sum += fabs(proj[ii]);
+                if(proj[ii]!= 0.0f)
+                    num++;
+            }
+
+            if(num > 0)
+                shrinkfactor = (sum - 1.0f) / num;
+            else
+                break;
+        }
+        /*l1 proj ends*/
+
+        sig1_upd = sig1 - divsigma * proj[0];
+        sig2_upd = sig2 - divsigma * proj[1];
+    }
+
+    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+    if(sig1 > fTiny)
+        sig1_upd /= sig1;
+
+    if(sig2 > fTiny)
+        sig2_upd /= sig2;
+
+    // Compute solution
+    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran, see
+ * "denoisingPDHG_ipol.cpp" in Joan's Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
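+ *
+ * Illustrative call (added for clarity; the array names and values below are
+ * made up for this example, and the data is laid out channel-major, i.e. as a
+ * [dimZ][dimY][dimX] float array):
+ *     TNV_CPU_main(noisy, denoised, 1.0f, 100, 1e-5f, 256, 256, 3);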
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+    long i, j, k, p, q, r, DimTotal;
+    float taulambda;
+    float *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd, *div, *div_upd;
+
+    p = 1l;
+    q = 1l;
+    r = 0l;
+
+    lambda = 1.0f/(2.0f*lambda);
+    DimTotal = (long)(dimX*dimY*dimZ);
+    /* PDHG algorithm parameters*/
+    float tau = 0.5f;
+    float sigma = 0.5f;
+    float theta = 1.0f;
+
+    // Auxiliary vectors
+    u_upd = calloc(DimTotal, sizeof(float));
+    qx = calloc(DimTotal, sizeof(float));
+    qy = calloc(DimTotal, sizeof(float));
+    qx_upd = calloc(DimTotal, sizeof(float));
+    qy_upd = calloc(DimTotal, sizeof(float));
+    gradx = calloc(DimTotal, sizeof(float));
+    grady = calloc(DimTotal, sizeof(float));
+    gradx_upd = calloc(DimTotal, sizeof(float));
+    grady_upd = calloc(DimTotal, sizeof(float));
+    div = calloc(DimTotal, sizeof(float));
+    div_upd = calloc(DimTotal, sizeof(float));
+
+
+    float *Input = calloc(DimTotal, sizeof(float));
+    float *u = calloc(DimTotal, sizeof(float));
+    for(k=0; k 1)
+    {
+        // Decrease step-sizes to fit balancing principle
+        tau = (beta * tau) / b;
+        sigma = (beta * sigma) / b;
+        alpha = alpha0;
+
+        copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+    } else if(resprimal > dual_dot_delta)
+    {
+        // Increase primal step-size and decrease dual step-size
+        tau = tau / (1.0f - alpha);
+        sigma = sigma * (1.0f - alpha);
+        alpha = alpha * eta;
+
+    } else if(resprimal < dual_div_delta)
+    {
+        // Decrease primal step-size and increase dual step-size
+        tau = tau * (1.0f - alpha);
+        sigma = sigma / (1.0f - alpha);
+        alpha = alpha * eta;
+    }
+
+    // Update variables
+    taulambda = tau * lambda;
+    //for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k];
+
+    divsigma = 1.0f / sigma;
+    divtau = 1.0f / tau;
+
+    copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+    // Compute residual at current iteration
+    residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+
+    //    printf("%f \n", residual);
+    if (residual < tol) {
+        printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+        break;
+    }
+
+    }
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+
+    free(u_upd);
+    free(qx);
+    free(qy);
+    free(qx_upd);
+    free(qy_upd);
+    free(gradx);
+    free(grady);
+    free(gradx_upd);
+    free(grady_upd);
+    free(div);
+    free(div_upd);
+    printf("Return: %f\n", *u);
+
+    for(k=0; k fTiny)
+    {
+        V1 = v1 / mu1;
+        V3 = v0 / mu1;
+    }
+
+    if(mu2 > fTiny)
+    {
+        V2 = v2 / mu2;
+        V4 = v0 / mu2;
+    }
+
+    } else
+    {
+        if(M1 > M3)
+        {
+            V1 = V4 = 1.0f;
+            V2 = V3 = 0.0f;
+
+        } else
+        {
+            V1 = V4 = 0.0f;
+            V2 = V3 = 1.0f;
+        }
+    }
+
+    // Compute prox_p of the diagonal entries
+    sig1_upd = sig2_upd = 0.0f;
+
+    if(p == 1)
+    {
+        sig1_upd = MAX(sig1 - divsigma, 0.0f);
+        sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+    } else if(p == INFNORM)
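+    // (Descriptive note, added for clarity: p selects the prox operator
+    // applied to the singular values. The p == 1 branch above soft-thresholds
+    // them, matching the nuclear norm, while the INFNORM branch below
+    // projects them onto the l1 ball, the dual of the spectral norm.)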
+    {
+        proj[0] = sigma * fabs(sig1);
+        proj[1] = sigma * fabs(sig2);
+
+        /*l1 projection part */
+        sum = fLarge;
+        num = 0l;
+        shrinkfactor = 0.0f;
+        while(sum > 1.0f)
+        {
+            sum = 0.0f;
+            num = 0;
+
+            for(ii = 0; ii < 2; ii++)
+            {
+                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+                sum += fabs(proj[ii]);
+                if(proj[ii]!= 0.0f)
+                    num++;
+            }
+
+            if(num > 0)
+                shrinkfactor = (sum - 1.0f) / num;
+            else
+                break;
+        }
+        /*l1 proj ends*/
+
+        sig1_upd = sig1 - divsigma * proj[0];
+        sig2_upd = sig2 - divsigma * proj[1];
+    }
+
+    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+    if(sig1 > fTiny)
+        sig1_upd /= sig1;
+
+    if(sig2 > fTiny)
+        sig2_upd /= sig2;
+
+    // Compute solution
+    t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+    t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+    t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+
+    for(k = 0; k < dimZ; k++)
+    {
+        gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2;
+        gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3;
+    }
+
+    // Delete allocated memory
+    free(proj);
+    }
+    }
+
+    return 1;
+}
+
+float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ)
+{
+    long i, j, k, l;
+    #pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l)
+    for(k = 0; k < dimZ; k++) {
+        for(j = 0; j < dimY; j++) {
+            l = j * dimX;
+            for(i = 0; i < dimX; i++) {
+                if(i != dimX-1)
+                {
+                    // ux[k][i+l] = u[k][i+1+l] - u[k][i+l]
+                    div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l];
+                    div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l];
+                }
+
+                if(j != dimY-1)
+                {
+                    // uy[k][i+l] = u[k][i+width+l] - u[k][i+l]
+                    //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l];
+                    div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l];
+                    div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l];
+                }
+            }
+        }
+    }
+    return *div_upd;
+}
diff --git a/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19
new file mode 100644
index 00000000..3ec42509
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19
@@ -0,0 +1,100 @@
+    float t[3];
+    float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+    l = (j * dimX + i) * padZ;
+
+//#pragma vector aligned
+#pragma GCC ivdep
+    for(k = 0; k < dimZ; k++) {
+        u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+        udiff[k] = u[l + k] - u_upd[l + k];
+        unorm += (udiff[k] * udiff[k]);
+
+#ifdef TNV_LOOP_LAST_I
+        gradx_upd[l + k] = 0;
+#else
+        gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+        grady_upd[l + k] = 0;
+#else
+        grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
+#endif
+
+        gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+        gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+        float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+        float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+        qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+        qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+        float vx = divsigma * qx_upd[l + k]; //+ ubarx
+        float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+        float vx = ubarx + divsigma * qx[l + k];
+        float vy = ubary + divsigma * qy[l + k];
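+        // (Descriptive note, added for clarity: vx/vy is the per-channel
+        // gradient estimate shifted by the scaled dual variable. The
+        // accumulation just below builds the 2x2 Gram matrix
+        // [M1 M2; M2 M3] = sum_k (vx,vy)(vx,vy)^T over the channels, which
+        // coefF() eigendecomposes to obtain the shrink coefficients t[0..2].)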
+#endif
+
+        M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+    }
+
+    coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+    for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+        float vx = divsigma * qx_upd[l + k];
+        float vy = divsigma * qy_upd[l + k];
+
+        float gx_upd = vx * t[0] + vy * t[1];
+        float gy_upd = vx * t[1] + vy * t[2];
+
+        qx_upd[l + k] -= sigma * gx_upd;
+        qy_upd[l + k] -= sigma * gy_upd;
+#else
+        float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+        float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+        float vx = ubarx + divsigma * qx[l + k];
+        float vy = ubary + divsigma * qy[l + k];
+
+        float gx_upd = vx * t[0] + vy * t[1];
+        float gy_upd = vx * t[1] + vy * t[2];
+
+        qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+        qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+        float div_upd_val = 0;
+#ifndef TNV_LOOP_FIRST_I
+        div_upd_val -= qx_upd[l + k - padZ];
+#endif
+
+#ifndef TNV_LOOP_FIRST_J
+        div_upd_val -= qy_upd[l + k - dimX * padZ];
+#endif
+#ifndef TNV_LOOP_LAST_I
+        div_upd_val += qx_upd[l + k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+        div_upd_val += qy_upd[l + k];
+#endif
+        div_upd[l + k] = div_upd_val;
+
+        qxdiff = qx[l + k] - qx_upd[l + k];
+        qydiff = qy[l + k] - qy_upd[l + k];
+        qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+        resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]);
+        resdual2 += fabs(divsigma * qydiff - gradydiff[k]);
+        product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+#ifndef TNV_LOOP_FIRST_J
+        divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How do we compute this without history?
+        resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+    }
diff --git a/src/Core/performance_CPU/TNV_core_loop.h.v32 b/src/Core/performance_CPU/TNV_core_loop.h.v32
new file mode 100644
index 00000000..be531566
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core_loop.h.v32
@@ -0,0 +1,119 @@
+    {
+        float t[3];
+        float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+        l = (j * dimX + i) * padZ;
+        m = dimX * padZ;
+
+        float *__restrict u_next = u + l + padZ;
+        float *__restrict u_current = u + l;
+        float *__restrict u_next_row = u + l + m;
+
+
+        float *__restrict qx_current = qx + l;
+        float *__restrict qy_current = qy + l;
+        float *__restrict qx_prev = qx + l - padZ;
+        float *__restrict qy_prev = qy + l - m;
+
+
+        __assume(padZ%16==0);
+/*
+        __assume_aligned(Input, 64);
+        __assume_aligned(div, 64);
+        __assume_aligned(gradx, 64);
+        __assume_aligned(grady, 64);
+        __assume_aligned(u, 64);
+        __assume_aligned(qx, 64);
+        __assume_aligned(qy, 64);
+        __assume_aligned(u_current, 64);
+        __assume_aligned(u_next, 64);
+        __assume_aligned(u_next_row, 64);
+*/
+
+#pragma vector aligned
+#pragma GCC ivdep
+        for(k = 0; k < dimZ; k++) {
+            float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
+            udiff[k] = u[l + k] - u_upd; // cache 1w
+            u[l + k] = u_upd; // 1 write
+
+#ifdef TNV_LOOP_FIRST_J
+            udiff0[l + k] = udiff[k];
+            div0[l + k] = div[l + k];
+#endif
+
+#ifdef TNV_LOOP_LAST_I
+            float gradx_upd = 0;
+#else
+            float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
+            float gradx_upd = (u_next_upd - u_upd); // 2 reads
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+            float grady_upd = 0;
+#else
+            float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
+            float grady_upd = (u_next_row_upd - u_upd); // 1 read
+#endif
+
+            gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
+            gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
+            gradx[l + k] = gradx_upd; // 1 write
+            grady[l + k] = grady_upd; // 1 write
+
+            ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
+            ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
+
+            float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
+            float vy = ubary[k] + divsigma * qy[l + k]; // 1 read
+
+            M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+        }
+
+        coefF(t, M1, M2, M3, sigma, p, q, r);
+
+#pragma vector aligned
+#pragma GCC ivdep
+        for(k = 0; k < padZ; k++) {
+            float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
+            float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
+            float gx_upd = vx * t[0] + vy * t[1];
+            float gy_upd = vx * t[1] + vy * t[2];
+
+            qxdiff = sigma * (ubarx[k] - gx_upd);
+            qydiff = sigma * (ubary[k] - gy_upd);
+
+            qx_current[k] += qxdiff; // write 1
+            qy_current[k] += qydiff; // write 1
+
+            unorm += (udiff[k] * udiff[k]);
+            qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+            float div_upd = 0;
+
+#ifndef TNV_LOOP_FIRST_I
+            div_upd -= qx_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_FIRST_J
+            div_upd -= qy_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_LAST_I
+            div_upd += qx_current[k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+            div_upd += qy_current[k];
+#endif
+
+            divdiff = div[l + k] - div_upd; // 1 read
+            div[l + k] = div_upd; // 1 write
+
+#ifndef TNV_LOOP_FIRST_J
+            resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+
+            // We need two independent accumulators to allow gcc auto-vectorization
+            resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
+            resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
+            product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+        }
+    }
\ No newline at end of file
From f59a2ea520ba9fb453582b9a4246f3f8e5257474 Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan"
Date: Sun, 29 Mar 2020 23:31:17 +0200
Subject: [PATCH 07/10] Remove verification output

---
 src/Core/regularisers_CPU/TNV_core.c           | 4 ++--
 src/Core/regularisers_CPU/TNV_core_backtrack.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c
index 7b8023bc..21f228d5 100755
--- a/src/Core/regularisers_CPU/TNV_core.c
+++ b/src/Core/regularisers_CPU/TNV_core.c
@@ -617,7 +617,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
         float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
         float dual_dot_delta = resdual * s * delta;
         float dual_div_delta = (resdual * s) / delta;
-        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+//        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
@@ -658,7 +658,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to

     printf("Iterations stopped at %i with the residual %f \n", iter, residual);
-    printf("Return: %f\n", *uT);
+//    printf("Return: %f\n", *uT);

     return *uT;
 }
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c
index 7eb367e3..f166f5be 100755
--- a/src/Core/regularisers_CPU/TNV_core_backtrack.c
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c
@@ -636,7 +636,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
         float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
         float dual_dot_delta = resdual * s * delta;
         float dual_div_delta = (resdual * s) / delta;
-        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+//        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
@@ -675,7 +675,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to

     printf("Iterations stopped at %i with the residual %f \n", iter, residual);
-    printf("Return: %f\n", *uT);
+//    printf("Return: %f\n", *uT);

     return *uT;
 }
From a08cd34eed5620e810dab60072237d462969cba5 Mon Sep 17 00:00:00 2001
From: Edoardo Pasca
Date: Sun, 5 Apr 2020 08:07:28 +0100
Subject: [PATCH 08/10] initial integration in CMake

---
 src/Core/CMakeLists.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 35d5bb41..e639e1f2 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -60,7 +60,7 @@ message("Adding regularisers as a shared library")
 #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp")
 #set(CMAKE_C_COMPILER gcc-9)
-set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp")
 #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
 #set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
 #set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
@@ -81,6 +81,7 @@ add_library(cilreg SHARED
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c
+  #${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core_fast.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c
@@ -91,9 +92,9 @@ target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LI
 include_directories(cilreg PUBLIC
   ${GLIB2_INCLUDE_DIR}
   ${LIBRARY_INC}/include
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/
-  ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/ )
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/
+  ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/ )

 ## Install
From 10c3df58c5f4c0649c1c23607f430f8a420b6f6f Mon Sep 17 00:00:00 2001
From: Edoardo Pasca
Date: Mon, 6 Apr 2020 13:44:06 +0100
Subject: [PATCH 09/10] fix build with conda packages

---
 CMakeLists.txt        | 19 +++++++++++++++++++
 cmake/FindGLIB2.cmake | 11 +++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index afbff2da..0471a017 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,5 +56,24 @@ endif()
 message(STATUS "Python wrappers will be installed in " ${PYTHON_DEST})
+
+# include(ExternalProject)
+# ExternalProject_Add(
+#   GLIB2
+#   #URL http://download.dre.vanderbilt.edu/previous_versions/ACE-${ACE_VERSION}.zip
+#   GIT_REPOSITORY https://github.com/GNOME/glib.git
+#   GIT_TAG 2.62.6
+#   CONFIGURE_COMMAND meson _build
+#   BUILD_COMMAND ninja -C _build
+#   INSTALL_COMMAND ninja -C _build install
+#   LOG_INSTALL 1
+#   BINARY_DIR ${CMAKE_BINARY_DIR}/build/
+#   INSTALL_DIR ${LIBRARY_DIR}
+#   STAMP_DIR ${CMAKE_BINARY_DIR}/stamp
+#   TMP_DIR ${CMAKE_BINARY_DIR}/tmp
+#   DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/download
+#   SOURCE_DIR ${CMAKE_BINARY_DIR}/source
+# )
+
 #add_subdirectory(src/Core)
 add_subdirectory(src)
diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake
index 2940d9f2..75297ab6 100644
--- a/cmake/FindGLIB2.cmake
+++ b/cmake/FindGLIB2.cmake
@@ -7,15 +7,18 @@
 # GLIB2_LIBRARIES - glib2 library

 set(GLIB2_DIR GLIB2_DIR-NOTFOUND CACHE PATH "Location of GLIB2 package")
-
-if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+message(STATUS "GLIB2_INCLUDE_DIR > ${GLIB2_INCLUDE_DIR} <")
+message(STATUS "GLIB2_LIBRARIES > ${GLIB2_LIBRARIES} <")
+if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARY_DIR)
   # Already in cache, be silent
+  message(STATUS "Should have variables set")
   set(GLIB2_FIND_QUIETLY TRUE)
-endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+  set (GLIB2_DIR ${GLIB2_LIBRARY_DIR})
+endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARY_DIR)

 if (GLIB2_DIR)
   set(PKG_GLIB_LIBRARY_DIRS ${GLIB2_DIR}/lib${CMAKE_BUILD_ARCH} ${GLIB2_DIR}/lib)
-  set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/)
+  set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/ ${GLIB2_DIR}/lib/glib-2.0/include)
 else (GLIB2_DIR)
   if (NOT WIN32)
     find_package(PkgConfig REQUIRED)
From 8589ac62236b352a0ac3f4ff6cb8f5f824a65fdb Mon Sep 17 00:00:00 2001
From: Edoardo Pasca
Date: Mon, 6 Apr 2020 14:07:35 +0100
Subject: [PATCH 10/10] fix linking and conda recipe

---
 recipe/build.sh         | 2 +-
 recipe/meta.yaml        | 2 ++
 src/Core/CMakeLists.txt | 9 +++++----
 3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/recipe/build.sh b/recipe/build.sh
index 6ba7e275..a5debcd9 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -3,6 +3,6 @@
 cp -rv "$RECIPE_DIR/../test" "$SRC_DIR/"
 cd $SRC_DIR

-cmake -G "Unix Makefiles" $RECIPE_DIR/../ -DBUILD_PYTHON_WRAPPER=ON -DCONDA_BUILD=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE="Release" -DLIBRARY_LIB=$CONDA_PREFIX/lib -DLIBRARY_INC=$CONDA_PREFIX -DCMAKE_INSTALL_PREFIX=$PREFIX
+cmake -G "Unix Makefiles" $RECIPE_DIR/../ -DBUILD_PYTHON_WRAPPER=ON -DCONDA_BUILD=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE="Release" -DLIBRARY_LIB=$CONDA_PREFIX/lib -DLIBRARY_INC=$CONDA_PREFIX -DCMAKE_INSTALL_PREFIX=$PREFIX -DGLIB2_INCLUDE_DIR=$CONDA_PREFIX -DGLIB2_LIBRARY_DIR=$CONDA_PREFIX

 make install
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 10901029..7d805948 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -28,6 +28,7 @@ requirements:
     - vc 9 # [win and py27]
     - cmake
     - ripgrep
+    - glib # [unix]

   run:
     - {{ pin_compatible('numpy', max_pin='x.x') }}
@@ -37,6 +38,7 @@ requirements:
     - vc 14 # [win and py35]
     - vc 9 # [win and py27]
     - libgcc-ng # [unix]
+    - glib # [unix]

 about:
   home: http://www.ccpi.ac.uk
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index e639e1f2..2295ea6e 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -18,11 +18,12 @@ message("Creating Regularisers as a shared library")
 find_package(GLIB2 REQUIRED)
 find_package(OpenMP REQUIRED)
 if (OPENMP_FOUND)
+  message(STATUS "OpenMP_C_FLAGS ${OpenMP_C_FLAGS}")
   set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
-  set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
-  set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+  # set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+  # set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+  # set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
 endif()

 message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
@@ -45,7 +46,7 @@
 elseif(UNIX)
   set(EXTRA_LIBRARIES "m")
 endif()
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
-set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}")
 message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")

 ## Build the regularisers package as a library
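For reference, the adaptive step-size logic that both TNV variants in this series share can be read in isolation. The sketch below restates it as a standalone helper: the function name and signature are invented for this illustration, while the update rules and the parameter names (tau, sigma, alpha, beta, gamma, delta, eta, s) are taken from TNV_CPU_main above.

/* Hypothetical helper (not part of the patch): one balancing-principle
 * update of the PDHG step sizes, as performed once per outer iteration. */
static void pdhg_adapt_steps(float *tau, float *sigma, float *alpha,
                             float alpha0, float beta, float gamma,
                             float delta, float eta, float s,
                             float resprimal, float resdual,
                             float product, float unorm, float qnorm)
{
    /* b > 1 signals that the proximal backtracking condition is violated. */
    float b = (2.0f * (*tau) * (*sigma) * product)
            / (gamma * (*sigma) * unorm + gamma * (*tau) * qnorm);

    if (b > 1.0f) {
        /* Shrink both step sizes and reset alpha; the caller must also
         * restore the previous iterate (the actual "back-track"). */
        *tau   = (beta * (*tau)) / b;
        *sigma = (beta * (*sigma)) / b;
        *alpha = alpha0;
    } else if (resprimal > resdual * s * delta) {
        /* Primal residual dominates: larger primal, smaller dual steps. */
        *tau   /= (1.0f - *alpha);
        *sigma *= (1.0f - *alpha);
        *alpha *= eta;
    } else if (resprimal < (resdual * s) / delta) {
        /* Dual residual dominates: smaller primal, larger dual steps. */
        *tau   *= (1.0f - *alpha);
        *sigma /= (1.0f - *alpha);
        *alpha *= eta;
    }
}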