From 324411a3e033c7f05b77e3bff829b55d674438da Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Fri, 27 Mar 2020 20:15:30 +0100 Subject: [PATCH 01/10] Optimized TNV routine (10x performance, 1/3 memcory consumption) --- CMakeLists.txt | 1 + cmake/FindGLIB2.cmake | 54 + src/Core/CMakeLists.txt | 19 +- src/Core/regularisers_CPU/TNV_core.c | 948 +++++++++++------- .../regularisers_CPU/TNV_core_backtrack.c | 695 +++++++++++++ src/Core/regularisers_CPU/hw_sched.c | 396 ++++++++ src/Core/regularisers_CPU/hw_sched.h | 151 +++ src/Core/regularisers_CPU/hw_thread.c | 141 +++ src/Core/regularisers_CPU/hw_thread.h | 76 ++ 9 files changed, 2097 insertions(+), 384 deletions(-) create mode 100644 cmake/FindGLIB2.cmake create mode 100755 src/Core/regularisers_CPU/TNV_core_backtrack.c create mode 100644 src/Core/regularisers_CPU/hw_sched.c create mode 100644 src/Core/regularisers_CPU/hw_sched.h create mode 100644 src/Core/regularisers_CPU/hw_thread.c create mode 100644 src/Core/regularisers_CPU/hw_thread.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 043f13c5..afbff2da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,7 @@ # limitations under the License. cmake_minimum_required (VERSION 3.0) +set(CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake" ${CMAKE_MODULE_PATH}) project(RGL) #https://stackoverflow.com/questions/13298504/using-cmake-with-setup-py diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake new file mode 100644 index 00000000..2940d9f2 --- /dev/null +++ b/cmake/FindGLIB2.cmake @@ -0,0 +1,54 @@ +# - Try to find the GLIB2 libraries +# Once done this will define +# +# GLIB2_FOUND - system has glib2 +# GLIB2_DIR - path to the glib2 base directory +# GLIB2_INCLUDE_DIR - the glib2 include directory +# GLIB2_LIBRARIES - glib2 library + +set(GLIB2_DIR GLIB2_DIR-NOTFOUND CACHE PATH "Location of GLIB2 package") + +if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + # Already in cache, be silent + set(GLIB2_FIND_QUIETLY TRUE) +endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES) + +if (GLIB2_DIR) + set(PKG_GLIB_LIBRARY_DIRS ${GLIB2_DIR}/lib${CMAKE_BUILD_ARCH} ${GLIB2_DIR}/lib) + set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/) +else (GLIB2_DIR) + if (NOT WIN32) + find_package(PkgConfig REQUIRED) + pkg_check_modules(PKG_GLIB REQUIRED glib-2.0) + endif(NOT WIN32) +endif (GLIB2_DIR) + +find_path(GLIB2_MAIN_INCLUDE_DIR glib.h + PATH_SUFFIXES glib-2.0 + PATHS ${PKG_GLIB_INCLUDE_DIRS} ) + +# search the glibconfig.h include dir under the same root where the library is found +find_library(GLIB2_LIBRARIES + NAMES glib-2.0 + PATHS ${PKG_GLIB_LIBRARY_DIRS} ) + +find_library(GTHREAD2_LIBRARIES + NAMES gthread-2.0 + PATHS ${PKG_GLIB_LIBRARY_DIRS} ) + +find_path(GLIB2_INTERNAL_INCLUDE_DIR glibconfig.h + PATH_SUFFIXES glib-2.0/include + PATHS ${PKG_GLIB_INCLUDE_DIRS} ${PKG_GLIB_LIBRARY_DIRS} ${CMAKE_SYSTEM_LIBRARY_PATH}) + +set(GLIB2_INCLUDE_DIR ${GLIB2_MAIN_INCLUDE_DIR}) + +# not sure if this include dir is optional or required +# for now it is optional +if(GLIB2_INTERNAL_INCLUDE_DIR) + set(GLIB2_INCLUDE_DIR ${GLIB2_INCLUDE_DIR} ${GLIB2_INTERNAL_INCLUDE_DIR}) +endif(GLIB2_INTERNAL_INCLUDE_DIR) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(GLIB2 DEFAULT_MSG GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_MAIN_INCLUDE_DIR) + +mark_as_advanced(GLIB2_INCLUDE_DIR GLIB2_LIBRARIES GTHREAD2_LIBRARIES GLIB2_INTERNAL_INCLUDE_DIR GLIB2_MAIN_INCLUDE_DIR) diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 4e43b5e6..10b16f30 100644 --- a/src/Core/CMakeLists.txt +++ 
b/src/Core/CMakeLists.txt @@ -15,6 +15,7 @@ message("CIL_VERSION ${CIL_VERSION}") ## Build the regularisers package as a library message("Creating Regularisers as a shared library") +find_package(GLIB2 REQUIRED) find_package(OpenMP REQUIRED) if (OPENMP_FOUND) set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") @@ -24,8 +25,8 @@ if (OPENMP_FOUND) set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}") endif() -message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") -message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS}") +message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer") +message("CMAKE_C_FLAGS ${CMAKE_C_FLAGS} -fno-omit-frame-pointer") message("CMAKE_EXE_LINKER_FLAGS ${CMAKE_EXE_LINKER_FLAGS}") message("CMAKE_SHARED_LINKER_FLAGS ${CMAKE_SHARED_LINKER_FLAGS}") message("CMAKE_STATIC_LINKER_FLAGS ${CMAKE_STATIC_LINKER_FLAGS}") @@ -58,10 +59,17 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") ## Build the regularisers package as a library message("Adding regularisers as a shared library") -#set(CMAKE_C_COMPILER /apps/pgi/linux86-64/17.4/bin/pgcc) +#set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") +set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") + #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") #set(CMAKE_C_FLAGS "-acc -Minfo -ta=multicore -openmp -fPIC") add_library(cilreg SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_sched.c + ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/hw_thread.c ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_TV_core.c ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/SB_TV_core.c ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PD_TV_core.c @@ -78,8 +86,9 @@ add_library(cilreg SHARED ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/Diffusion_Inpaint_core.c ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/NonlocalMarching_Inpaint_core.c ) -target_link_libraries(cilreg ${EXTRA_LIBRARIES} ) +target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LIBRARIES} ) include_directories(cilreg PUBLIC + ${GLIB2_INCLUDE_DIR} ${LIBRARY_INC}/include ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ @@ -105,7 +114,7 @@ endif() # GPU Regularisers -if (BUILD_CUDA) +if (BUILD_CUDA1) find_package(CUDA) if (CUDA_FOUND) set(CUDA_NVCC_FLAGS "-Xcompiler -fPIC -shared -D_FORCE_INLINES") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index 753cc5f3..be7fdef7 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,6 +5,12 @@ * * Copyright 2017 Daniil Kazantsev * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/3 of memory consumption and ~10x performance + * This version is not able to perform back-track except during first iterations + * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c) + * could be executed instead. 
It still better than original with 1/2 of memory consumption and 4x performance gain * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +25,467 @@ #include "TNV_core.h" +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *qx, *qy, *gradx, *grady, *div; + float *div0, *udiff0, *udiff; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + free(ctx->udiff); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i 
%i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*(stepY+1)*padZ); + long DimRow = (long)(dimX * padZ); + + // Auxiliar vectors + ctx->Input = malloc(Dim1Total * sizeof(float)); + ctx->u = malloc(Dim1Total * sizeof(float)); + ctx->qx = malloc(DimTotal * sizeof(float)); + ctx->qy = malloc(DimTotal * sizeof(float)); + ctx->gradx = malloc(DimTotal * sizeof(float)); + ctx->grady = malloc(DimTotal * sizeof(float)); + ctx->div = malloc(Dim1Total * sizeof(float)); + + ctx->div0 = malloc(DimRow * sizeof(float)); + ctx->udiff0 = malloc(DimRow * sizeof(float)); + ctx->udiff = malloc(DimRow * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { + fprintf(stderr, "Error allocating memory\n"); + exit(-1); + } + + return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + +// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(float)); + memset(ctx->qx, 0, DimTotal * sizeof(float)); + memset(ctx->qy, 0, DimTotal * sizeof(float)); + memset(ctx->gradx, 0, DimTotal * sizeof(float)); + memset(ctx->grady, 0, DimTotal * sizeof(float)); + memset(ctx->div, 0, Dim1Total * sizeof(float)); + + for(k=0; kInput[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; + ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; + } + } + } + + return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + for(k=0; kuT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; + } + } + } + + return 0; +} + + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(float)); + memset(ctx->qx, 0, DimTotal * sizeof(float)); + memset(ctx->qy, 0, DimTotal * sizeof(float)); + memset(ctx->gradx, 0, DimTotal * sizeof(float)); + memset(ctx->grady, 0, DimTotal * sizeof(float)); + memset(ctx->div, 0, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int 
dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + float ubarx[dimZ]; + float ubary[dimZ]; + float udiff_next[dimZ]; + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + float udiff = u[l] - u_upd; + ctx->udiff[l] = udiff; + ctx->udiff0[l] = udiff; + ctx->div0[l] = div[l]; + u[l] = u_upd; + } + } + + for(j = 0; j < stepY; j++) { + for(i = 0; i < dimX; i++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + udiff_next[k] = u[l + k + m] - u_upd; + u[l + k + m] = u_upd; + + float gradx_upd = (i == (dimX - 1))?0:(u[l + k + padZ] - u[l + k]); + float grady_upd = (j == (copY - 1))?0:(u[l + k + m] - u[l + k]); + gradxdiff[k] = gradx[l + k] - gradx_upd; + gradydiff[k] = grady[l + k] - grady_upd; + gradx[l + k] = gradx_upd; + grady[l + k] = grady_upd; + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; + ubary[k] = grady_upd - theta * gradydiff[k]; + + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx[l + k] += qxdiff; + qy[l + k] += qydiff; + + float udiff = ctx->udiff[i * padZ + k]; + ctx->udiff[i * padZ + k] = udiff_next[k]; + unorm += (udiff * udiff); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + div_upd -= (i > 0)?qx[l + k - padZ]:0; + div_upd -= (j > 0)?qy[l + k - m]:0; + div_upd += (i < (dimX-1))?qx[l + k]:0; + div_upd += (j < (copY-1))?qy[l + k]:0; + divdiff = div[l + k] - div_upd; + div[l + k] = div_upd; + + resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff + divdiff):0; + resdual += fabs(divsigma * qxdiff + gradxdiff[k]); + resdual += fabs(divsigma * qydiff + gradydiff[k]); + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + + } // i + } // j + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = 
dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + tnv_ctx.padZ = dimZ; + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + /* * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] * The code is modified from the implementation by Joan Duran see @@ -32,50 +499,25 @@ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] * * Output: - * 1. Filtered/regularized image + * 1. Filtered/regularized image (u) * * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. */ -float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) { - long k, p, q, r, DimTotal; - float taulambda; - float *u_upd, *gx, *gy, *gx_upd, *gy_upd, *qx, *qy, *qx_upd, *qy_upd, *v, *vx, *vy, *gradx, *grady, *gradx_upd, *grady_upd, *gradx_ubar, *grady_ubar, *div, *div_upd; - - p = 1l; - q = 1l; - r = 0l; - + int err; + int iter; + int i,j,k,l,m; + lambda = 1.0f/(2.0f*lambda); - DimTotal = (long)(dimX*dimY*dimZ); - /* PDHG algorithm parameters*/ + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters float tau = 0.5f; float sigma = 0.5f; float theta = 1.0f; - - // Auxiliar vectors - u_upd = calloc(DimTotal, sizeof(float)); - gx = calloc(DimTotal, sizeof(float)); - gy = calloc(DimTotal, sizeof(float)); - gx_upd = calloc(DimTotal, sizeof(float)); - gy_upd = calloc(DimTotal, sizeof(float)); - qx = calloc(DimTotal, sizeof(float)); - qy = calloc(DimTotal, sizeof(float)); - qx_upd = calloc(DimTotal, sizeof(float)); - qy_upd = calloc(DimTotal, sizeof(float)); - v = calloc(DimTotal, sizeof(float)); - vx = calloc(DimTotal, sizeof(float)); - vy = calloc(DimTotal, sizeof(float)); - gradx = calloc(DimTotal, sizeof(float)); - grady = calloc(DimTotal, sizeof(float)); - gradx_upd = calloc(DimTotal, sizeof(float)); - grady_upd = calloc(DimTotal, sizeof(float)); - gradx_ubar = calloc(DimTotal, sizeof(float)); - grady_ubar = calloc(DimTotal, sizeof(float)); - div = calloc(DimTotal, sizeof(float)); - div_upd = calloc(DimTotal, sizeof(float)); - + // Backtracking parameters float s = 1.0f; float gamma = 0.75f; @@ -84,369 +526,117 @@ float TNV_CPU_main(float *Input, float *u, float lambda, int maxIter, float tol, float alpha = alpha0; float delta = 1.5f; float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; - // PDHG algorithm parameters - taulambda = tau * lambda; - float divtau = 
1.0f / tau; - float divsigma = 1.0f / sigma; - float theta1 = 1.0f + theta; - - /*allocate memory for taulambda */ - //taulambda = (float*) calloc(dimZ, sizeof(float)); - //for(k=0; k < dimZ; k++) {taulambda[k] = tau*lambda[k];} - + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + // Apply Primal-Dual Hybrid Gradient scheme - int iter = 0; float residual = fLarge; - float ubarx, ubary; - + int started = 0; for(iter = 0; iter < maxIter; iter++) { - // Argument of proximal mapping of fidelity term -#pragma omp parallel for shared(v, u) private(k) - for(k=0; kstepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; -if(b > 1) -{ - // Decrease step-sizes to fit balancing principle - tau = (beta * tau) / b; - sigma = (beta * sigma) / b; - alpha = alpha0; - - copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(gx, gx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(gy, gy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ)); - -} else if(resprimal > dual_dot_delta) -{ - // Increase primal step-size and decrease dual step-size - tau = tau / (1.0f - alpha); - sigma = sigma * (1.0f - alpha); - alpha = alpha * eta; - -} else if(resprimal < dual_div_delta) -{ - // Decrease primal step-size and increase dual step-size - tau = tau * (1.0f - alpha); - sigma = sigma / (1.0f - alpha); - alpha = alpha * eta; -} + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } -// Update variables -taulambda = tau * lambda; -//for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k]; + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } -divsigma = 1.0f / sigma; -divtau = 1.0f / tau; + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); -copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gx_upd, gx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gy_upd, gy, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ)); -copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ)); -// Compute residual at current iteration -residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + if(b > 1) { + + 
// Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; -// printf("%f \n", residual); -if (residual < tol) { - printf("Iterations stopped at %i with the residual %f \n", iter, residual); - break; } + if (started) { + fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. The standard TNV recommended\n\n\n"); + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } + } else { + started = 1; + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - alpha); + alpha = alpha * eta; + } else if(resprimal < dual_div_delta) { + // Decrease primal step-size and increase dual step-size + tau = tau * (1.0f - alpha); + sigma = sigma / (1.0f - alpha); + alpha = alpha * eta; + } + } + if (residual < tol) break; } - printf("Iterations stopped at %i with the residual %f \n", iter, residual); - free (u_upd); free(gx); free(gy); free(gx_upd); free(gy_upd); - free(qx); free(qy); free(qx_upd); free(qy_upd); free(v); free(vx); free(vy); - free(gradx); free(grady); free(gradx_upd); free(grady_upd); free(gradx_ubar); - free(grady_ubar); free(div); free(div_upd); - return *u; -} -float proxG(float *u_upd, float *v, float *f, float taulambda, long dimX, long dimY, long dimZ) -{ - float constant; - long k; - constant = 1.0f + taulambda; -#pragma omp parallel for shared(v, f, u_upd) private(k) - for(k=0; k fTiny) - { - V1 = v1 / mu1; - V3 = v0 / mu1; - } - - if(mu2 > fTiny) - { - V2 = v2 / mu2; - V4 = v0 / mu2; - } - - } else - { - if(M1 > M3) - { - V1 = V4 = 1.0f; - V2 = V3 = 0.0f; - - } else - { - V1 = V4 = 0.0f; - V2 = V3 = 1.0f; - } - } - - // Compute prox_p of the diagonal entries - sig1_upd = sig2_upd = 0.0f; - - if(p == 1) - { - sig1_upd = MAX(sig1 - divsigma, 0.0f); - sig2_upd = MAX(sig2 - divsigma, 0.0f); - - } else if(p == INFNORM) - { - proj[0] = sigma * fabs(sig1); - proj[1] = sigma * fabs(sig2); - - /*l1 projection part */ - sum = fLarge; - num = 0l; - shrinkfactor = 0.0f; - while(sum > 1.0f) - { - sum = 0.0f; - num = 0; - - for(ii = 0; ii < 2; ii++) - { - proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); - - sum += fabs(proj[ii]); - if(proj[ii]!= 0.0f) - num++; - } - - if(num > 0) - shrinkfactor = (sum - 1.0f) / num; - else - break; - } - /*l1 proj ends*/ - - sig1_upd = sig1 - divsigma * proj[0]; - sig2_upd = sig2 - divsigma * proj[1]; - } - - // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ - if(sig1 > fTiny) - sig1_upd /= sig1; - - if(sig2 > fTiny) - sig2_upd /= sig2; - - // Compute solution - t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; - t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; - t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; - - for(k = 0; k < dimZ; k++) - { - gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2; - gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3; - } - - // Delete allocated memory - free(proj); - }} - - return 1; -} + printf("Iterations stopped at %i with the residual %f \n", iter, residual); + printf("Return: %f\n", *uT); -float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ) -{ - long i, j, k, l; 
-#pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l) - for(k = 0; k < dimZ; k++) { - for(j = 0; j < dimY; j++) { - l = j * dimX; - for(i = 0; i < dimX; i++) { - if(i != dimX-1) - { - // ux[k][i+l] = u[k][i+1+l] - u[k][i+l] - div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l]; - div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l]; - } - - if(j != dimY-1) - { - // uy[k][i+l] = u[k][i+width+l] - u[k][i+l] - //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l]; - div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l]; - div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l]; - } - } - } - } - return *div_upd; +// exit(-1); + return *uT; } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c new file mode 100755 index 00000000..d771db52 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -0,0 +1,695 @@ +/* + * This work is part of the Core Imaging Library developed by + * Visual Analytics and Imaging System Group of the Science Technology + * Facilities Council, STFC + * + * Copyright 2017 Daniil Kazantsev + * Copyright 2017 Srikanth Nagella, Edoardo Pasca + * + * Copyriht 2020 Suren A. Chlingaryan + * Optimized version with 1/2 of memory consumption and ~4x performance + * This version is algorithmicly comptable with the original, but slight change in results + * is expected due to different order of floating-point operations. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long 
Dim1Total = (long)(dimX*(stepY+1)*padZ); + + // Auxiliar vectors + ctx->Input = malloc(Dim1Total * sizeof(float)); + ctx->u = malloc(Dim1Total * sizeof(float)); + ctx->u_upd = malloc(Dim1Total * sizeof(float)); + ctx->qx = malloc(DimTotal * sizeof(float)); + ctx->qy = malloc(DimTotal * sizeof(float)); + ctx->qx_upd = malloc(DimTotal * sizeof(float)); + ctx->qy_upd = malloc(DimTotal * sizeof(float)); + ctx->gradx = malloc(DimTotal * sizeof(float)); + ctx->grady = malloc(DimTotal * sizeof(float)); + ctx->gradx_upd = malloc(DimTotal * sizeof(float)); + ctx->grady_upd = malloc(DimTotal * sizeof(float)); + ctx->div = malloc(Dim1Total * sizeof(float)); + ctx->div_upd = malloc(Dim1Total * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { + fprintf(stderr, "Error allocating memory\n"); + exit(-1); + } + + return 0; +} + +static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + +// printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + memset(ctx->u, 0, Dim1Total * sizeof(float)); + memset(ctx->qx, 0, DimTotal * sizeof(float)); + memset(ctx->qy, 0, DimTotal * sizeof(float)); + memset(ctx->gradx, 0, DimTotal * sizeof(float)); + memset(ctx->grady, 0, DimTotal * sizeof(float)); + memset(ctx->div, 0, Dim1Total * sizeof(float)); + + for(k=0; kInput[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i]; + ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i]; + } + } + } + + return 0; +} + +static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + for(k=0; kuT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k]; + } + } + } + + return 0; +} + + +static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float)); + memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); + memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); + memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); + memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); + memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + + return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int 
i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ]; + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + div_upd[l] = 0; + } + } + + for(j = 0; j < stepY; j++) { +/* m = j * dimX * dimZ + (dimX - 1) * dimZ; + for(k = 0; k < dimZ; k++) { + u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; + }*/ + + for(i = 0; i < dimX/* - 1*/; i++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + + gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter + + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); +// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + 
float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { + div_upd[l + k] += qx_upd[l + k]; + div_upd[l + k + padZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { + div_upd[l + k] += qy_upd[l + k]; + div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter +} + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? + resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + tnv_ctx.padZ = dimZ; + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. 
eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float div_upd_add = ctx0->div_upd[m + l]; + ctx->div_upd[l] += div_upd_add; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + //ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history? 
+ float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } + + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - alpha); + alpha = alpha * eta; + } else if(resprimal < dual_div_delta) { + // Decrease primal step-size and increase dual step-size + tau = tau * (1.0f - alpha); + sigma = sigma / (1.0f - alpha); + alpha = alpha * eta; + } + } + + if (residual < tol) break; + } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); } + + + printf("Iterations stopped at %i with the residual %f \n", iter, residual); + printf("Return: %f\n", *uT); + + return *uT; +} diff --git a/src/Core/regularisers_CPU/hw_sched.c b/src/Core/regularisers_CPU/hw_sched.c new file mode 100644 index 00000000..be895cc7 --- /dev/null +++ b/src/Core/regularisers_CPU/hw_sched.c @@ -0,0 +1,396 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * hst is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . 
+ */ + +#define _GNU_SOURCE +#include +#include +#include + +#ifdef HW_HAVE_SCHED_HEADERS +# include +# include +# include +#endif /* HW_HAVE_SCHED_HEADERS */ + +#include "hw_sched.h" + + +#ifdef HW_USE_THREADS +# define MUTEX_INIT(ctx, name) \ + if (!err) { \ + ctx->name##_mutex = g_mutex_new(); \ + if (!ctx->name##_mutex) err = 1; \ + } + +# define MUTEX_FREE(ctx, name) \ + if (ctx->name##_mutex) g_mutex_free(ctx->name##_mutex); + +# define COND_INIT(ctx, name) \ + MUTEX_INIT(ctx, name##_cond) \ + if (!err) { \ + ctx->name##_cond = g_cond_new(); \ + if (!ctx->name##_cond) { \ + err = 1; \ + MUTEX_FREE(ctx, name##_cond) \ + } \ + } + +# define COND_FREE(ctx, name) \ + if (ctx->name##_cond) g_cond_free(ctx->name##_cond); \ + MUTEX_FREE(ctx, name##_cond) +#else /* HW_USE_THREADS */ +# define MUTEX_INIT(ctx, name) +# define MUTEX_FREE(ctx, name) +# define COND_INIT(ctx, name) +# define COND_FREE(ctx, name) +#endif /* HW_USE_THREADS */ + + +HWRunFunction ppu_run[] = { + (HWRunFunction)NULL +}; + +static int hw_sched_initialized = 0; + +int hw_sched_init(void) { + if (!hw_sched_initialized) { +#ifdef HW_USE_THREADS + g_thread_init(NULL); +#endif /* HW_USE_THREADS */ + hw_sched_initialized = 1; + } + + return 0; +} + + +int hw_sched_get_cpu_count(void) { +#ifdef HW_HAVE_SCHED_HEADERS + int err; + + int cpu_count; + cpu_set_t mask; + + err = sched_getaffinity(getpid(), sizeof(mask), &mask); + if (err) return 1; + +# ifdef CPU_COUNT + cpu_count = CPU_COUNT(&mask); +# else + for (cpu_count = 0; cpu_count < CPU_SETSIZE; cpu_count++) { + if (!CPU_ISSET(cpu_count, &mask)) break; + } +# endif + + if (!cpu_count) cpu_count = 1; + return cpu_count; +#else /* HW_HAVE_SCHED_HEADERS */ + return 1; +#endif /* HW_HAVE_SCHED_HEADERS */ +} + + +HWSched hw_sched_create(int cpu_count) { + int i; + int err = 0; + + HWSched ctx; + + //hw_sched_init(); + + ctx = (HWSched)malloc(sizeof(HWSchedS)); + if (!ctx) return NULL; + + memset(ctx, 0, sizeof(HWSchedS)); + + ctx->status = 1; + + MUTEX_INIT(ctx, sync); + MUTEX_INIT(ctx, data); + COND_INIT(ctx, compl); + COND_INIT(ctx, job); + + if (err) { + hw_sched_destroy(ctx); + return NULL; + } + + if (!cpu_count) cpu_count = hw_sched_get_cpu_count(); + if (cpu_count > HW_MAX_THREADS) cpu_count = HW_MAX_THREADS; + + ctx->n_threads = 0; + for (i = 0; i < cpu_count; i++) { + ctx->thread[ctx->n_threads] = hw_thread_create(ctx, ctx->n_threads, NULL, ppu_run, NULL); + if (ctx->thread[ctx->n_threads]) { +#ifndef HW_USE_THREADS + ctx->thread[ctx->n_threads]->status = HW_THREAD_STATUS_STARTING; +#endif /* HW_USE_THREADS */ + ++ctx->n_threads; + } + } + + if (!ctx->n_threads) { + hw_sched_destroy(ctx); + return NULL; + } + + return ctx; +} + +static int hw_sched_wait_threads(HWSched ctx) { +#ifdef HW_USE_THREADS + int i = 0; + + hw_sched_lock(ctx, compl_cond); + while (i < ctx->n_threads) { + for (; i < ctx->n_threads; i++) { + if (ctx->thread[i]->status == HW_THREAD_STATUS_INIT) { + hw_sched_wait(ctx, compl); + break; + } + } + + } + hw_sched_unlock(ctx, compl_cond); +#endif /* HW_USE_THREADS */ + + ctx->started = 1; + + return 0; +} + +void hw_sched_destroy(HWSched ctx) { + int i; + + if (ctx->n_threads > 0) { + if (!ctx->started) { + hw_sched_wait_threads(ctx); + } + + ctx->status = 0; + hw_sched_lock(ctx, job_cond); + hw_sched_broadcast(ctx, job); + hw_sched_unlock(ctx, job_cond); + + for (i = 0; i < ctx->n_threads; i++) { + hw_thread_destroy(ctx->thread[i]); + } + } + + COND_FREE(ctx, job); + COND_FREE(ctx, compl); + MUTEX_FREE(ctx, data); + MUTEX_FREE(ctx, sync); + + free(ctx); 
+} + +int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags) { + ctx->mode = HW_SCHED_MODE_SEQUENTIAL; + ctx->n_blocks = n_blocks; + ctx->cur_block = cur_block; + ctx->flags = flags; + + return 0; +} + +int hw_sched_get_chunk(HWSched ctx, int thread_id) { + int block; + + switch (ctx->mode) { + case HW_SCHED_MODE_PREALLOCATED: + if (ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING) { +#ifndef HW_USE_THREADS + ctx->thread[thread_id]->status = HW_THREAD_STATUS_DONE; +#endif /* HW_USE_THREADS */ + return thread_id; + } else { + return HW_SCHED_CHUNK_INVALID; + } + case HW_SCHED_MODE_SEQUENTIAL: + if ((ctx->flags&HW_SCHED_FLAG_INIT_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_STARTING)) { + return HW_SCHED_CHUNK_INIT; + } + hw_sched_lock(ctx, data); + block = *ctx->cur_block; + if (block < *ctx->n_blocks) { + *ctx->cur_block = *ctx->cur_block + 1; + } else { + block = HW_SCHED_CHUNK_INVALID; + } + hw_sched_unlock(ctx, data); + if (block == HW_SCHED_CHUNK_INVALID) { + if (((ctx->flags&HW_SCHED_FLAG_FREE_CALL)&&(ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING))) { + ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING; + return HW_SCHED_CHUNK_FREE; + } + if ((ctx->flags&HW_SCHED_FLAG_TERMINATOR_CALL)&&((ctx->thread[thread_id]->status == HW_THREAD_STATUS_RUNNING)||(ctx->thread[thread_id]->status == HW_THREAD_STATUS_FINISHING))) { + int i; + hw_sched_lock(ctx, data); + for (i = 0; i < ctx->n_threads; i++) { + if (thread_id == i) continue; + if ((ctx->thread[i]->status != HW_THREAD_STATUS_DONE)&&(ctx->thread[i]->status != HW_THREAD_STATUS_FINISHING2)&&(ctx->thread[i]->status != HW_THREAD_STATUS_IDLE)) { + break; + } + } + ctx->thread[thread_id]->status = HW_THREAD_STATUS_FINISHING2; + hw_sched_unlock(ctx, data); + if (i == ctx->n_threads) { + return HW_SCHED_CHUNK_TERMINATOR; + } + } + } + return block; + default: + return HW_SCHED_CHUNK_INVALID; + } + + return -1; +} + + +int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry) { +#ifdef HW_USE_THREADS + if (!ctx->started) { + hw_sched_wait_threads(ctx); + } +#else /* HW_USE_THREADS */ + int err; + int i, chunk_id, n_threads; + HWRunFunction run; + HWThread thrctx; +#endif /* HW_USE_THREADS */ + + ctx->ctx = appctx; + ctx->entry = entry; + + switch (ctx->mode) { + case HW_SCHED_MODE_SEQUENTIAL: + *ctx->cur_block = 0; + break; + default: + ; + } + +#ifdef HW_USE_THREADS + hw_sched_lock(ctx, compl_cond); + + hw_sched_lock(ctx, job_cond); + hw_sched_broadcast(ctx, job); + hw_sched_unlock(ctx, job_cond); +#else /* HW_USE_THREADS */ + n_threads = ctx->n_threads; + + for (i = 0; i < n_threads; i++) { + thrctx = ctx->thread[i]; + thrctx->err = 0; + } + + i = 0; + thrctx = ctx->thread[i]; + chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id); + + while (chunk_id >= 0) { + run = hw_run_entry(thrctx->runs, entry); + err = run(thrctx, thrctx->hwctx, chunk_id, appctx); + if (err) { + thrctx->err = err; + break; + } + + if ((++i) == n_threads) i = 0; + thrctx = ctx->thread[i]; + chunk_id = hw_sched_get_chunk(ctx, thrctx->thread_id); + } +#endif /* HW_USE_THREADS */ + + return 0; +} + +int hw_sched_wait_task(HWSched ctx) { + int err = 0; + int i = 0, n_threads = ctx->n_threads; + +#ifdef HW_USE_THREADS + while (i < ctx->n_threads) { + for (; i < ctx->n_threads; i++) { + if (ctx->thread[i]->status == HW_THREAD_STATUS_DONE) { + ctx->thread[i]->status = HW_THREAD_STATUS_IDLE; + } else { + hw_sched_wait(ctx, compl); + break; + } + } + + } + + hw_sched_unlock(ctx, 
compl_cond);
+#endif /* HW_USE_THREADS */
+
+    for (i = 0; i < n_threads; i++) {
+        HWThread thrctx = ctx->thread[i];
+        if (thrctx->err) return err = thrctx->err;
+
+#ifndef HW_USE_THREADS
+        thrctx->status = HW_THREAD_STATUS_IDLE;
+#endif /* HW_USE_THREADS */
+    }
+
+    return err;
+}
+
+int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry) {
+    int err;
+
+    err = hw_sched_schedule_task(ctx, appctx, entry);
+    if (err) return err;
+
+    return hw_sched_wait_task(ctx);
+}
+
+int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry) {
+    int err;
+
+    ctx->saved_mode = ctx->mode;
+    ctx->mode = HW_SCHED_MODE_PREALLOCATED;
+    err = hw_sched_schedule_task(ctx, appctx, entry);
+
+    return err;
+}
+
+
+int hw_sched_wait_thread_task(HWSched ctx) {
+    int err;
+
+    err = hw_sched_wait_task(ctx);
+    ctx->mode = ctx->saved_mode;
+
+    return err;
+}
+
+int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry) {
+    int err;
+    int saved_mode = ctx->mode;
+
+    ctx->mode = HW_SCHED_MODE_PREALLOCATED;
+    err = hw_sched_execute_task(ctx, appctx, entry);
+    ctx->mode = saved_mode;
+
+    return err;
+}
diff --git a/src/Core/regularisers_CPU/hw_sched.h b/src/Core/regularisers_CPU/hw_sched.h
new file mode 100644
index 00000000..c5d1285f
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_sched.h
@@ -0,0 +1,151 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HW_SCHED_H
+#define _HW_SCHED_H
+
+#include <glib.h>
+
+    // enable threading
+#define HW_HAVE_SCHED_HEADERS
+#define HW_USE_THREADS
+
+
+typedef struct HWSchedT *HWSched;
+#ifdef HW_USE_THREADS
+typedef GMutex *HWMutex;
+#else /* HW_USE_THREADS */
+typedef void *HWMutex;
+#endif /* HW_USE_THREADS */
+
+
+#include "hw_thread.h"
+
+enum HWSchedModeT {
+    HW_SCHED_MODE_PREALLOCATED = 0,
+    HW_SCHED_MODE_SEQUENTIAL
+};
+typedef enum HWSchedModeT HWSchedMode;
+
+enum HWSchedChunkT {
+    HW_SCHED_CHUNK_INVALID = -1,
+    HW_SCHED_CHUNK_INIT = -2,
+    HW_SCHED_CHUNK_FREE = -3,
+    HW_SCHED_CHUNK_TERMINATOR = -4
+};
+typedef enum HWSchedChunkT HWSchedChunk;
+
+enum HWSchedFlagsT {
+    HW_SCHED_FLAG_INIT_CALL = 1,        //! Executed in each thread before real chunks
+    HW_SCHED_FLAG_FREE_CALL = 2,        //! Executed in each thread after real chunks
+    HW_SCHED_FLAG_TERMINATOR_CALL = 4   //!
Executed in one of the threads after all threads are done +}; +typedef enum HWSchedFlagsT HWSchedFlags; + + +#define HW_SINGLE_MODE +//#define HW_DETECT_CPU_CORES +#define HW_MAX_THREADS 128 + +#ifdef HW_SINGLE_MODE + typedef HWRunFunction HWEntry; +# define hw_run_entry(runs, entry) entry +#else /* HW_SINGLE_MODE */ + typedef int HWEntry; +# define hw_run_entry(runs, entry) runs[entry] +#endif /* HW_SINGLE_MODE */ + +#ifndef HW_HIDE_DETAILS +struct HWSchedT { + int status; + int started; + + int n_threads; + HWThread thread[HW_MAX_THREADS]; + +#ifdef HW_USE_THREADS + GCond *job_cond, *compl_cond; + GMutex *job_cond_mutex, *compl_cond_mutex, *data_mutex; + GMutex *sync_mutex; +#endif /* HW_USE_THREADS */ + + HWSchedMode mode; + HWSchedMode saved_mode; + HWSchedFlags flags; + int *n_blocks; + int *cur_block; + + HWEntry entry; + void *ctx; +}; +typedef struct HWSchedT HWSchedS; +#endif /* HW_HIDE_DETAILS */ + +# ifdef __cplusplus +extern "C" { +# endif + +HWSched hw_sched_create(int ppu_count); +int hw_sched_init(void); +void hw_sched_destroy(HWSched ctx); +int hw_sched_get_cpu_count(void); + +int hw_sched_set_sequential_mode(HWSched ctx, int *n_blocks, int *cur_block, HWSchedFlags flags); +int hw_sched_get_chunk(HWSched ctx, int thread_id); +int hw_sched_schedule_task(HWSched ctx, void *appctx, HWEntry entry); +int hw_sched_wait_task(HWSched ctx); +int hw_sched_execute_task(HWSched ctx, void *appctx, HWEntry entry); + +int hw_sched_schedule_thread_task(HWSched ctx, void *appctx, HWEntry entry); +int hw_sched_wait_thread_task(HWSched ctx); +int hw_sched_execute_thread_task(HWSched ctx, void *appctx, HWEntry entry); + +HWMutex hw_sched_create_mutex(void); +void hw_sched_destroy_mutex(HWMutex ctx); + +#ifdef HW_USE_THREADS +# define hw_sched_lock(ctx, type) g_mutex_lock(ctx->type##_mutex) +# define hw_sched_unlock(ctx, type) g_mutex_unlock(ctx->type##_mutex) +# define hw_sched_broadcast(ctx, type) g_cond_broadcast(ctx->type##_cond) +# define hw_sched_signal(ctx, type) g_cond_signal(ctx->type##_cond) +# define hw_sched_wait(ctx, type) g_cond_wait(ctx->type##_cond, ctx->type##_cond_mutex) + +#define hw_sched_create_mutex(void) g_mutex_new() +#define hw_sched_destroy_mutex(ctx) g_mutex_free(ctx) +#define hw_sched_lock_mutex(ctx) g_mutex_lock(ctx) +#define hw_sched_unlock_mutex(ctx) g_mutex_unlock(ctx) +#else /* HW_USE_THREADS */ +# define hw_sched_lock(ctx, type) +# define hw_sched_unlock(ctx, type) +# define hw_sched_broadcast(ctx, type) +# define hw_sched_signal(ctx, type) +# define hw_sched_wait(ctx, type) + +#define hw_sched_create_mutex(void) NULL +#define hw_sched_destroy_mutex(ctx) +#define hw_sched_lock_mutex(ctx) +#define hw_sched_unlock_mutex(ctx) +#endif /* HW_USE_THREADS */ + +# ifdef __cplusplus +} +# endif + +#endif /* _HW_SCHED_H */ + diff --git a/src/Core/regularisers_CPU/hw_thread.c b/src/Core/regularisers_CPU/hw_thread.c new file mode 100644 index 00000000..41b18006 --- /dev/null +++ b/src/Core/regularisers_CPU/hw_thread.c @@ -0,0 +1,141 @@ +/* + * The PyHST program is Copyright (C) 2002-2011 of the + * European Synchrotron Radiation Facility (ESRF) and + * Karlsruhe Institute of Technology (KIT). + * + * PyHST is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "hw_sched.h"
+#include "hw_thread.h"
+
+#ifdef HW_USE_THREADS
+static void *hw_thread_function(HWThread ctx) {
+    int err;
+    int chunk_id;
+
+    HWRunFunction *runs;
+    HWRunFunction run;
+    HWSched sched;
+    void *hwctx;
+
+    sched = ctx->sched;
+    runs = ctx->run;
+    hwctx = ctx->hwctx;
+
+    hw_sched_lock(sched, job_cond);
+
+    hw_sched_lock(sched, compl_cond);
+    ctx->status = HW_THREAD_STATUS_IDLE;
+    hw_sched_broadcast(sched, compl);
+    hw_sched_unlock(sched, compl_cond);
+
+    while (sched->status) {
+        hw_sched_wait(sched, job);
+        if (!sched->status) break;
+
+        ctx->err = 0;
+        ctx->status = HW_THREAD_STATUS_STARTING;
+        hw_sched_unlock(sched, job_cond);
+
+        run = hw_run_entry(runs, sched->entry);
+#if 0
+        // Offset to interleave transfers if the GPUBox is used
+        // Just check the to-GPU time with CUDA_LAUNCH_BLOCKING and put it here
+        // It should still be significantly less than the BP time
+        // We could do calibration during initialization in the future
+
+        usleep(12000 * ctx->thread_id);
+#endif
+        chunk_id = hw_sched_get_chunk(sched, ctx->thread_id);
+
+        /* Should be after get_chunk, since we can check if it's the first time */
+        ctx->status = HW_THREAD_STATUS_RUNNING;
+        while (chunk_id != HW_SCHED_CHUNK_INVALID) {
+            //printf("Thread %i processing slice %i\n", ctx->thread_id, chunk_id);
+            err = run(ctx, hwctx, chunk_id, sched->ctx);
+            if (err) {
+                ctx->err = err;
+                break;
+            }
+            chunk_id = hw_sched_get_chunk(sched, ctx->thread_id);
+        }
+
+        hw_sched_lock(sched, job_cond);
+
+        hw_sched_lock(sched, compl_cond);
+        ctx->status = HW_THREAD_STATUS_DONE;
+        hw_sched_broadcast(sched, compl);
+        hw_sched_unlock(sched, compl_cond);
+    }
+
+    hw_sched_unlock(sched, job_cond);
+
+    g_thread_exit(NULL);
+    return NULL; /* TODO: check this */
+}
+#endif /* HW_USE_THREADS */
+
+
+HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func) {
+    GError *err = NULL;
+
+    HWThread ctx;
+
+    ctx = (HWThread)malloc(sizeof(HWThreadS));
+    if (!ctx) return ctx;
+
+    memset(ctx, 0, sizeof(HWThreadS));
+
+    ctx->sched = sched;
+    ctx->hwctx = hwctx;
+    ctx->run = run_func;
+    ctx->free = free_func;
+    ctx->thread_id = thread_id;
+    ctx->status = HW_THREAD_STATUS_INIT;
+
+#ifdef HW_USE_THREADS
+    ctx->thread = g_thread_create((GThreadFunc)hw_thread_function, ctx, 1, &err);
+    if (!ctx->thread) {
+        g_error_free(err);
+
+        hw_thread_destroy(ctx);
+        return NULL;
+    }
+#endif /* HW_USE_THREADS */
+
+    return ctx;
+}
+
+void hw_thread_destroy(HWThread ctx) {
+#ifdef HW_USE_THREADS
+    if (ctx->thread) {
+        g_thread_join(ctx->thread);
+    }
+#endif /* HW_USE_THREADS */
+
+    if (ctx->data) {
+        free(ctx->data);
+    }
+
+    if (ctx->free) {
+        ctx->free(ctx->hwctx);
+    }
+
+    free(ctx);
+}
diff --git a/src/Core/regularisers_CPU/hw_thread.h b/src/Core/regularisers_CPU/hw_thread.h
new file mode 100644
index 00000000..de7f60f3
--- /dev/null
+++ b/src/Core/regularisers_CPU/hw_thread.h
@@ -0,0 +1,76 @@
+/*
+ * The PyHST program is Copyright (C) 2002-2011 of the
+ * European Synchrotron Radiation Facility (ESRF) and
+ * Karlsruhe Institute of Technology (KIT).
+ *
+ * PyHST is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * hst is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _HW_THREAD_H
+#define _HW_THREAD_H
+
+#include <glib.h>
+
+typedef struct HWThreadT *HWThread;
+typedef int (*HWRunFunction)(HWThread thread, void *ctx, int block, void *attr);
+typedef int (*HWFreeFunction)(void *ctx);
+
+#include "hw_sched.h"
+
+enum HWThreadStatusT {
+    HW_THREAD_STATUS_IDLE = 0,
+    HW_THREAD_STATUS_STARTING = 1,
+    HW_THREAD_STATUS_RUNNING = 2,
+    HW_THREAD_STATUS_FINISHING = 3,
+    HW_THREAD_STATUS_FINISHING2 = 4,
+    HW_THREAD_STATUS_DONE = 5,
+    HW_THREAD_STATUS_INIT = 6
+};
+typedef enum HWThreadStatusT HWThreadStatus;
+
+
+#ifndef HW_HIDE_DETAILS
+struct HWThreadT {
+    int thread_id;
+    HWSched sched;
+
+#ifdef HW_USE_THREADS
+    GThread *thread;
+#endif /* HW_USE_THREADS */
+
+    void *hwctx;
+    HWRunFunction *run;
+    HWFreeFunction free;
+
+    int err;
+    HWThreadStatus status;
+
+    void *data;    /**< Per-thread data storage, will be free'd if set */
+};
+typedef struct HWThreadT HWThreadS;
+#endif /* HW_HIDE_DETAILS */
+
+# ifdef __cplusplus
extern "C" {
+# endif
+
+HWThread hw_thread_create(HWSched sched, int thread_id, void *hwctx, HWRunFunction *run_func, HWFreeFunction free_func);
+void hw_thread_destroy(HWThread ctx);
+
+# ifdef __cplusplus
+}
+# endif
+
+
+#endif /* _HW_THREAD_H */
From ec31378b47fdf0eae06732e4e91c3b1bb02c454a Mon Sep 17 00:00:00 2001
From: "Suren A.
Chilingaryan" Date: Sun, 29 Mar 2020 16:04:44 +0200 Subject: [PATCH 02/10] Optimize cache usage with blocking (icc is faster and gcc is slightly slower than previous version) --- src/Core/regularisers_CPU/TNV_core.c | 94 +++++++++++++++------------- 1 file changed, 51 insertions(+), 43 deletions(-) diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index be7fdef7..dce414ac 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -25,6 +25,7 @@ #include "TNV_core.h" +#define BLOCK 32 #define min(a,b) (((a)<(b))?(a):(b)) inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { @@ -141,7 +142,7 @@ inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, in typedef struct { int offY, stepY, copY; float *Input, *u, *qx, *qy, *gradx, *grady, *div; - float *div0, *udiff0, *udiff; + float *div0, *udiff0; float resprimal, resdual; float unorm, qnorm, product; } tnv_thread_t; @@ -173,7 +174,6 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { free(ctx->div0); free(ctx->udiff0); - free(ctx->udiff); return 0; } @@ -196,19 +196,18 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long DimRow = (long)(dimX * padZ); // Auxiliar vectors - ctx->Input = malloc(Dim1Total * sizeof(float)); - ctx->u = malloc(Dim1Total * sizeof(float)); - ctx->qx = malloc(DimTotal * sizeof(float)); - ctx->qy = malloc(DimTotal * sizeof(float)); - ctx->gradx = malloc(DimTotal * sizeof(float)); - ctx->grady = malloc(DimTotal * sizeof(float)); - ctx->div = malloc(Dim1Total * sizeof(float)); - - ctx->div0 = malloc(DimRow * sizeof(float)); - ctx->udiff0 = malloc(DimRow * sizeof(float)); - ctx->udiff = malloc(DimRow * sizeof(float)); - - if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { + ctx->Input = memalign(64, Dim1Total * sizeof(float)); + ctx->u = memalign(64, Dim1Total * sizeof(float)); + ctx->qx = memalign(64, DimTotal * sizeof(float)); + ctx->qy = memalign(64, DimTotal * sizeof(float)); + ctx->gradx = memalign(64, DimTotal * sizeof(float)); + ctx->grady = memalign(64, DimTotal * sizeof(float)); + ctx->div = memalign(64, Dim1Total * sizeof(float)); + + ctx->div0 = memalign(64, DimRow * sizeof(float)); + ctx->udiff0 = memalign(64, DimRow * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { fprintf(stderr, "Error allocating memory\n"); exit(-1); } @@ -304,7 +303,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l, m; + long i, j, k, l; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -349,39 +348,45 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float qxdiff; float qydiff; float divdiff; - float gradxdiff[dimZ]; - float gradydiff[dimZ]; - float ubarx[dimZ]; - float ubary[dimZ]; - float udiff_next[dimZ]; + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + float ubarx[dimZ] __attribute__((aligned(64))); + float ubary[dimZ] __attribute__((aligned(64))); + float udiff[dimZ] __attribute__((aligned(64))); + for(i=0; i < dimX; i++) { for(k = 0; k < dimZ; k++) { int l = i * padZ + k; float u_upd = 
(u[l] + tau * div[l] + taulambda * Input[l])/constant; float udiff = u[l] - u_upd; - ctx->udiff[l] = udiff; ctx->udiff0[l] = udiff; ctx->div0[l] = div[l]; - u[l] = u_upd; } } - for(j = 0; j < stepY; j++) { - for(i = 0; i < dimX; i++) { + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { float t[3]; float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } l = (j * dimX + i) * padZ; - m = dimX * padZ; - -//#pragma unroll 64 + +#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { - float u_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; - udiff_next[k] = u[l + k + m] - u_upd; - u[l + k + m] = u_upd; + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd; + u[l + k] = u_upd; - float gradx_upd = (i == (dimX - 1))?0:(u[l + k + padZ] - u[l + k]); - float grady_upd = (j == (copY - 1))?0:(u[l + k + m] - u[l + k]); + float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); + float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); gradxdiff[k] = gradx[l + k] - gradx_upd; gradydiff[k] = grady[l + k] - grady_upd; gradx[l + k] = gradx_upd; @@ -398,7 +403,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma unroll 64 +#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { float vx = ubarx[k] + divsigma * qx[l + k]; float vy = ubary[k] + divsigma * qy[l + k]; @@ -411,26 +417,26 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { qx[l + k] += qxdiff; qy[l + k] += qydiff; - float udiff = ctx->udiff[i * padZ + k]; - ctx->udiff[i * padZ + k] = udiff_next[k]; - unorm += (udiff * udiff); + unorm += (udiff[k] * udiff[k]); qnorm += (qxdiff * qxdiff + qydiff * qydiff); float div_upd = 0; div_upd -= (i > 0)?qx[l + k - padZ]:0; - div_upd -= (j > 0)?qy[l + k - m]:0; + div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; div_upd += (i < (dimX-1))?qx[l + k]:0; div_upd += (j < (copY-1))?qy[l + k]:0; divdiff = div[l + k] - div_upd; div[l + k] = div_upd; - resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff + divdiff):0; + resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; resdual += fabs(divsigma * qxdiff + gradxdiff[k]); resdual += fabs(divsigma * qydiff + gradydiff[k]); product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); } } // i + } // j + } // i } // j @@ -452,8 +458,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); - tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -573,8 +580,9 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to float udiff = ctx->udiff0[l]; ctx->div[l] -= ctx0->qy[l + m]; - ctx0->div[m + l + dimX*padZ] = ctx->div[l]; - + ctx0->div[m + l + dimX * padZ] = ctx->div[l]; + ctx0->u[m + l + dimX * padZ] = ctx->u[l]; + divdiff += ctx0->qy[l + m]; resprimal += 
fabs(divtau * udiff + divdiff); } From 9b2ffcb1eb7e87b2b08670bbbd2fa8db6e5b2e1e Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 20:22:23 +0200 Subject: [PATCH 03/10] Optimize cache usage also for the version with full back-tracking --- .../regularisers_CPU/TNV_core_backtrack.c | 123 +++++++++--------- 1 file changed, 58 insertions(+), 65 deletions(-) diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index d771db52..71924751 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -22,8 +22,10 @@ * limitations under the License. */ +#include #include "TNV_core.h" +#define BLOCK 32 #define min(a,b) (((a)<(b))?(a):(b)) inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { @@ -196,18 +198,18 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long Dim1Total = (long)(dimX*(stepY+1)*padZ); // Auxiliar vectors - ctx->Input = malloc(Dim1Total * sizeof(float)); - ctx->u = malloc(Dim1Total * sizeof(float)); - ctx->u_upd = malloc(Dim1Total * sizeof(float)); - ctx->qx = malloc(DimTotal * sizeof(float)); - ctx->qy = malloc(DimTotal * sizeof(float)); - ctx->qx_upd = malloc(DimTotal * sizeof(float)); - ctx->qy_upd = malloc(DimTotal * sizeof(float)); - ctx->gradx = malloc(DimTotal * sizeof(float)); - ctx->grady = malloc(DimTotal * sizeof(float)); - ctx->gradx_upd = malloc(DimTotal * sizeof(float)); - ctx->grady_upd = malloc(DimTotal * sizeof(float)); - ctx->div = malloc(Dim1Total * sizeof(float)); + ctx->Input = memalign(64, Dim1Total * sizeof(float)); + ctx->u = memalign(64, Dim1Total * sizeof(float)); + ctx->u_upd = memalign(64, Dim1Total * sizeof(float)); + ctx->qx = memalign(64, DimTotal * sizeof(float)); + ctx->qy = memalign(64, DimTotal * sizeof(float)); + ctx->qx_upd = memalign(64, DimTotal * sizeof(float)); + ctx->qy_upd = memalign(64, DimTotal * sizeof(float)); + ctx->gradx = memalign(64, DimTotal * sizeof(float)); + ctx->grady = memalign(64, DimTotal * sizeof(float)); + ctx->gradx_upd = memalign(64, DimTotal * sizeof(float)); + ctx->grady_upd = memalign(64, DimTotal * sizeof(float)); + ctx->div = memalign(64, Dim1Total * sizeof(float)); ctx->div_upd = malloc(Dim1Total * sizeof(float)); if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) { @@ -332,7 +334,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l, m; + long i, j, k, l; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -374,50 +376,41 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float theta1 = 1.0f + theta; float constant = 1.0f + taulambda; - float resprimal = 0.0f; - float resdual = 0.0f; - float product = 0.0f; - float unorm = 0.0f; - float qnorm = 0.0f; - - float udiff[dimZ]; - float qxdiff; - float qydiff; - float divdiff; - float gradxdiff[dimZ]; - float gradydiff[dimZ]; - - for(i=0; i < dimX; i++) { - for(k = 0; k < dimZ; k++) { - int l = i * padZ + k; - u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; - div_upd[l] = 0; - } - } - - for(j = 0; j < stepY; j++) { -/* m = j * dimX * dimZ + (dimX - 1) * dimZ; - for(k = 0; k < dimZ; k++) { - u_upd[k + m] = 
(u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; - }*/ - - for(i = 0; i < dimX/* - 1*/; i++) { + float resprimal = 0.0; + float resdual = 0.0; + float product = 0.0; + float unorm = 0.0; + float qnorm = 0.0; + + float udiff[dimZ] __attribute__((aligned(64))); + float qxdiff __attribute__((aligned(64))); + float qydiff __attribute__((aligned(64))); + float divdiff __attribute__((aligned(64))); + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { float t[3]; float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } l = (j * dimX + i) * padZ; - m = dimX * padZ; -//#pragma unroll 64 +//#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { - u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; - - gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); - grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter - + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; udiff[k] = u[l + k] - u_upd[l + k]; unorm += (udiff[k] * udiff[k]); -// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; gradydiff[k] = grady[l + k] - grady_upd[l + k]; @@ -440,7 +433,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma unroll 64 +//#pragma vector aligned +#pragma GCC ivdep for(k = 0; k < dimZ; k++) { #ifdef TNV_NEW_STYLE float vx = divsigma * qx_upd[l + k]; @@ -464,14 +458,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); #endif -if(i != (dimX-1)) { - div_upd[l + k] += qx_upd[l + k]; - div_upd[l + k + padZ] -= qx_upd[l + k]; -} -if(j != (copY-1)) { - div_upd[l + k] += qy_upd[l + k]; - div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter -} + float div_upd_val = 0; + div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; + div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; + div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; + div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; + div_upd[l + k] = div_upd_val; qxdiff = qx[l + k] - qx_upd[l + k]; qydiff = qy[l + k] - qy_upd[l + k]; @@ -488,7 +480,9 @@ if(j != (copY-1)) { } } // i - } + } // j + } // i + } // j ctx->resprimal = resprimal; @@ -509,8 +503,8 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -620,17 +614,16 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to tnv_thread_t *ctx0 = 
tnv_ctx.thr_ctx + (j - 1); tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; - m = ctx0->stepY * dimX * padZ; + m = ctx0->stepY * dimX * padZ; for(i = 0; i < dimX; i++) { for(k = 0; k < dimZ; k++) { int l = i * padZ + k; - float div_upd_add = ctx0->div_upd[m + l]; - ctx->div_upd[l] += div_upd_add; + ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; ctx0->div_upd[m + l] = ctx->div_upd[l]; - //ctx0->u_upd[m + l] = ctx->u_upd[l]; + ctx0->u_upd[m + l] = ctx->u_upd[l]; - float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history? + float divdiff = ctx->div[l] - ctx->div_upd[l]; float udiff = ctx->u[l] - ctx->u_upd[l]; resprimal += fabs(divtau * udiff + divdiff); } From da30ce619bce168fc24ed0c17f04e411626ba18b Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 20:34:12 +0200 Subject: [PATCH 04/10] Eliminate conditionals in the innermost loop to help gcc auto-vectorizer --- src/Core/CMakeLists.txt | 13 +- src/Core/regularisers_CPU/TNV_core.c | 216 ++++++++++-------- .../regularisers_CPU/TNV_core_backtrack.c | 189 ++++++++------- .../TNV_core_backtrack_loop.h | 100 ++++++++ src/Core/regularisers_CPU/TNV_core_loop.h | 107 +++++++++ 5 files changed, 423 insertions(+), 202 deletions(-) create mode 100644 src/Core/regularisers_CPU/TNV_core_backtrack_loop.h create mode 100644 src/Core/regularisers_CPU/TNV_core_loop.h diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 10b16f30..76b0f3e5 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -60,9 +60,16 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message("Adding regularisers as a shared library") #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=2 -qopt-report-phase=vec -qopenmp") -set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") -#set(CMAKE_C_FLAGS "-march=native -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -mavx -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") + +#set(CMAKE_C_COMPILER clang) +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") + +#set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") +set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") #set(CMAKE_C_FLAGS "-acc -Minfo -ta=tesla:cc20 -openmp") diff --git a/src/Core/regularisers_CPU/TNV_core.c 
b/src/Core/regularisers_CPU/TNV_core.c index dce414ac..415c6448 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -5,12 +5,6 @@ * * Copyright 2017 Daniil Kazantsev * Copyright 2017 Srikanth Nagella, Edoardo Pasca - * - * Copyriht 2020 Suren A. Chlingaryan - * Optimized version with 1/3 of memory consumption and ~10x performance - * This version is not able to perform back-track except during first iterations - * But warning would be printed if backtracking is required and slower version (TNV_core_backtrack.c) - * could be executed instead. It still better than original with 1/2 of memory consumption and 4x performance gain * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +17,7 @@ * limitations under the License. */ +#include #include "TNV_core.h" #define BLOCK 32 @@ -143,6 +138,7 @@ typedef struct { int offY, stepY, copY; float *Input, *u, *qx, *qy, *gradx, *grady, *div; float *div0, *udiff0; + float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; float resprimal, resdual; float unorm, qnorm, product; } tnv_thread_t; @@ -174,7 +170,13 @@ static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { free(ctx->div0); free(ctx->udiff0); - + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + return 0; } @@ -194,6 +196,7 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { long DimTotal = (long)(dimX*stepY*padZ); long Dim1Total = (long)(dimX*(stepY+1)*padZ); long DimRow = (long)(dimX * padZ); + long DimCell = (long)(padZ); // Auxiliar vectors ctx->Input = memalign(64, Dim1Total * sizeof(float)); @@ -207,7 +210,13 @@ static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { ctx->div0 = memalign(64, DimRow * sizeof(float)); ctx->udiff0 = memalign(64, DimRow * sizeof(float)); - if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) { + ctx->gradxdiff = memalign(64, DimCell * sizeof(float)); + ctx->gradydiff = memalign(64, DimCell * sizeof(float)); + ctx->ubarx = memalign(64, DimCell * sizeof(float)); + ctx->ubary = memalign(64, DimCell * sizeof(float)); + ctx->udiff = memalign(64, DimCell * sizeof(float)); + + if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff)||(!ctx->udiff0)) { fprintf(stderr, "Error allocating memory\n"); exit(-1); } @@ -303,7 +312,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { - long i, j, k, l; + long i, j, k, l, m; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -340,7 +349,8 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float constant = 1.0f + taulambda; float resprimal = 0.0f; - float resdual = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; float product = 0.0f; float unorm = 0.0f; float qnorm = 0.0f; @@ -348,100 +358,92 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float qxdiff; float qydiff; float divdiff; - float gradxdiff[dimZ] __attribute__((aligned(64))); - float gradydiff[dimZ] __attribute__((aligned(64))); - float ubarx[dimZ] __attribute__((aligned(64))); - float ubary[dimZ] __attribute__((aligned(64))); - float udiff[dimZ] 
__attribute__((aligned(64))); - - - for(i=0; i < dimX; i++) { - for(k = 0; k < dimZ; k++) { - int l = i * padZ + k; - float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; - float udiff = u[l] - u_upd; - ctx->udiff0[l] = udiff; - ctx->div0[l] = div[l]; + float *gradxdiff = ctx->gradxdiff; + float *gradydiff = ctx->gradydiff; + float *ubarx = ctx->ubarx; + float *ubary = ctx->ubary; + float *udiff = ctx->udiff; + + float *udiff0 = ctx->udiff0; + float *div0 = ctx->div0; + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J } - for(int j1 = 0; j1 < stepY; j1 += BLOCK) { - for(int i1 = 0; i1 < dimX; i1 += BLOCK) { - for(int j2 = 0; j2 < BLOCK; j2++) { - j = j1 + j2; - for(int i2 = 0; i2 < BLOCK; i2++) { - float t[3]; - float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - - i = i1 + i2; - if (i == dimX) break; - if (j == stepY) { j2 = BLOCK; break; } - l = (j * dimX + i) * padZ; - -#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; - udiff[k] = u[l + k] - u_upd; - u[l + k] = u_upd; - - float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); - float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); - gradxdiff[k] = gradx[l + k] - gradx_upd; - gradydiff[k] = grady[l + k] - grady_upd; - gradx[l + k] = gradx_upd; - grady[l + k] = grady_upd; - - ubarx[k] = gradx_upd - theta * gradxdiff[k]; - ubary[k] = grady_upd - theta * gradydiff[k]; - - float vx = ubarx[k] + divsigma * qx[l + k]; - float vy = ubary[k] + divsigma * qy[l + k]; - - M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); - } - coefF(t, M1, M2, M3, sigma, p, q, r); - -#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - float vx = ubarx[k] + divsigma * qx[l + k]; - float vy = ubary[k] + divsigma * qy[l + k]; - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qxdiff = sigma * (ubarx[k] - gx_upd); - qydiff = sigma * (ubary[k] - gy_upd); - - qx[l + k] += qxdiff; - qy[l + k] += qydiff; - - unorm += (udiff[k] * udiff[k]); - qnorm += (qxdiff * qxdiff + qydiff * qydiff); - - float div_upd = 0; - div_upd -= (i > 0)?qx[l + k - padZ]:0; - div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; - div_upd += (i < (dimX-1))?qx[l + k]:0; - div_upd += (j < (copY-1))?qy[l + k]:0; - divdiff = div[l + k] - div_upd; - div[l + k] = div_upd; - - resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; - resdual += fabs(divsigma * qxdiff + gradxdiff[k]); - resdual += fabs(divsigma * qydiff + gradydiff[k]); - product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } } - } // i - } // j - } 
// i - } // j + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + ctx->resprimal = resprimal; - ctx->resdual = resdual; + ctx->resdual = resdual1 + resdual2; ctx->product = product; ctx->unorm = unorm; ctx->qnorm = qnorm; @@ -458,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower -// tnv_ctx.padZ = dimZ; + tnv_ctx.padZ = dimZ; // tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); - tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); @@ -580,15 +582,28 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to float udiff = ctx->udiff0[l]; ctx->div[l] -= ctx0->qy[l + m]; - ctx0->div[m + l + dimX * padZ] = ctx->div[l]; - ctx0->u[m + l + dimX * padZ] = ctx->u[l]; - + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + divdiff += ctx0->qy[l + m]; resprimal += fabs(divtau * udiff + divdiff); } } } + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + for (j = 0; j < tnv_ctx.threads; j++) { tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; resprimal += ctx->resprimal; @@ -645,6 +660,5 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to printf("Iterations stopped at %i with the residual %f \n", iter, residual); printf("Return: %f\n", *uT); -// exit(-1); return *uT; } diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index 71924751..9b19ed5c 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c @@ -335,6 +335,7 @@ static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { long i, j, k, l; + long i1, i2, j1, j2; tnv_context_t *tnv_ctx = (tnv_context_t*)data; tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; @@ -376,11 +377,12 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float theta1 = 1.0f + theta; float constant = 1.0f + taulambda; - float resprimal = 0.0; - float resdual = 0.0; - float product = 0.0; - float unorm = 0.0; - float qnorm = 0.0; + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; float udiff[dimZ] __attribute__((aligned(64))); float qxdiff __attribute__((aligned(64))); @@ -389,104 +391,82 @@ static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { float gradxdiff[dimZ] __attribute__((aligned(64))); float gradydiff[dimZ] __attribute__((aligned(64))); - for(int j1 = 0; j1 < stepY; j1 += BLOCK) { - for(int i1 = 0; i1 < dimX; i1 += BLOCK) { - for(int j2 = 0; j2 < BLOCK; j2++) { - j = 
j1 + j2; - for(int i2 = 0; i2 < BLOCK; i2++) { - float t[3]; - float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; - - i = i1 + i2; - if (i == dimX) break; - if (j == stepY) { j2 = BLOCK; break; } - l = (j * dimX + i) * padZ; - -//#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { - u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; - udiff[k] = u[l + k] - u_upd[l + k]; - unorm += (udiff[k] * udiff[k]); - - gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); - grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); - gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; - gradydiff[k] = grady[l + k] - grady_upd[l + k]; - - float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; - float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; -//#define TNV_NEW_STYLE -#ifdef TNV_NEW_STYLE - qx_upd[l + k] = qx[l + k] + sigma * ubarx; - qy_upd[l + k] = qy[l + k] + sigma * ubary; - - float vx = divsigma * qx_upd[l + k]; //+ ubarx - float vy = divsigma * qy_upd[l + k]; //+ ubary -#else - float vx = ubarx + divsigma * qx[l + k]; - float vy = ubary + divsigma * qy[l + k]; -#endif - - M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); - } - coefF(t, M1, M2, M3, sigma, p, q, r); - -//#pragma vector aligned -#pragma GCC ivdep - for(k = 0; k < dimZ; k++) { -#ifdef TNV_NEW_STYLE - float vx = divsigma * qx_upd[l + k]; - float vy = divsigma * qy_upd[l + k]; - - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qx_upd[l + k] -= sigma * gx_upd; - qy_upd[l + k] -= sigma * gy_upd; -#else - float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; - float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; - float vx = ubarx + divsigma * qx[l + k]; - float vy = ubary + divsigma * qy[l + k]; - - float gx_upd = vx * t[0] + vy * t[1]; - float gy_upd = vx * t[1] + vy * t[2]; - - qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); - qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); -#endif - - float div_upd_val = 0; - div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; - div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; - div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; - div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; - div_upd[l + k] = div_upd_val; - - qxdiff = qx[l + k] - qx_upd[l + k]; - qydiff = qy[l + k] - qy_upd[l + k]; - qnorm += (qxdiff * qxdiff + qydiff * qydiff); - - resdual += fabs(divsigma * qxdiff - gradxdiff[k]); - resdual += fabs(divsigma * qydiff - gradydiff[k]); - product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); - - if ((offY == 0)||(j > 0)) { - divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
- resprimal += fabs(divtau * udiff[k] + divdiff); - } + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_backtrack_loop.h" + } } - } // i - } // j - } // i - } // j - + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + ctx->resprimal = resprimal; - ctx->resdual = resdual; + ctx->resdual = resdual1 + resdual2; ctx->product = product; ctx->unorm = unorm; ctx->qnorm = qnorm; @@ -630,6 +610,19 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to } } + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + for (j = 0; j < tnv_ctx.threads; j++) { tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; resprimal += ctx->resprimal; diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h new file mode 100644 index 00000000..3ec42509 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h @@ -0,0 +1,100 @@ + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + l = (j * dimX + i) * padZ; + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); + +#ifdef TNV_LOOP_LAST_I + gradx_upd[l + k] = 0; +#else + gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); +#endif + +#ifdef TNV_LOOP_LAST_J + grady_upd[l + k] = 0; +#else + grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l 
+ k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + + float div_upd_val = 0; +#ifndef TNV_LOOP_FIRST_I + div_upd_val -= qx_upd[l + k - padZ]; +#endif + +#ifndef TNV_LOOP_FIRST_J + div_upd_val -= qy_upd[l + k - dimX * padZ]; +#endif +#ifndef TNV_LOOP_LAST_I + div_upd_val += qx_upd[l + k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd_val += qy_upd[l + k]; +#endif + div_upd[l + k] = div_upd_val; + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual2 += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + +#ifndef TNV_LOOP_FIRST_J + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
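+        // Primal residual |divtau * udiff + divdiff|. Under TNV_LOOP_FIRST_J it
+        // is skipped: on the first row of a thread's sub-block div_upd still
+        // depends on qy_upd from the neighbouring thread, so TNV_CPU_main adds
+        // the contribution of these border rows after the threads synchronise.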
+ resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + } diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h new file mode 100644 index 00000000..34e71398 --- /dev/null +++ b/src/Core/regularisers_CPU/TNV_core_loop.h @@ -0,0 +1,107 @@ + { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + + float *__restrict u_next = u + l + padZ; + float *__restrict u_current = u + l; + float *__restrict u_next_row = u + l + m; + + + float *__restrict qx_current = qx + l; + float *__restrict qy_current = qy + l; + float *__restrict qx_prev = qx + l - padZ; + float *__restrict qy_prev = qy + l - m; + + +// __assume(padZ%16==0); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads + udiff[k] = u[l + k] - u_upd; // cache 1w + u[l + k] = u_upd; // 1 write + +#ifdef TNV_LOOP_FIRST_J + udiff0[l + k] = udiff[k]; + div0[l + k] = div[l + k]; +#endif + +#ifdef TNV_LOOP_LAST_I + float gradx_upd = 0; +#else + float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads + float gradx_upd = (u_next_upd - u_upd); // 2 reads +#endif + +#ifdef TNV_LOOP_LAST_J + float grady_upd = 0; +#else + float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads + float grady_upd = (u_next_row_upd - u_upd); // 1 read +#endif + + gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w + gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w + gradx[l + k] = gradx_upd; // 1 write + grady[l + k] = grady_upd; // 1 write + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w + ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w + + float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read + float vy = ubary[k] + divsigma * qy[l + k]; // 1 read + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < padZ; k++) { + float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r + float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx_current[k] += qxdiff; // write 1 + qy_current[k] += qydiff; // write 1 + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + +#ifndef TNV_LOOP_FIRST_I + div_upd -= qx_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_FIRST_J + div_upd -= qy_prev[k]; // 1 read +#endif +#ifndef TNV_LOOP_LAST_I + div_upd += qx_current[k]; +#endif +#ifndef TNV_LOOP_LAST_J + div_upd += qy_current[k]; +#endif + + divdiff = div[l + k] - div_upd; // 1 read + div[l + k] = div_upd; // 1 write + +#ifndef TNV_LOOP_FIRST_J + resprimal += fabs(divtau * udiff[k] + divdiff); +#endif + + // We need to have two independent accumulators to allow gcc-autovectorization + resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r + resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + } + \ No newline at end of file From febfe9a6490052d4b8789fd8f7a0342115bfd55e Mon Sep 17 00:00:00 2001 From: "Suren A. 
Chilingaryan" Date: Sun, 29 Mar 2020 23:13:21 +0200 Subject: [PATCH 05/10] Enable padding over Z dimmension --- src/Core/CMakeLists.txt | 10 ++++++---- src/Core/regularisers_CPU/TNV_core.c | 4 ++-- src/Core/regularisers_CPU/TNV_core_backtrack.c | 4 ++-- src/Core/regularisers_CPU/TNV_core_backtrack_loop.h | 4 ++-- src/Core/regularisers_CPU/TNV_core_loop.h | 4 ++-- 5 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt index 76b0f3e5..9a53b676 100644 --- a/src/Core/CMakeLists.txt +++ b/src/Core/CMakeLists.txt @@ -60,15 +60,17 @@ message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}") message("Adding regularisers as a shared library") #set(CMAKE_C_COMPILER /opt/intel/compilers_and_libraries/linux/bin/intel64/icc) -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") -#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp -g") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -xSSE4.2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -axAVX2 -xAVX2 -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") +#set(CMAKE_C_FLAGS "-Ofast -mtune=sandybridge -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -qopt-report=5 -qopt-report-file=stdout -qopt-report-phase=vec -qopenmp") #set(CMAKE_C_COMPILER clang) #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp") +#set(CMAKE_C_COMPILER gcc-9) +set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp") #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp") -set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") +#set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp") #set(CMAKE_C_FLAGS_RELEASE "-g -gdwarf-2 -g3 -fno-omit-frame-pointer") diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c index 415c6448..7b8023bc 100755 --- a/src/Core/regularisers_CPU/TNV_core.c +++ b/src/Core/regularisers_CPU/TNV_core.c @@ -460,9 +460,9 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower - tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = dimZ; // tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); -// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c index 9b19ed5c..7eb367e3 100755 --- a/src/Core/regularisers_CPU/TNV_core_backtrack.c +++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c 
@@ -483,8 +483,8 @@ static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) tnv_ctx.dimY = dimY; tnv_ctx.dimZ = dimZ; // Padding seems actually slower - tnv_ctx.padZ = dimZ; -// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); +// tnv_ctx.padZ = dimZ; + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); hw_sched_init(); diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h index 3ec42509..2605d223 100644 --- a/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h +++ b/src/Core/regularisers_CPU/TNV_core_backtrack_loop.h @@ -3,7 +3,7 @@ l = (j * dimX + i) * padZ; -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < dimZ; k++) { u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; @@ -44,7 +44,7 @@ coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < dimZ; k++) { #ifdef TNV_NEW_STYLE diff --git a/src/Core/regularisers_CPU/TNV_core_loop.h b/src/Core/regularisers_CPU/TNV_core_loop.h index 34e71398..3f6d9bc2 100644 --- a/src/Core/regularisers_CPU/TNV_core_loop.h +++ b/src/Core/regularisers_CPU/TNV_core_loop.h @@ -17,7 +17,7 @@ // __assume(padZ%16==0); -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < dimZ; k++) { float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads @@ -59,7 +59,7 @@ coefF(t, M1, M2, M3, sigma, p, q, r); -//#pragma vector aligned +#pragma vector aligned #pragma GCC ivdep for(k = 0; k < padZ; k++) { float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r From cd0dd76a48aada43c3066642a8eb327a0374d113 Mon Sep 17 00:00:00 2001 From: "Suren A. Chilingaryan" Date: Sun, 29 Mar 2020 23:13:45 +0200 Subject: [PATCH 06/10] Add optimization steps for reference --- src/Core/performance_CPU/README | 25 + src/Core/performance_CPU/TNV_core.c.v15 | 731 ++++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v17 | 690 +++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v18 | 688 +++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v19 | 681 ++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v27 | 650 ++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v32 | 676 ++++++++++++++++ src/Core/performance_CPU/TNV_core.c.v4.stdver | 629 +++++++++++++++ .../TNV_core_backtrack_loop.h.v19 | 100 +++ src/Core/performance_CPU/TNV_core_loop.h.v32 | 119 +++ 10 files changed, 4989 insertions(+) create mode 100644 src/Core/performance_CPU/README create mode 100755 src/Core/performance_CPU/TNV_core.c.v15 create mode 100755 src/Core/performance_CPU/TNV_core.c.v17 create mode 100755 src/Core/performance_CPU/TNV_core.c.v18 create mode 100755 src/Core/performance_CPU/TNV_core.c.v19 create mode 100755 src/Core/performance_CPU/TNV_core.c.v27 create mode 100755 src/Core/performance_CPU/TNV_core.c.v32 create mode 100755 src/Core/performance_CPU/TNV_core.c.v4.stdver create mode 100644 src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 create mode 100644 src/Core/performance_CPU/TNV_core_loop.h.v32 diff --git a/src/Core/performance_CPU/README b/src/Core/performance_CPU/README new file mode 100644 index 00000000..948c544a --- /dev/null +++ b/src/Core/performance_CPU/README @@ -0,0 +1,25 @@ +TNV_core.c.v4.stdver: Fully equivalent (signle-threads). There is two potential breakage points. 
+ - If OpenMP is enabled, the access to div_upd will not be serialized and the results will break
+ - The results will slightly differ due to a different order of summation if the loop summing resprimal/resdual is organized in a logical way
+TNV_core.c.v15: Multi-threaded. Matches exactly only in single-threaded mode (and only if TNV_NEW_STYLE is disabled). With multiple threads the results slightly differ due to a changed order of operations
+ - TNV_NEW_STYLE slightly disturbs results in both single- and multi-threaded modes
+ - Resprimal/resdual are summed in groups (not sequentially) when multiple threads are used, but this should actually improve precision. Use TNV_CHECK_RES to check conformance
+ - Afterwards, in multi-threaded mode there is still a minor discrepancy which first occurs in resprimal (after a few iterations). This is caused by a changed order of operations while computing
+   div_upd (only on the first rows of each sub-block). Normally, we first compute the vertical component and then add the horizontal one; on the border rows we instead add the horizontal first...
+   To verify: with div/div_upd changed to double, there is no difference.
+TNV_core.c.v17: Computationally compatible with v15.
+ - Padding actually harms performance
+ - The Intel compiler gives about a 10% speed-up
+TNV_core.c.v18: Blocking boosts performance further, but only with the Intel compiler; gcc/clang are slightly slower here.
+ - Padding doesn't harm performance here, but is not helpful either
+ - The difference between icc and gcc is probably due to auto-vectorization
+ - Results slightly changed due to a different order of operations
+TNV_core.c.v19: Eliminates conditionals in the inner loops to help gcc auto-vectorization
+ - Last version implementing the full algorithm with backtracking in the middle of iterations
+ - Again, results slightly diverge from v18 due to a different order of operations
+TNV_core.c.v27: v18 with backtracking only during the first iterations (otherwise a warning is reported)
+TNV_core.c.v32: v19 with backtracking only during the first iterations (otherwise a warning is reported)
+
+
+Repo:
+ - Padding seems to have an effect on newer AVX2 systems. Re-enabled.
diff --git a/src/Core/performance_CPU/TNV_core.c.v15 b/src/Core/performance_CPU/TNV_core.c.v15
new file mode 100755
index 00000000..3161cbfb
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v15
@@ -0,0 +1,731 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +/*inline*/ void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, realY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + double *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int realY = ctx->realY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*realY*dimZ); + 
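// Dim1Total reserves one halo row beyond this worker's stepY rows: the
+    // first row of the next worker is mirrored locally so grady and div can
+    // be formed without reading another thread's buffers (copY = stepY + 1
+    // on all but the last thread).  Hypothetical layout for stepY = 3:
+    //   local rows 0..2 -> global rows offY..offY+2 (owned)
+    //   local row  3    -> halo copy of global row offY+3
+    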
long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    // Auxiliary vectors
+    ctx->Input = calloc(Dim1Total, sizeof(float));
+    ctx->u = calloc(Dim1Total, sizeof(float));
+    ctx->u_upd = calloc(Dim1Total, sizeof(float));
+    ctx->qx = calloc(DimTotal, sizeof(float));
+    ctx->qy = calloc(DimTotal, sizeof(float));
+    ctx->qx_upd = calloc(DimTotal, sizeof(float));
+    ctx->qy_upd = calloc(DimTotal, sizeof(float));
+    ctx->gradx = calloc(DimTotal, sizeof(float));
+    ctx->grady = calloc(DimTotal, sizeof(float));
+    ctx->gradx_upd = calloc(DimTotal, sizeof(float));
+    ctx->grady_upd = calloc(DimTotal, sizeof(float));
+    ctx->div = calloc(Dim1Total, sizeof(double));
+    ctx->div_upd = calloc(Dim1Total, sizeof(double));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int realY = ctx->realY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*realY*dimZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(double));
+    memset(ctx->u_upd, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->qy_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->grady_upd, 0, DimTotal * sizeof(float));
+    memset(ctx->div_upd, 0, Dim1Total * sizeof(double));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * dimZ + i * dimZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * dimZ + i * dimZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int realY = ctx->realY;
+    int copY = ctx->copY;
+
+    long DimTotal = (long)(dimX*realY*dimZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * dimZ + i * dimZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int realY = ctx->realY;
+    long DimTotal = (long)(dimX*realY*dimZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*dimZ);
+
+    // Auxiliary vectors
+    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+    memcpy(ctx->qx, ctx->qx_upd,
DimTotal * sizeof(float)); + memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); + memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); + memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); + memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(double)); + + return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int realY = ctx->realY; + long DimTotal = (long)(dimX*realY*dimZ); + long Dim1Total = (long)(dimX*(stepY+1)*dimZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(double)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + double *div = ctx->div; + double *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ]; + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + + for(k=0; k < dimZ * dimX; k++) { + u_upd[k] = (u[k] + tau * div[k] + taulambda * Input[k])/constant; + div_upd[k] = 0; + } + + for(j = 0; j < stepY; j++) { +/* m = j * dimX * dimZ + (dimX - 1) * dimZ; + for(k = 0; k < dimZ; k++) { + u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; + }*/ + + for(i=0; i < (dimX /*- 1*/); i++) { + l = (j * dimX + i) * dimZ; + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + m = dimX * dimZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + + gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + dimZ] - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + dimX * dimZ] - u_upd[l + k]); // We need div from the next thread on last iter + + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); +// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY + j, u[l + k], u_upd[l + k], udiff[k], unorm); + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + 
gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { + div_upd[l + k] += qx_upd[l + k]; + div_upd[l + k + dimZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { + div_upd[l + k] += qy_upd[l + k]; + div_upd[l + k + dimX * dimZ] = -qy_upd[l + k]; // We need to update div in the next thread on last iter +} + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
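+                    // Per-voxel primal residual of the PDHG iterate:
+                    //   resprimal += |(u - u_upd)/tau + (div - div_upd)|
+                    // It is accumulated only where div_upd is already complete;
+                    // the skipped border rows are re-scored in TNV_CPU_main
+                    // after the per-thread halves of div_upd are merged.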
+ resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + ctx->realY = ctx->stepY; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. 
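+ *
+ * A hypothetical call (sizes, lambda and tolerance below are illustrative
+ * only, not prescribed by this implementation):
+ *
+ *     int dimX = 256, dimY = 256, dimZ = 8;   // 8 channels
+ *     float *in  = calloc((size_t)dimX * dimY * dimZ, sizeof(float));
+ *     float *out = calloc((size_t)dimX * dimY * dimZ, sizeof(float));
+ *     // fill `in` with the noisy volume; `out` supplies the initial
+ *     // iterate and receives the regularised result
+ *     TNV_CPU_main(in, out, 1.0f, 1000, 1e-5f, dimX, dimY, dimZ);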
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (i = 1; i < tnv_ctx.threads; i++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (i - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + + l = ctx0->stepY * dimX * dimZ; + for(k=0; k < dimZ * (dimX /*- 1*/); k++) { + double div_upd_add = ctx0->div_upd[l + k]; + ctx->div_upd[k] += div_upd_add; + ctx0->div_upd[l + k] = ctx->div_upd[k]; + +// ctx0->u_upd[l + k] = ctx->u_upd[k]; + + float divdiff = ctx->div[k] - ctx->div_upd[k]; // Multiple steps... How we compute without history? 
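+                // Each border row was updated by two workers: ctx0 holds it
+                // as its halo row, ctx as its row 0.  The sum above combines
+                // the two partial divergences and mirrors the total back, so
+                // the residual below is evaluated on the completed div_upd.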
+                float udiff = ctx->u[k] - ctx->u_upd[k];
+                resprimal += fabs(divtau * udiff + divdiff);
+            }
+        }
+
+#define TNV_CHECK_RES
+#ifndef TNV_CHECK_RES
+        for (i = 0; i < tnv_ctx.threads; i++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        }
+#else
+        resprimal = 0;
+        float divsigma = 1.0f / sigma;
+        for(j=0; j<dimY; j++) for(i=0; i<dimX; i++) for(l=0; l<dimZ; l++) {
+            int thr, subj = j;
+            for (thr = 0; thr < tnv_ctx.threads; thr++) {
+                int size = tnv_ctx.thr_ctx[thr].stepY;
+                if (subj >= size) subj -= size;
+                else break;
+            }
+
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + thr;
+
+            int k = subj * dimX * dimZ + i * dimZ + l;
+
+            float udiff = ctx->u[k] - ctx->u_upd[k];
+            float qxdiff = ctx->qx[k] - ctx->qx_upd[k];
+            float qydiff = ctx->qy[k] - ctx->qy_upd[k];
+            float divdiff = ctx->div[k] - ctx->div_upd[k];
+            float gradxdiff = ctx->gradx[k] - ctx->gradx_upd[k];
+            float gradydiff = ctx->grady[k] - ctx->grady_upd[k];
+
+            resprimal += fabs(divtau*udiff + divdiff);
+            resdual += fabs(divsigma*qxdiff - gradxdiff);
+            resdual += fabs(divsigma*qydiff - gradydiff);
+
+            unorm += (udiff * udiff);
+            qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+            product += (gradxdiff * qxdiff + gradydiff * qydiff);
+        }
+#endif
+
+
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+        } else {
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v17 b/src/Core/performance_CPU/TNV_core.c.v17
new file mode 100755
index 00000000..60739cd6
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v17
@@ -0,0 +1,690 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void 
*data) {
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+
+//    printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+    // Auxiliary vectors
+    ctx->Input = malloc(Dim1Total * sizeof(float));
+    ctx->u = malloc(Dim1Total * sizeof(float));
+    ctx->u_upd = malloc(Dim1Total * sizeof(float));
+    ctx->qx = malloc(DimTotal * sizeof(float));
+    ctx->qy = malloc(DimTotal * sizeof(float));
+    ctx->qx_upd = malloc(DimTotal * sizeof(float));
+    ctx->qy_upd = malloc(DimTotal * sizeof(float));
+    ctx->gradx = malloc(DimTotal * sizeof(float));
+    ctx->grady = malloc(DimTotal * sizeof(float));
+    ctx->gradx_upd = malloc(DimTotal * sizeof(float));
+    ctx->grady_upd = malloc(DimTotal * sizeof(float));
+    ctx->div = malloc(Dim1Total * sizeof(float));
+    ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    // Auxiliary vectors
+    memcpy(ctx->u, ctx->u_upd, Dim1Total *
sizeof(float)); + memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float)); + memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float)); + memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float)); + memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float)); + memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float)); + + return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ]; + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ]; + float gradydiff[dimZ]; + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + u_upd[l] = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + div_upd[l] = 0; + } + } + + for(j = 0; j < stepY; j++) { +/* m = j * dimX * dimZ + (dimX - 1) * dimZ; + for(k = 0; k < dimZ; k++) { + u_upd[k + m] = (u[k + m] + tau * div[k + m] + taulambda * Input[k + m]) / constant; + }*/ + + for(i = 0; i < dimX/* - 1*/; i++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + l = (j * dimX + i) * padZ; + m = dimX * padZ; + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { + u_upd[l + k + m] = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; + + gradx_upd[l + k] = (i == (dimX - 1))?0:(u_upd[l + k + padZ] - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:(u_upd[l + k + m] - u_upd[l + k]); // We need div from the next thread on last iter + + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); +// if ((!k)&&(!i)) printf("%i = %f %f, %f %f\n", offY 
+ j, u[l + k], u_upd[l + k], udiff[k], unorm); + + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma unroll 64 + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + +if(i != (dimX-1)) { + div_upd[l + k] += qx_upd[l + k]; + div_upd[l + k + padZ] -= qx_upd[l + k]; +} +if(j != (copY-1)) { + div_upd[l + k] += qy_upd[l + k]; + div_upd[l + k + m] = -qy_upd[l + k]; // We need to update div in the next thread on last iter +} + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? 
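+                    // The guard above excludes row 0 of every worker except
+                    // the first: its div_upd still lacks the qy contribution
+                    // of the previous worker's halo row, so its residual is
+                    // added by the host loop once the borders are merged.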
+ resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = 64 * ((dimZ / 64) + ((dimZ % 64)?1:0)); + tnv_ctx.padZ = dimZ; + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. 
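+ *
+ * Layout note: volumes arrive channel-major, InputT[k*dimX*dimY + j*dimX + i],
+ * and tnv_start repacks them per worker into voxel-major rows with padded
+ * channels, Input[(j*dimX + i)*padZ + k], so the innermost k-loops traverse
+ * contiguous (and, with padding, alignable) memory.  Indicative helpers, not
+ * part of this file:
+ *
+ *     #define IDX_CHANNEL_MAJOR(k, j, i) ((k)*dimX*dimY + (j)*dimX + (i))
+ *     #define IDX_VOXEL_MAJOR(j, i, k)   (((j)*dimX + (i))*padZ + (k))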
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float div_upd_add = ctx0->div_upd[m + l]; + ctx->div_upd[l] += div_upd_add; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + //ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; // Multiple steps... How we compute without history? 
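+                    // v17 still reconciles thread borders by summing the two
+                    // partial divergences and mirroring the result back; the
+                    // v18 kernel below instead rebuilds div_upd locally from
+                    // its qx_upd/qy_upd neighbours, which reduces this merge
+                    // to copying the one missing qy term across the boundary.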
+                    float udiff = ctx->u[l] - ctx->u_upd[l];
+                    resprimal += fabs(divtau * udiff + divdiff);
+                }
+            }
+        }
+
+        for (j = 0; j < tnv_ctx.threads; j++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        }
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+        } else {
+            err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy);
+            if (!err) err = hw_sched_wait_task(sched);
+            if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v18 b/src/Core/performance_CPU/TNV_core.c.v18
new file mode 100755
index 00000000..71924751
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v18
@@ -0,0 +1,688 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/2 of the memory consumption and ~4x performance
+ * This version is algorithmically compatible with the original, but a slight change in results
+ * is expected due to a different order of floating-point operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +#include +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = 
(long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+    // Auxiliary vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+    ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    // Auxiliary vectors
+    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
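+    // All buffers except div_upd (a plain malloc) come from memalign(64, ...);
+    // if padZ is additionally rounded up to a multiple of 16 floats (64 bytes),
+    // every (j*dimX + i)*padZ row start stays 64-byte aligned, the
+    // precondition for the `#pragma vector aligned` hints that appear
+    // (commented out) in the update loops below.  Round-up idiom used in
+    // TNV_CPU_init:
+    //   padZ = 16 * ((dimZ / 16) + ((dimZ % 16) ? 1 : 0));
+    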
return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0; + float resdual = 0.0; + float product = 0.0; + float unorm = 0.0; + float qnorm = 0.0; + + float udiff[dimZ] __attribute__((aligned(64))); + float qxdiff __attribute__((aligned(64))); + float qydiff __attribute__((aligned(64))); + float divdiff __attribute__((aligned(64))); + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } + l = (j * dimX + i) * padZ; + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd[l + k]; + unorm += (udiff[k] * udiff[k]); + + gradx_upd[l + k] = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]); + grady_upd[l + k] = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]); + gradxdiff[k] = gradx[l + k] - gradx_upd[l + k]; + gradydiff[k] = grady[l + k] - grady_upd[l + k]; + + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; +//#define TNV_NEW_STYLE +#ifdef 
TNV_NEW_STYLE + qx_upd[l + k] = qx[l + k] + sigma * ubarx; + qy_upd[l + k] = qy[l + k] + sigma * ubary; + + float vx = divsigma * qx_upd[l + k]; //+ ubarx + float vy = divsigma * qy_upd[l + k]; //+ ubary +#else + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; +#endif + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +//#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { +#ifdef TNV_NEW_STYLE + float vx = divsigma * qx_upd[l + k]; + float vy = divsigma * qy_upd[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] -= sigma * gx_upd; + qy_upd[l + k] -= sigma * gy_upd; +#else + float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k]; + float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k]; + float vx = ubarx + divsigma * qx[l + k]; + float vy = ubary + divsigma * qy[l + k]; + + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd); + qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd); +#endif + + float div_upd_val = 0; + div_upd_val -= (i > 0)?qx_upd[l + k - padZ]:0; + div_upd_val -= (j > 0)?qy_upd[l + k - dimX * padZ]:0; + div_upd_val += (i < (dimX-1))?qx_upd[l + k]:0; + div_upd_val += (j < (copY-1))?qy_upd[l + k]:0; + div_upd[l + k] = div_upd_val; + + qxdiff = qx[l + k] - qx_upd[l + k]; + qydiff = qy[l + k] - qy_upd[l + k]; + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + resdual += fabs(divsigma * qxdiff - gradxdiff[k]); + resdual += fabs(divsigma * qydiff - gradydiff[k]); + product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + + if ((offY == 0)||(j > 0)) { + divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How we compute without history? + resprimal += fabs(divtau * udiff[k] + divdiff); + } + } + + } // i + } // j + } // i + } // j + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower + tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if (threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. 
Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); + if (!err) err = hw_sched_wait_task(sched); + 
if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); }
+
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v19 b/src/Core/performance_CPU/TNV_core.c.v19
new file mode 100755
index 00000000..9b19ed5c
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v19
@@ -0,0 +1,681 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/2 of the memory consumption and ~4x performance
+ * This version is algorithmically compatible with the original, but a slight
+ * change in results is expected due to a different order of floating-point
+ * operations.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
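+ *
+ * For reference, the primal-dual (PDHG) iteration implemented below is, in
+ * the notation of the code (a sketch, not a formal statement):
+ *   u_new = (u + tau*div(q) + tau*lambda*Input) / (1 + tau*lambda)
+ *   ubar  = (1 + theta)*u_new - theta*u
+ *   q_new = proj(q + sigma*grad(ubar))    (projection computed via coefF)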
+ */ + +#include +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd; + float *div, *div_upd; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->u_upd); + free(ctx->qx); + free(ctx->qy); + free(ctx->qx_upd); + free(ctx->qy_upd); + free(ctx->gradx); + free(ctx->grady); + free(ctx->gradx_upd); + free(ctx->grady_upd); + free(ctx->div); + free(ctx->div_upd); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = 
(long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+
+    // Auxiliary vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->u_upd = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->qx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->qy_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->grady_upd = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+    ctx->div_upd = malloc(Dim1Total * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->u_upd)||(!ctx->qx)||(!ctx->qy)||(!ctx->qx_upd)||(!ctx->qy_upd)||(!ctx->gradx)||(!ctx->grady)||(!ctx->gradx_upd)||(!ctx->grady_upd)||(!ctx->div)||(!ctx->div_upd)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_copy(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    // Double-buffer swap: promote the updated iterate to the current one
+    memcpy(ctx->u, ctx->u_upd, Dim1Total * sizeof(float));
+    memcpy(ctx->qx, ctx->qx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->qy, ctx->qy_upd, DimTotal * sizeof(float));
+    memcpy(ctx->gradx, ctx->gradx_upd, DimTotal * sizeof(float));
+    memcpy(ctx->grady, ctx->grady_upd, DimTotal * sizeof(float));
+    memcpy(ctx->div, ctx->div_upd, Dim1Total * sizeof(float));
+
+    
return 0; +} + +static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int stepY = ctx->stepY; + int copY = ctx->copY; + int padZ = tnv_ctx->padZ; + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*copY*padZ); + + // Auxiliar vectors + memcpy(ctx->u_upd, ctx->u, Dim1Total * sizeof(float)); + memcpy(ctx->qx_upd, ctx->qx, DimTotal * sizeof(float)); + memcpy(ctx->qy_upd, ctx->qy, DimTotal * sizeof(float)); + memcpy(ctx->gradx_upd, ctx->gradx, DimTotal * sizeof(float)); + memcpy(ctx->grady_upd, ctx->grady, DimTotal * sizeof(float)); + memcpy(ctx->div_upd, ctx->div, Dim1Total * sizeof(float)); + + return 0; +} + + +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l; + long i1, i2, j1, j2; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *u_upd = ctx->u_upd; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *qx_upd = ctx->qx_upd; + float *qy_upd = ctx->qy_upd; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *gradx_upd = ctx->gradx_upd; + float *grady_upd = ctx->grady_upd; + float *div = ctx->div; + float *div_upd = ctx->div_upd; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float udiff[dimZ] __attribute__((aligned(64))); + float qxdiff __attribute__((aligned(64))); + float qydiff __attribute__((aligned(64))); + float divdiff __attribute__((aligned(64))); + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_backtrack_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_backtrack_loop.h" + } + } + } // i + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_backtrack_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { 
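+        /* As in the blocks above, the first/last rows and columns are peeled
+         * off: defining TNV_LOOP_FIRST_I / TNV_LOOP_LAST_I (and the _J
+         * variants) before including TNV_core_backtrack_loop.h selects the
+         * boundary handling at compile time, so the hot interior loop stays
+         * free of per-pixel branches. */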
+# define TNV_LOOP_FIRST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_FIRST_I
+        }
+        for(i = 1; i < (dimX - 1); i++) {
+# include "TNV_core_backtrack_loop.h"
+        }
+        i = dimX - 1; {
+# define TNV_LOOP_LAST_I
+# include "TNV_core_backtrack_loop.h"
+# undef TNV_LOOP_LAST_I
+        }
+# undef TNV_LOOP_LAST_J
+    }
+
+
+    ctx->resprimal = resprimal;
+    ctx->resdual = resdual1 + resdual2;
+    ctx->product = product;
+    ctx->unorm = unorm;
+    ctx->qnorm = qnorm;
+
+    return 0;
+}
+
+static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+    int i, off, size, err;
+
+    if (sched) return;
+
+    tnv_ctx.dimX = dimX;
+    tnv_ctx.dimY = dimY;
+    tnv_ctx.dimZ = dimZ;
+    // Padding seems actually slower
+    tnv_ctx.padZ = dimZ;
+//    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+    hw_sched_init();
+
+    int threads = hw_sched_get_cpu_count();
+    if (threads > dimY) threads = dimY/2;
+
+    int step = dimY / threads;
+    int extra = dimY % threads;
+
+    tnv_ctx.threads = threads;
+    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+    for (i = 0, off = 0; i < threads; i++, off += size) {
+        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+        size = step + ((i < extra)?1:0);
+
+        ctx->offY = off;
+        ctx->stepY = size;
+
+        if (i == (threads-1)) ctx->copY = ctx->stepY;
+        else ctx->copY = ctx->stepY + 1;
+    }
+
+    sched = hw_sched_create(threads);
+    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran, see
+ * "denoisingPDHG_ipol.cpp" in Joan's Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
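+ *
+ * Example call (hypothetical values, for illustration only):
+ *   float res = TNV_CPU_main(noisy, out, 1.0f, 1000, 1e-5f, 256, 256, 8);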
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = ctx0->stepY * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + ctx->div_upd[l] -= ctx0->qy_upd[m - dimX * padZ + l]; + ctx0->div_upd[m + l] = ctx->div_upd[l]; + ctx0->u_upd[m + l] = ctx->u_upd[l]; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div[l] - ctx->div_upd[l]; + float udiff = ctx->u[l] - ctx->u_upd[l]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); } + } else { + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_copy); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling copy threads", err); exit(-1); } + + if(resprimal > dual_dot_delta) { + // Increase primal step-size and decrease dual step-size + tau = tau / (1.0f - alpha); + sigma = sigma * (1.0f - 
alpha);
+            alpha = alpha * eta;
+        } else if(resprimal < dual_div_delta) {
+            // Decrease primal step-size and increase dual step-size
+            tau = tau * (1.0f - alpha);
+            sigma = sigma / (1.0f - alpha);
+            alpha = alpha * eta;
+        }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v27 b/src/Core/performance_CPU/TNV_core.c.v27
new file mode 100755
index 00000000..dce414ac
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v27
@@ -0,0 +1,650 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Copyright 2020 Suren A. Chilingaryan
+ * Optimized version with 1/3 of the memory consumption and ~10x performance
+ * This version cannot perform back-tracking except during the first iterations.
+ * A warning is printed if back-tracking is required, and the slower version
+ * (TNV_core_backtrack.c) can be executed instead. It is still better than the
+ * original, with 1/2 of the memory consumption and a 4x performance gain.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
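+ *
+ * The memory saving comes from dropping the *_upd shadow buffers of the
+ * back-tracking variant: tnv_step below updates u, qx, qy, gradx, grady and
+ * div in place, keeping only a single row of history (div0, udiff0) to
+ * reconstruct the residuals at the thread boundary.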
+ */ + +#include "TNV_core.h" + +#define BLOCK 32 +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *qx, *qy, *gradx, *grady, *div; + float *div0, *udiff0; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long DimTotal = (long)(dimX*stepY*padZ); + long Dim1Total = (long)(dimX*(stepY+1)*padZ); + long DimRow = (long)(dimX * padZ); + + // Auxiliar vectors + ctx->Input = 
memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+
+    ctx->div0 = memalign(64, DimRow * sizeof(float));
+    ctx->udiff0 = memalign(64, DimRow * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    return 0;
+}
+
+
+static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) {
+    long i, j, k, l;
+
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    float *Input = ctx->Input;
+    float *u = ctx->u;
+    float *qx = ctx->qx;
+    float *qy = ctx->qy;
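+
+    /* Note for the residual accumulation below: because the buffers are
+     * updated in place, qxdiff/qydiff hold new - old (they are added to
+     * qx/qy), the opposite sign convention to the back-tracking variant.
+     * That is why resdual sums fabs(divsigma * qdiff + graddiff) and why
+     * product is accumulated with a negated sign. */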
+ float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float qxdiff; + float qydiff; + float divdiff; + float gradxdiff[dimZ] __attribute__((aligned(64))); + float gradydiff[dimZ] __attribute__((aligned(64))); + float ubarx[dimZ] __attribute__((aligned(64))); + float ubary[dimZ] __attribute__((aligned(64))); + float udiff[dimZ] __attribute__((aligned(64))); + + + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + float udiff = u[l] - u_upd; + ctx->udiff0[l] = udiff; + ctx->div0[l] = div[l]; + } + } + + for(int j1 = 0; j1 < stepY; j1 += BLOCK) { + for(int i1 = 0; i1 < dimX; i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + float t[3]; + float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f; + + i = i1 + i2; + if (i == dimX) break; + if (j == stepY) { j2 = BLOCK; break; } + l = (j * dimX + i) * padZ; + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; + udiff[k] = u[l + k] - u_upd; + u[l + k] = u_upd; + + float gradx_upd = (i == (dimX - 1))?0:((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd); + float grady_upd = (j == (copY - 1))?0:((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd); + gradxdiff[k] = gradx[l + k] - gradx_upd; + gradydiff[k] = grady[l + k] - grady_upd; + gradx[l + k] = gradx_upd; + grady[l + k] = grady_upd; + + ubarx[k] = gradx_upd - theta * gradxdiff[k]; + ubary[k] = grady_upd - theta * gradydiff[k]; + + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + + M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy); + } + + coefF(t, M1, M2, M3, sigma, p, q, r); + +#pragma vector aligned +#pragma GCC ivdep + for(k = 0; k < dimZ; k++) { + float vx = ubarx[k] + divsigma * qx[l + k]; + float vy = ubary[k] + divsigma * qy[l + k]; + float gx_upd = vx * t[0] + vy * t[1]; + float gy_upd = vx * t[1] + vy * t[2]; + + qxdiff = sigma * (ubarx[k] - gx_upd); + qydiff = sigma * (ubary[k] - gy_upd); + + qx[l + k] += qxdiff; + qy[l + k] += qydiff; + + unorm += (udiff[k] * udiff[k]); + qnorm += (qxdiff * qxdiff + qydiff * qydiff); + + float div_upd = 0; + div_upd -= (i > 0)?qx[l + k - padZ]:0; + div_upd -= (j > 0)?qy[l + k - dimX*padZ]:0; + div_upd += (i < (dimX-1))?qx[l + k]:0; + div_upd += (j < (copY-1))?qy[l + k]:0; + divdiff = div[l + k] - div_upd; + div[l + k] = div_upd; + + resprimal += ((offY == 0)||(j > 0))?fabs(divtau * udiff[k] + divdiff):0; + resdual += fabs(divsigma * qxdiff + gradxdiff[k]); + resdual += fabs(divsigma * qydiff + gradydiff[k]); + product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff); + } + + } // i + } // j + } // i + } // j + + + ctx->resprimal = resprimal; + ctx->resdual = resdual; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void 
TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) {
+    int i, off, size, err;
+
+    if (sched) return;
+
+    tnv_ctx.dimX = dimX;
+    tnv_ctx.dimY = dimY;
+    tnv_ctx.dimZ = dimZ;
+    // Padding seems actually slower
+//    tnv_ctx.padZ = dimZ;
+//    tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0));
+    tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0));
+
+    hw_sched_init();
+
+    int threads = hw_sched_get_cpu_count();
+    if (threads > dimY) threads = dimY/2;
+
+    int step = dimY / threads;
+    int extra = dimY % threads;
+
+    tnv_ctx.threads = threads;
+    tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t));
+    for (i = 0, off = 0; i < threads; i++, off += size) {
+        tnv_thread_t *ctx = tnv_ctx.thr_ctx + i;
+        size = step + ((i < extra)?1:0);
+
+        ctx->offY = off;
+        ctx->stepY = size;
+
+        if (i == (threads-1)) ctx->copY = ctx->stepY;
+        else ctx->copY = ctx->stepY + 1;
+    }
+
+    sched = hw_sched_create(threads);
+    if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); }
+}
+
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran, see
+ * "denoisingPDHG_ipol.cpp" in Joan's Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
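+ *
+ * Step-size adaptation (balancing principle, as implemented in TNV_CPU_main):
+ *   b = (2 tau sigma <dgrad, dq>) / (gamma (sigma ||du||^2 + tau ||dq||^2))
+ * If b > 1 the step fails: tau and sigma are scaled by beta/b and the state
+ * is rolled back (only possible before the first accepted step in this
+ * version).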
+ */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + int started = 0; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = (ctx0->stepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX * padZ] = ctx->div[l]; + ctx0->u[m + l + dimX * padZ] = ctx->u[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + } + + for (j = 0; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + resprimal += ctx->resprimal; + resdual += ctx->resdual; + product += ctx->product; + unorm += ctx->unorm; + qnorm += ctx->qnorm; + } + + residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ)); + float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm); + float dual_dot_delta = resdual * s * delta; + float dual_div_delta = (resdual * s) / delta; + printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm); + + + if(b > 1) { + + // Decrease step-sizes to fit balancing principle + tau = (beta * tau) / b; + sigma = (beta * sigma) / b; + alpha = alpha0; + + if (started) { + fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of iterative optimization! We CAN'T do it in the fast version. 
The standard TNV implementation is recommended instead.\n\n\n");
+            } else {
+                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+                if (!err) err = hw_sched_wait_task(sched);
+                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+            }
+        } else {
+            started = 1;
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+//    exit(-1);
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v32 b/src/Core/performance_CPU/TNV_core.c.v32
new file mode 100755
index 00000000..fcb2f878
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v32
@@ -0,0 +1,676 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
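+ *
+ * This variant corresponds to TNV_core.c.v27 with the per-pixel inner loop
+ * factored out into TNV_core_loop.h and the per-cell scratch arrays
+ * (gradxdiff, gradydiff, ubarx, ubary, udiff) hoisted into the thread context.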
+ */ + +#include +#include "TNV_core.h" + +#define min(a,b) (((a)<(b))?(a):(b)) + +inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) { + int ii, num; + float divsigma = 1.0f / sigma; + float sum, shrinkfactor; + float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd; + float proj[2] = {0}; + + // Compute eigenvalues of M + T = M1 + M3; + D = M1 * M3 - M2 * M2; + det = sqrtf(MAX((T * T / 4.0f) - D, 0.0f)); + eig1 = MAX((T / 2.0f) + det, 0.0f); + eig2 = MAX((T / 2.0f) - det, 0.0f); + sig1 = sqrtf(eig1); + sig2 = sqrtf(eig2); + + // Compute normalized eigenvectors + V1 = V2 = V3 = V4 = 0.0f; + + if(M2 != 0.0f) + { + v0 = M2; + v1 = eig1 - M3; + v2 = eig2 - M3; + + mu1 = sqrtf(v0 * v0 + v1 * v1); + mu2 = sqrtf(v0 * v0 + v2 * v2); + + if(mu1 > fTiny) + { + V1 = v1 / mu1; + V3 = v0 / mu1; + } + + if(mu2 > fTiny) + { + V2 = v2 / mu2; + V4 = v0 / mu2; + } + + } else + { + if(M1 > M3) + { + V1 = V4 = 1.0f; + V2 = V3 = 0.0f; + + } else + { + V1 = V4 = 0.0f; + V2 = V3 = 1.0f; + } + } + + // Compute prox_p of the diagonal entries + sig1_upd = sig2_upd = 0.0f; + + if(p == 1) + { + sig1_upd = MAX(sig1 - divsigma, 0.0f); + sig2_upd = MAX(sig2 - divsigma, 0.0f); + + } else if(p == INFNORM) + { + proj[0] = sigma * fabs(sig1); + proj[1] = sigma * fabs(sig2); + + /*l1 projection part */ + sum = fLarge; + num = 0l; + shrinkfactor = 0.0f; + while(sum > 1.0f) + { + sum = 0.0f; + num = 0; + + for(ii = 0; ii < 2; ii++) + { + proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f); + + sum += fabs(proj[ii]); + if(proj[ii]!= 0.0f) + num++; + } + + if(num > 0) + shrinkfactor = (sum - 1.0f) / num; + else + break; + } + /*l1 proj ends*/ + + sig1_upd = sig1 - divsigma * proj[0]; + sig2_upd = sig2 - divsigma * proj[1]; + } + + // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$ + if(sig1 > fTiny) + sig1_upd /= sig1; + + if(sig2 > fTiny) + sig2_upd /= sig2; + + // Compute solution + t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2; + t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4; + t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4; +} + + +#include "hw_sched.h" +typedef struct { + int offY, stepY, copY; + float *Input, *u, *qx, *qy, *gradx, *grady, *div; + float *div0, *udiff0; + float *gradxdiff, *gradydiff, *ubarx, *ubary, *udiff; + float resprimal, resdual; + float unorm, qnorm, product; +} tnv_thread_t; + +typedef struct { + int threads; + tnv_thread_t *thr_ctx; + float *InputT, *uT; + int dimX, dimY, dimZ, padZ; + float lambda, sigma, tau, theta; +} tnv_context_t; + +HWSched sched = NULL; +tnv_context_t tnv_ctx; + + +static int tnv_free(HWThread thr, void *hwctx, int device_id, void *data) { + int i,j,k; + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + free(ctx->Input); + free(ctx->u); + free(ctx->qx); + free(ctx->qy); + free(ctx->gradx); + free(ctx->grady); + free(ctx->div); + + free(ctx->div0); + free(ctx->udiff0); + + free(ctx->gradxdiff); + free(ctx->gradydiff); + free(ctx->ubarx); + free(ctx->ubary); + free(ctx->udiff); + + return 0; +} + +static int tnv_init(HWThread thr, void *hwctx, int device_id, void *data) { + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + +// printf("%i %p - %i %i %i x %i %i\n", device_id, ctx, dimX, dimY, dimZ, offY, stepY); + + long 
DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*(stepY+1)*padZ);
+    long DimRow = (long)(dimX * padZ);
+    long DimCell = (long)(padZ);
+
+    // Auxiliary vectors
+    ctx->Input = memalign(64, Dim1Total * sizeof(float));
+    ctx->u = memalign(64, Dim1Total * sizeof(float));
+    ctx->qx = memalign(64, DimTotal * sizeof(float));
+    ctx->qy = memalign(64, DimTotal * sizeof(float));
+    ctx->gradx = memalign(64, DimTotal * sizeof(float));
+    ctx->grady = memalign(64, DimTotal * sizeof(float));
+    ctx->div = memalign(64, Dim1Total * sizeof(float));
+
+    ctx->div0 = memalign(64, DimRow * sizeof(float));
+    ctx->udiff0 = memalign(64, DimRow * sizeof(float));
+
+    ctx->gradxdiff = memalign(64, DimCell * sizeof(float));
+    ctx->gradydiff = memalign(64, DimCell * sizeof(float));
+    ctx->ubarx = memalign(64, DimCell * sizeof(float));
+    ctx->ubary = memalign(64, DimCell * sizeof(float));
+    ctx->udiff = memalign(64, DimCell * sizeof(float));
+
+    if ((!ctx->Input)||(!ctx->u)||(!ctx->qx)||(!ctx->qy)||(!ctx->gradx)||(!ctx->grady)||(!ctx->div)||(!ctx->div0)||(!ctx->udiff0)||(!ctx->gradxdiff)||(!ctx->gradydiff)||(!ctx->ubarx)||(!ctx->ubary)||(!ctx->udiff)) {
+        fprintf(stderr, "Error allocating memory\n");
+        exit(-1);
+    }
+
+    return 0;
+}
+
+static int tnv_start(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+//    printf("%i %p - %i %i %i (%i) x %i %i\n", device_id, ctx, dimX, dimY, dimZ, padZ, offY, stepY);
+
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<copY; j++) {
+            for(i=0; i<dimX; i++) {
+                ctx->Input[j * dimX * padZ + i * padZ + k] = tnv_ctx->InputT[k * dimX * dimY + (j + offY) * dimX + i];
+                ctx->u[j * dimX * padZ + i * padZ + k] = tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i];
+            }
+        }
+    }
+
+    return 0;
+}
+
+static int tnv_finish(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int padZ = tnv_ctx->padZ;
+    int offY = ctx->offY;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+
+    for(k=0; k<dimZ; k++) {
+        for(j=0; j<stepY; j++) {
+            for(i=0; i<dimX; i++) {
+                tnv_ctx->uT[k * dimX * dimY + (j + offY) * dimX + i] = ctx->u[j * dimX * padZ + i * padZ + k];
+            }
+        }
+    }
+
+    return 0;
+}
+
+
+static int tnv_restore(HWThread thr, void *hwctx, int device_id, void *data) {
+    int i,j,k;
+    tnv_context_t *tnv_ctx = (tnv_context_t*)data;
+    tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id;
+
+    int dimX = tnv_ctx->dimX;
+    int dimY = tnv_ctx->dimY;
+    int dimZ = tnv_ctx->dimZ;
+    int stepY = ctx->stepY;
+    int copY = ctx->copY;
+    int padZ = tnv_ctx->padZ;
+    long DimTotal = (long)(dimX*stepY*padZ);
+    long Dim1Total = (long)(dimX*copY*padZ);
+
+    memset(ctx->u, 0, Dim1Total * sizeof(float));
+    memset(ctx->qx, 0, DimTotal * sizeof(float));
+    memset(ctx->qy, 0, DimTotal * sizeof(float));
+    memset(ctx->gradx, 0, DimTotal * sizeof(float));
+    memset(ctx->grady, 0, DimTotal * sizeof(float));
+    memset(ctx->div, 0, Dim1Total * sizeof(float));
+
+    return 0;
+}
+
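+/* Illustrative note (not used by the patch): TNV_CPU_init below sets
+ * tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)), i.e. it rounds the
+ * channel count up to a multiple of 16 floats so that every (x,y) cell starts
+ * on a 64-byte boundary. An equivalent round-up, as a sketch:
+ *
+ *   static inline int round_up16(int n) { return 16 * ((n + 15) / 16); }
+ */
+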
+ +static int tnv_step(HWThread thr, void *hwctx, int device_id, void *data) { + long i, j, k, l, m; + + tnv_context_t *tnv_ctx = (tnv_context_t*)data; + tnv_thread_t *ctx = tnv_ctx->thr_ctx + device_id; + + int dimX = tnv_ctx->dimX; + int dimY = tnv_ctx->dimY; + int dimZ = tnv_ctx->dimZ; + int padZ = tnv_ctx->padZ; + int offY = ctx->offY; + int stepY = ctx->stepY; + int copY = ctx->copY; + + float *Input = ctx->Input; + float *u = ctx->u; + float *qx = ctx->qx; + float *qy = ctx->qy; + float *gradx = ctx->gradx; + float *grady = ctx->grady; + float *div = ctx->div; + + long p = 1l; + long q = 1l; + long r = 0l; + + float lambda = tnv_ctx->lambda; + float sigma = tnv_ctx->sigma; + float tau = tnv_ctx->tau; + float theta = tnv_ctx->theta; + + float taulambda = tau * lambda; + float divtau = 1.0f / tau; + float divsigma = 1.0f / sigma; + float theta1 = 1.0f + theta; + float constant = 1.0f + taulambda; + + float resprimal = 0.0f; + float resdual1 = 0.0f; + float resdual2 = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float qxdiff; + float qydiff; + float divdiff; + float *gradxdiff = ctx->gradxdiff; + float *gradydiff = ctx->gradydiff; + float *ubarx = ctx->ubarx; + float *ubary = ctx->ubary; + float *udiff = ctx->udiff; + + float *udiff0 = ctx->udiff0; + float *div0 = ctx->div0; + +/* + for(i=0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + float u_upd = (u[l] + tau * div[l] + taulambda * Input[l])/constant; + float udiff_val = u[l] - u_upd; + udiff0[l] = udiff_val; + div0[l] = div[l]; + } + } +*/ + + + j = 0; { +# define TNV_LOOP_FIRST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_FIRST_J + } + + + + for(int j = 1; j < (copY - 1); j++) { + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + } + +#define BLOCK 32 + for(int j1 = 1; j1 < (copY - 1); j1 += BLOCK) { + for(int i1 = 1; i1 < (dimX - 1); i1 += BLOCK) { + for(int j2 = 0; j2 < BLOCK; j2 ++) { + j = j1 + j2; + for(int i2 = 0; i2 < BLOCK; i2++) { + i = i1 + i2; + + if (i == (dimX - 1)) break; + if (j == (copY - 1)) { j2 = BLOCK; break; } +# include "TNV_core_loop.h" + } + } + } // i + + } + + for(int j = 1; j < (copY - 1); j++) { + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } + } + + + + for (j = copY - 1; j < stepY; j++) { +# define TNV_LOOP_LAST_J + i = 0; { +# define TNV_LOOP_FIRST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_FIRST_I + } + for(i = 1; i < (dimX - 1); i++) { +# include "TNV_core_loop.h" + } + i = dimX - 1; { +# define TNV_LOOP_LAST_I +# include "TNV_core_loop.h" +# undef TNV_LOOP_LAST_I + } +# undef TNV_LOOP_LAST_J + } + + + + ctx->resprimal = resprimal; + ctx->resdual = resdual1 + resdual2; + ctx->product = product; + ctx->unorm = unorm; + ctx->qnorm = qnorm; + + return 0; +} + +static void TNV_CPU_init(float *InputT, float *uT, int dimX, int dimY, int dimZ) { + int i, off, size, err; + + if (sched) return; + + tnv_ctx.dimX = dimX; + tnv_ctx.dimY = dimY; + tnv_ctx.dimZ = dimZ; + // Padding seems actually slower +// tnv_ctx.padZ = dimZ; +// tnv_ctx.padZ = 4 * ((dimZ / 4) + ((dimZ % 4)?1:0)); + tnv_ctx.padZ = 16 * ((dimZ / 16) + ((dimZ % 16)?1:0)); + + hw_sched_init(); + + int threads = hw_sched_get_cpu_count(); + if 
(threads > dimY) threads = dimY/2; + + int step = dimY / threads; + int extra = dimY % threads; + + tnv_ctx.threads = threads; + tnv_ctx.thr_ctx = (tnv_thread_t*)calloc(threads, sizeof(tnv_thread_t)); + for (i = 0, off = 0; i < threads; i++, off += size) { + tnv_thread_t *ctx = tnv_ctx.thr_ctx + i; + size = step + ((i < extra)?1:0); + + ctx->offY = off; + ctx->stepY = size; + + if (i == (threads-1)) ctx->copY = ctx->stepY; + else ctx->copY = ctx->stepY + 1; + } + + sched = hw_sched_create(threads); + if (!sched) { fprintf(stderr, "Error creating threads\n"); exit(-1); } + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_init); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling init threads", err); exit(-1); } +} + + + +/* + * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1] + * The code is modified from the implementation by Joan Duran see + * "denoisingPDHG_ipol.cpp" in Joans Collaborative Total Variation package + * + * Input Parameters: + * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume + * 2. lambda - regularisation parameter + * 3. Number of iterations [OPTIONAL parameter] + * 4. eplsilon - tolerance constant [OPTIONAL parameter] + * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter] + * + * Output: + * 1. Filtered/regularized image (u) + * + * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151. + */ + +float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ) +{ + int err; + int iter; + int i,j,k,l,m; + + lambda = 1.0f/(2.0f*lambda); + tnv_ctx.lambda = lambda; + + // PDHG algorithm parameters + float tau = 0.5f; + float sigma = 0.5f; + float theta = 1.0f; + + // Backtracking parameters + float s = 1.0f; + float gamma = 0.75f; + float beta = 0.95f; + float alpha0 = 0.2f; + float alpha = alpha0; + float delta = 1.5f; + float eta = 0.95f; + + TNV_CPU_init(InputT, uT, dimX, dimY, dimZ); + + tnv_ctx.InputT = InputT; + tnv_ctx.uT = uT; + + int padZ = tnv_ctx.padZ; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_start); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling start threads", err); exit(-1); } + + + // Apply Primal-Dual Hybrid Gradient scheme + float residual = fLarge; + int started = 0; + for(iter = 0; iter < maxIter; iter++) { + float resprimal = 0.0f; + float resdual = 0.0f; + float product = 0.0f; + float unorm = 0.0f; + float qnorm = 0.0f; + + float divtau = 1.0f / tau; + + tnv_ctx.sigma = sigma; + tnv_ctx.tau = tau; + tnv_ctx.theta = theta; + + err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_step); + if (!err) err = hw_sched_wait_task(sched); + if (err) { fprintf(stderr, "Error %i scheduling tnv threads", err); exit(-1); } + + // border regions + for (j = 1; j < tnv_ctx.threads; j++) { + tnv_thread_t *ctx0 = tnv_ctx.thr_ctx + (j - 1); + tnv_thread_t *ctx = tnv_ctx.thr_ctx + j; + + m = (ctx0->stepY - 1) * dimX * padZ; + for(i = 0; i < dimX; i++) { + for(k = 0; k < dimZ; k++) { + int l = i * padZ + k; + + float divdiff = ctx->div0[l] - ctx->div[l]; + float udiff = ctx->udiff0[l]; + + ctx->div[l] -= ctx0->qy[l + m]; + ctx0->div[m + l + dimX*padZ] = ctx->div[l]; + ctx0->u[m + l + dimX*padZ] = ctx->u[l]; + + divdiff += ctx0->qy[l + m]; + resprimal += fabs(divtau * udiff + divdiff); + } + } + 
}
+
+        {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + 0;
+            for(i = 0; i < dimX; i++) {
+                for(k = 0; k < dimZ; k++) {
+                    int l = i * padZ + k;
+
+                    float divdiff = ctx->div0[l] - ctx->div[l];
+                    float udiff = ctx->udiff0[l];
+                    resprimal += fabs(divtau * udiff + divdiff);
+                }
+            }
+        }
+
+        for (j = 0; j < tnv_ctx.threads; j++) {
+            tnv_thread_t *ctx = tnv_ctx.thr_ctx + j;
+            resprimal += ctx->resprimal;
+            resdual += ctx->resdual;
+            product += ctx->product;
+            unorm += ctx->unorm;
+            qnorm += ctx->qnorm;
+        }
+
+        residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+        float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
+        float dual_dot_delta = resdual * s * delta;
+        float dual_div_delta = (resdual * s) / delta;
+        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+
+
+        if(b > 1) {
+
+            // Decrease step-sizes to fit balancing principle
+            tau = (beta * tau) / b;
+            sigma = (beta * sigma) / b;
+            alpha = alpha0;
+
+            if (started) {
+                fprintf(stderr, "\n\n\nWARNING: Back-tracking is required in the middle of the iterative optimization! The fast version cannot do this; the standard TNV implementation is recommended instead.\n\n\n");
+            } else {
+                err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_restore);
+                if (!err) err = hw_sched_wait_task(sched);
+                if (err) { fprintf(stderr, "Error %i scheduling restore threads", err); exit(-1); }
+            }
+        } else {
+            started = 1;
+            if(resprimal > dual_dot_delta) {
+                // Increase primal step-size and decrease dual step-size
+                tau = tau / (1.0f - alpha);
+                sigma = sigma * (1.0f - alpha);
+                alpha = alpha * eta;
+            } else if(resprimal < dual_div_delta) {
+                // Decrease primal step-size and increase dual step-size
+                tau = tau * (1.0f - alpha);
+                sigma = sigma / (1.0f - alpha);
+                alpha = alpha * eta;
+            }
+        }
+
+        if (residual < tol) break;
+    }
+
+    err = hw_sched_schedule_thread_task(sched, (void*)&tnv_ctx, tnv_finish);
+    if (!err) err = hw_sched_wait_task(sched);
+    if (err) { fprintf(stderr, "Error %i scheduling finish threads", err); exit(-1); }
+
+
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+    printf("Return: %f\n", *uT);
+
+    return *uT;
+}
diff --git a/src/Core/performance_CPU/TNV_core.c.v4.stdver b/src/Core/performance_CPU/TNV_core.c.v4.stdver
new file mode 100755
index 00000000..6be60ae6
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core.c.v4.stdver
@@ -0,0 +1,629 @@
+/*
+ * This work is part of the Core Imaging Library developed by
+ * Visual Analytics and Imaging System Group of the Science Technology
+ * Facilities Council, STFC
+ *
+ * Copyright 2017 Daniil Kazantsev
+ * Copyright 2017 Srikanth Nagella, Edoardo Pasca
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
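+ *
+ * This file appears to be the reference implementation without the hw_sched
+ * thread decomposition, kept for comparison with the optimized variants above.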
+ */
+
+#include "TNV_core.h"
+
+
+inline void coefF(float *t, float M1, float M2, float M3, float sigma, int p, int q, int r) {
+    int ii, num;
+    float divsigma = 1.0f / sigma;
+    float sum, shrinkfactor;
+    float T,D,det,eig1,eig2,sig1,sig2,V1, V2, V3, V4, v0,v1,v2, mu1,mu2,sig1_upd,sig2_upd;
+    float proj[2] = {0};
+
+    // Compute eigenvalues of M
+    T = M1 + M3;
+    D = M1 * M3 - M2 * M2;
+    det = sqrt(MAX((T * T / 4.0f) - D, 0.0f));
+    eig1 = MAX((T / 2.0f) + det, 0.0f);
+    eig2 = MAX((T / 2.0f) - det, 0.0f);
+    sig1 = sqrt(eig1);
+    sig2 = sqrt(eig2);
+
+    // Compute normalized eigenvectors
+    V1 = V2 = V3 = V4 = 0.0f;
+
+    if(M2 != 0.0f)
+    {
+        v0 = M2;
+        v1 = eig1 - M3;
+        v2 = eig2 - M3;
+
+        mu1 = sqrtf(v0 * v0 + v1 * v1);
+        mu2 = sqrtf(v0 * v0 + v2 * v2);
+
+        if(mu1 > fTiny)
+        {
+            V1 = v1 / mu1;
+            V3 = v0 / mu1;
+        }
+
+        if(mu2 > fTiny)
+        {
+            V2 = v2 / mu2;
+            V4 = v0 / mu2;
+        }
+
+    } else
+    {
+        if(M1 > M3)
+        {
+            V1 = V4 = 1.0f;
+            V2 = V3 = 0.0f;
+
+        } else
+        {
+            V1 = V4 = 0.0f;
+            V2 = V3 = 1.0f;
+        }
+    }
+
+    // Compute prox_p of the diagonal entries
+    sig1_upd = sig2_upd = 0.0f;
+
+    if(p == 1)
+    {
+        sig1_upd = MAX(sig1 - divsigma, 0.0f);
+        sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+    } else if(p == INFNORM)
+    {
+        proj[0] = sigma * fabs(sig1);
+        proj[1] = sigma * fabs(sig2);
+
+        /*l1 projection part */
+        sum = fLarge;
+        num = 0l;
+        shrinkfactor = 0.0f;
+        while(sum > 1.0f)
+        {
+            sum = 0.0f;
+            num = 0;
+
+            for(ii = 0; ii < 2; ii++)
+            {
+                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+                sum += fabs(proj[ii]);
+                if(proj[ii]!= 0.0f)
+                    num++;
+            }
+
+            if(num > 0)
+                shrinkfactor = (sum - 1.0f) / num;
+            else
+                break;
+        }
+        /*l1 proj ends*/
+
+        sig1_upd = sig1 - divsigma * proj[0];
+        sig2_upd = sig2 - divsigma * proj[1];
+    }
+
+    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+    if(sig1 > fTiny)
+        sig1_upd /= sig1;
+
+    if(sig2 > fTiny)
+        sig2_upd /= sig2;
+
+    // Compute solution
+    t[0] = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+    t[1] = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+    t[2] = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+}
+
+
+/*
+ * C-OMP implementation of Total Nuclear Variation regularisation model (2D + channels) [1]
+ * The code is modified from the implementation by Joan Duran, see
+ * "denoisingPDHG_ipol.cpp" in Joan's Collaborative Total Variation package
+ *
+ * Input Parameters:
+ * 1. Noisy volume of 2D + channel dimension, i.e. 3D volume
+ * 2. lambda - regularisation parameter
+ * 3. Number of iterations [OPTIONAL parameter]
+ * 4. epsilon - tolerance constant [OPTIONAL parameter]
+ * 5. print information: 0 (off) or 1 (on) [OPTIONAL parameter]
+ *
+ * Output:
+ * 1. Filtered/regularized image (u)
+ *
+ * [1]. Duran, J., Moeller, M., Sbert, C. and Cremers, D., 2016. Collaborative total variation: a general framework for vectorial TV models. SIAM Journal on Imaging Sciences, 9(1), pp.116-151.
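+ *
+ * Illustrative call (added for clarity; the array names and values below are
+ * made up for this example, and the data is laid out channel-major, i.e. as a
+ * [dimZ][dimY][dimX] float array):
+ *     TNV_CPU_main(noisy, denoised, 1.0f, 100, 1e-5f, 256, 256, 3);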
+ */
+
+float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float tol, int dimX, int dimY, int dimZ)
+{
+    long i, j, k, p, q, r, DimTotal;
+    float taulambda;
+    float *u_upd, *qx, *qy, *qx_upd, *qy_upd, *gradx, *grady, *gradx_upd, *grady_upd, *div, *div_upd;
+
+    p = 1l;
+    q = 1l;
+    r = 0l;
+
+    lambda = 1.0f/(2.0f*lambda);
+    DimTotal = (long)(dimX*dimY*dimZ);
+    /* PDHG algorithm parameters*/
+    float tau = 0.5f;
+    float sigma = 0.5f;
+    float theta = 1.0f;
+
+    // Auxiliary vectors
+    u_upd = calloc(DimTotal, sizeof(float));
+    qx = calloc(DimTotal, sizeof(float));
+    qy = calloc(DimTotal, sizeof(float));
+    qx_upd = calloc(DimTotal, sizeof(float));
+    qy_upd = calloc(DimTotal, sizeof(float));
+    gradx = calloc(DimTotal, sizeof(float));
+    grady = calloc(DimTotal, sizeof(float));
+    gradx_upd = calloc(DimTotal, sizeof(float));
+    grady_upd = calloc(DimTotal, sizeof(float));
+    div = calloc(DimTotal, sizeof(float));
+    div_upd = calloc(DimTotal, sizeof(float));
+
+
+    float *Input = calloc(DimTotal, sizeof(float));
+    float *u = calloc(DimTotal, sizeof(float));
+    for(k=0; k 1)
+    {
+        // Decrease step-sizes to fit balancing principle
+        tau = (beta * tau) / b;
+        sigma = (beta * sigma) / b;
+        alpha = alpha0;
+
+        copyIm(u, u_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(qx, qx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(qy, qy_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(gradx, gradx_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(grady, grady_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+        copyIm(div, div_upd, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+    } else if(resprimal > dual_dot_delta)
+    {
+        // Increase primal step-size and decrease dual step-size
+        tau = tau / (1.0f - alpha);
+        sigma = sigma * (1.0f - alpha);
+        alpha = alpha * eta;
+
+    } else if(resprimal < dual_div_delta)
+    {
+        // Decrease primal step-size and increase dual step-size
+        tau = tau * (1.0f - alpha);
+        sigma = sigma / (1.0f - alpha);
+        alpha = alpha * eta;
+    }
+
+    // Update variables
+    taulambda = tau * lambda;
+    //for(k=0; k < dimZ; k++) taulambda[k] = tau*lambda[k];
+
+    divsigma = 1.0f / sigma;
+    divtau = 1.0f / tau;
+
+    copyIm(u_upd, u, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(qx_upd, qx, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(qy_upd, qy, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(gradx_upd, gradx, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(grady_upd, grady, (long)(dimX), (long)(dimY), (long)(dimZ));
+    copyIm(div_upd, div, (long)(dimX), (long)(dimY), (long)(dimZ));
+
+    // Compute residual at current iteration
+    residual = (resprimal + resdual) / ((float) (dimX*dimY*dimZ));
+
+    //    printf("%f \n", residual);
+    if (residual < tol) {
+        printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+        break;
+    }
+
+    }
+    printf("Iterations stopped at %i with the residual %f \n", iter, residual);
+
+    free(u_upd);
+    free(qx);
+    free(qy);
+    free(qx_upd);
+    free(qy_upd);
+    free(gradx);
+    free(grady);
+    free(gradx_upd);
+    free(grady_upd);
+    free(div);
+    free(div_upd);
+    printf("Return: %f\n", *u);
+
+    for(k=0; k fTiny)
+    {
+        V1 = v1 / mu1;
+        V3 = v0 / mu1;
+    }
+
+    if(mu2 > fTiny)
+    {
+        V2 = v2 / mu2;
+        V4 = v0 / mu2;
+    }
+
+    } else
+    {
+        if(M1 > M3)
+        {
+            V1 = V4 = 1.0f;
+            V2 = V3 = 0.0f;
+
+        } else
+        {
+            V1 = V4 = 0.0f;
+            V2 = V3 = 1.0f;
+        }
+    }
+
+    // Compute prox_p of the diagonal entries
+    sig1_upd = sig2_upd = 0.0f;
+
+    if(p == 1)
+    {
+        sig1_upd = MAX(sig1 - divsigma, 0.0f);
+        sig2_upd = MAX(sig2 - divsigma, 0.0f);
+
+    } else if(p == INFNORM)
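+    // (Descriptive note, added for clarity: p selects the prox operator
+    // applied to the singular values. The p == 1 branch above soft-thresholds
+    // them, matching the nuclear norm, while the INFNORM branch below
+    // projects them onto the l1 ball, the dual of the spectral norm.)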
+    {
+        proj[0] = sigma * fabs(sig1);
+        proj[1] = sigma * fabs(sig2);
+
+        /*l1 projection part */
+        sum = fLarge;
+        num = 0l;
+        shrinkfactor = 0.0f;
+        while(sum > 1.0f)
+        {
+            sum = 0.0f;
+            num = 0;
+
+            for(ii = 0; ii < 2; ii++)
+            {
+                proj[ii] = MAX(proj[ii] - shrinkfactor, 0.0f);
+
+                sum += fabs(proj[ii]);
+                if(proj[ii]!= 0.0f)
+                    num++;
+            }
+
+            if(num > 0)
+                shrinkfactor = (sum - 1.0f) / num;
+            else
+                break;
+        }
+        /*l1 proj ends*/
+
+        sig1_upd = sig1 - divsigma * proj[0];
+        sig2_upd = sig2 - divsigma * proj[1];
+    }
+
+    // Compute the diagonal entries of $\widehat{\Sigma}\Sigma^{\dagger}_0$
+    if(sig1 > fTiny)
+        sig1_upd /= sig1;
+
+    if(sig2 > fTiny)
+        sig2_upd /= sig2;
+
+    // Compute solution
+    t1 = sig1_upd * V1 * V1 + sig2_upd * V2 * V2;
+    t2 = sig1_upd * V1 * V3 + sig2_upd * V2 * V4;
+    t3 = sig1_upd * V3 * V3 + sig2_upd * V4 * V4;
+
+    for(k = 0; k < dimZ; k++)
+    {
+        gx[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t1 + vy[(dimX*dimY)*k + j*dimX + i] * t2;
+        gy[(dimX*dimY)*k + j*dimX + i] = vx[(dimX*dimY)*k + j*dimX + i] * t2 + vy[(dimX*dimY)*k + j*dimX + i] * t3;
+    }
+
+    // Delete allocated memory
+    free(proj);
+    }
+    }
+
+    return 1;
+}
+
+float divergence(float *qx_upd, float *qy_upd, float *div_upd, long dimX, long dimY, long dimZ)
+{
+    long i, j, k, l;
+    #pragma omp parallel for shared(qx_upd,qy_upd,div_upd) private(i, j, k, l)
+    for(k = 0; k < dimZ; k++) {
+        for(j = 0; j < dimY; j++) {
+            l = j * dimX;
+            for(i = 0; i < dimX; i++) {
+                if(i != dimX-1)
+                {
+                    // ux[k][i+l] = u[k][i+1+l] - u[k][i+l]
+                    div_upd[(dimX*dimY)*k + i+1+l] -= qx_upd[(dimX*dimY)*k + i+l];
+                    div_upd[(dimX*dimY)*k + i+l] += qx_upd[(dimX*dimY)*k + i+l];
+                }
+
+                if(j != dimY-1)
+                {
+                    // uy[k][i+l] = u[k][i+width+l] - u[k][i+l]
+                    //div_upd[(dimX*dimY)*k + i+dimY+l] -= qy_upd[(dimX*dimY)*k + i+l];
+                    div_upd[(dimX*dimY)*k + i+(j+1)*dimX] -= qy_upd[(dimX*dimY)*k + i+l];
+                    div_upd[(dimX*dimY)*k + i+l] += qy_upd[(dimX*dimY)*k + i+l];
+                }
+            }
+        }
+    }
+    return *div_upd;
+}
diff --git a/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19 b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19
new file mode 100644
index 00000000..3ec42509
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core_backtrack_loop.h.v19
@@ -0,0 +1,100 @@
+    float t[3];
+    float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+
+    l = (j * dimX + i) * padZ;
+
+//#pragma vector aligned
+#pragma GCC ivdep
+    for(k = 0; k < dimZ; k++) {
+        u_upd[l + k] = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant;
+        udiff[k] = u[l + k] - u_upd[l + k];
+        unorm += (udiff[k] * udiff[k]);
+
+#ifdef TNV_LOOP_LAST_I
+        gradx_upd[l + k] = 0;
+#else
+        gradx_upd[l + k] = ((u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant - u_upd[l + k]);
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+        grady_upd[l + k] = 0;
+#else
+        grady_upd[l + k] = ((u[l + k + dimX*padZ] + tau * div[l + k + dimX*padZ] + taulambda * Input[l + k + dimX*padZ]) / constant - u_upd[l + k]);
+#endif
+
+        gradxdiff[k] = gradx[l + k] - gradx_upd[l + k];
+        gradydiff[k] = grady[l + k] - grady_upd[l + k];
+
+        float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+        float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+//#define TNV_NEW_STYLE
+#ifdef TNV_NEW_STYLE
+        qx_upd[l + k] = qx[l + k] + sigma * ubarx;
+        qy_upd[l + k] = qy[l + k] + sigma * ubary;
+
+        float vx = divsigma * qx_upd[l + k]; //+ ubarx
+        float vy = divsigma * qy_upd[l + k]; //+ ubary
+#else
+        float vx = ubarx + divsigma * qx[l + k];
+        float vy = ubary + divsigma * qy[l + k];
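+        // (Descriptive note, added for clarity: vx/vy is the per-channel
+        // gradient estimate shifted by the scaled dual variable. The
+        // accumulation just below builds the 2x2 Gram matrix
+        // [M1 M2; M2 M3] = sum_k (vx,vy)(vx,vy)^T over the channels, which
+        // coefF() eigendecomposes to obtain the shrink coefficients t[0..2].)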
+#endif
+
+        M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+    }
+
+    coefF(t, M1, M2, M3, sigma, p, q, r);
+
+//#pragma vector aligned
+#pragma GCC ivdep
+    for(k = 0; k < dimZ; k++) {
+#ifdef TNV_NEW_STYLE
+        float vx = divsigma * qx_upd[l + k];
+        float vy = divsigma * qy_upd[l + k];
+
+        float gx_upd = vx * t[0] + vy * t[1];
+        float gy_upd = vx * t[1] + vy * t[2];
+
+        qx_upd[l + k] -= sigma * gx_upd;
+        qy_upd[l + k] -= sigma * gy_upd;
+#else
+        float ubarx = theta1 * gradx_upd[l + k] - theta * gradx[l + k];
+        float ubary = theta1 * grady_upd[l + k] - theta * grady[l + k];
+        float vx = ubarx + divsigma * qx[l + k];
+        float vy = ubary + divsigma * qy[l + k];
+
+        float gx_upd = vx * t[0] + vy * t[1];
+        float gy_upd = vx * t[1] + vy * t[2];
+
+        qx_upd[l + k] = qx[l + k] + sigma * (ubarx - gx_upd);
+        qy_upd[l + k] = qy[l + k] + sigma * (ubary - gy_upd);
+#endif
+
+        float div_upd_val = 0;
+#ifndef TNV_LOOP_FIRST_I
+        div_upd_val -= qx_upd[l + k - padZ];
+#endif
+
+#ifndef TNV_LOOP_FIRST_J
+        div_upd_val -= qy_upd[l + k - dimX * padZ];
+#endif
+#ifndef TNV_LOOP_LAST_I
+        div_upd_val += qx_upd[l + k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+        div_upd_val += qy_upd[l + k];
+#endif
+        div_upd[l + k] = div_upd_val;
+
+        qxdiff = qx[l + k] - qx_upd[l + k];
+        qydiff = qy[l + k] - qy_upd[l + k];
+        qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+        resdual1 += fabs(divsigma * qxdiff - gradxdiff[k]);
+        resdual2 += fabs(divsigma * qydiff - gradydiff[k]);
+        product += (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+
+#ifndef TNV_LOOP_FIRST_J
+        divdiff = div[l + k] - div_upd[l + k]; // Multiple steps... How do we compute this without history?
+        resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+    }
diff --git a/src/Core/performance_CPU/TNV_core_loop.h.v32 b/src/Core/performance_CPU/TNV_core_loop.h.v32
new file mode 100644
index 00000000..be531566
--- /dev/null
+++ b/src/Core/performance_CPU/TNV_core_loop.h.v32
@@ -0,0 +1,119 @@
+    {
+        float t[3];
+        float M1 = 0.0f, M2 = 0.0f, M3 = 0.0f;
+        l = (j * dimX + i) * padZ;
+        m = dimX * padZ;
+
+        float *__restrict u_next = u + l + padZ;
+        float *__restrict u_current = u + l;
+        float *__restrict u_next_row = u + l + m;
+
+
+        float *__restrict qx_current = qx + l;
+        float *__restrict qy_current = qy + l;
+        float *__restrict qx_prev = qx + l - padZ;
+        float *__restrict qy_prev = qy + l - m;
+
+
+        __assume(padZ%16==0);
+/*
+        __assume_aligned(Input, 64);
+        __assume_aligned(div, 64);
+        __assume_aligned(gradx, 64);
+        __assume_aligned(grady, 64);
+        __assume_aligned(u, 64);
+        __assume_aligned(qx, 64);
+        __assume_aligned(qy, 64);
+        __assume_aligned(u_current, 64);
+        __assume_aligned(u_next, 64);
+        __assume_aligned(u_next_row, 64);
+*/
+
+#pragma vector aligned
+#pragma GCC ivdep
+        for(k = 0; k < dimZ; k++) {
+            float u_upd = (u[l + k] + tau * div[l + k] + taulambda * Input[l + k]) / constant; // 3 reads
+            udiff[k] = u[l + k] - u_upd; // cache 1w
+            u[l + k] = u_upd; // 1 write
+
+#ifdef TNV_LOOP_FIRST_J
+            udiff0[l + k] = udiff[k];
+            div0[l + k] = div[l + k];
+#endif
+
+#ifdef TNV_LOOP_LAST_I
+            float gradx_upd = 0;
+#else
+            float u_next_upd = (u[l + k + padZ] + tau * div[l + k + padZ] + taulambda * Input[l + k + padZ]) / constant; // 3 reads
+            float gradx_upd = (u_next_upd - u_upd); // 2 reads
+#endif
+
+#ifdef TNV_LOOP_LAST_J
+            float grady_upd = 0;
+#else
+            float u_next_row_upd = (u[l + k + m] + tau * div[l + k + m] + taulambda * Input[l + k + m]) / constant; // 3 reads
+            float grady_upd = (u_next_row_upd - u_upd); // 1 read
+#endif
+
+            gradxdiff[k] = gradx[l + k] - gradx_upd; // 1 read, cache 1w
+            gradydiff[k] = grady[l + k] - grady_upd; // 1 read, cache 1w
+            gradx[l + k] = gradx_upd; // 1 write
+            grady[l + k] = grady_upd; // 1 write
+
+            ubarx[k] = gradx_upd - theta * gradxdiff[k]; // cache 1w
+            ubary[k] = grady_upd - theta * gradydiff[k]; // cache 1w
+
+            float vx = ubarx[k] + divsigma * qx[l + k]; // 1 read
+            float vy = ubary[k] + divsigma * qy[l + k]; // 1 read
+
+            M1 += (vx * vx); M2 += (vx * vy); M3 += (vy * vy);
+        }
+
+        coefF(t, M1, M2, M3, sigma, p, q, r);
+
+#pragma vector aligned
+#pragma GCC ivdep
+        for(k = 0; k < padZ; k++) {
+            float vx = ubarx[k] + divsigma * qx_current[k]; // cache 2r
+            float vy = ubary[k] + divsigma * qy_current[k]; // cache 2r
+            float gx_upd = vx * t[0] + vy * t[1];
+            float gy_upd = vx * t[1] + vy * t[2];
+
+            qxdiff = sigma * (ubarx[k] - gx_upd);
+            qydiff = sigma * (ubary[k] - gy_upd);
+
+            qx_current[k] += qxdiff; // write 1
+            qy_current[k] += qydiff; // write 1
+
+            unorm += (udiff[k] * udiff[k]);
+            qnorm += (qxdiff * qxdiff + qydiff * qydiff);
+
+            float div_upd = 0;
+
+#ifndef TNV_LOOP_FIRST_I
+            div_upd -= qx_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_FIRST_J
+            div_upd -= qy_prev[k]; // 1 read
+#endif
+#ifndef TNV_LOOP_LAST_I
+            div_upd += qx_current[k];
+#endif
+#ifndef TNV_LOOP_LAST_J
+            div_upd += qy_current[k];
+#endif
+
+            divdiff = div[l + k] - div_upd; // 1 read
+            div[l + k] = div_upd; // 1 write
+
+#ifndef TNV_LOOP_FIRST_J
+            resprimal += fabs(divtau * udiff[k] + divdiff);
+#endif
+
+            // We need two independent accumulators to allow gcc auto-vectorization
+            resdual1 += fabs(divsigma * qxdiff + gradxdiff[k]); // cache 1r
+            resdual2 += fabs(divsigma * qydiff + gradydiff[k]); // cache 1r
+            product -= (gradxdiff[k] * qxdiff + gradydiff[k] * qydiff);
+        }
+    }
\ No newline at end of file
From f59a2ea520ba9fb453582b9a4246f3f8e5257474 Mon Sep 17 00:00:00 2001
From: "Suren A. Chilingaryan"
Date: Sun, 29 Mar 2020 23:31:17 +0200
Subject: [PATCH 07/10] Remove verification output

---
 src/Core/regularisers_CPU/TNV_core.c           | 4 ++--
 src/Core/regularisers_CPU/TNV_core_backtrack.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Core/regularisers_CPU/TNV_core.c b/src/Core/regularisers_CPU/TNV_core.c
index 7b8023bc..21f228d5 100755
--- a/src/Core/regularisers_CPU/TNV_core.c
+++ b/src/Core/regularisers_CPU/TNV_core.c
@@ -617,7 +617,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
         float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
         float dual_dot_delta = resdual * s * delta;
         float dual_div_delta = (resdual * s) / delta;
-        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+//        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
@@ -658,7 +658,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to

     printf("Iterations stopped at %i with the residual %f \n", iter, residual);
-    printf("Return: %f\n", *uT);
+//    printf("Return: %f\n", *uT);

     return *uT;
 }
diff --git a/src/Core/regularisers_CPU/TNV_core_backtrack.c b/src/Core/regularisers_CPU/TNV_core_backtrack.c
index 7eb367e3..f166f5be 100755
--- a/src/Core/regularisers_CPU/TNV_core_backtrack.c
+++ b/src/Core/regularisers_CPU/TNV_core_backtrack.c
@@ -636,7 +636,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to
         float b = (2.0f * tau * sigma * product) / (gamma * sigma * unorm + gamma * tau * qnorm);
         float dual_dot_delta = resdual * s * delta;
         float dual_div_delta = (resdual * s) / delta;
-        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
+//        printf("resprimal: %f, resdual: %f, b: %f (product: %f, unorm: %f, qnorm: %f)\n", resprimal, resdual, b, product, unorm, qnorm);
@@ -675,7 +675,7 @@ float TNV_CPU_main(float *InputT, float *uT, float lambda, int maxIter, float to

     printf("Iterations stopped at %i with the residual %f \n", iter, residual);
-    printf("Return: %f\n", *uT);
+//    printf("Return: %f\n", *uT);

     return *uT;
 }
From a08cd34eed5620e810dab60072237d462969cba5 Mon Sep 17 00:00:00 2001
From: Edoardo Pasca
Date: Sun, 5 Apr 2020 08:07:28 +0100
Subject: [PATCH 08/10] initial integration in CMake

---
 src/Core/CMakeLists.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index 35d5bb41..e639e1f2 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -60,7 +60,7 @@ message("Adding regularisers as a shared library")
 #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopenmp")
 #set(CMAKE_C_COMPILER gcc-9)
-set(CMAKE_C_FLAGS "-march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp")
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -fopenmp")
 #set(CMAKE_C_FLAGS "-march=nocona -msse -msse2 -msse3 -mssse3 -msse4 -msse4.1 -msse4.2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=128 -fopenmp")
 #set(CMAKE_C_FLAGS "-march=native -mavx2 -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
 #set(CMAKE_C_FLAGS "-march=native -mavx512f -mavx512dq -mavx512bw -mavx512vbmi -mavx512vbmi2 -mavx512vl -ftree-vectorize -fopt-info-vec-optimized -fopt-info-vec -mprefer-vector-width=512 -fopenmp")
@@ -81,6 +81,7 @@ add_library(cilreg SHARED
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/ROF_TV_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/FGP_dTV_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core.c
+  #${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/TNV_core_fast.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/Nonlocal_TV_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/PatchSelect_core.c
   ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/utils.c
@@ -91,9 +92,9 @@ target_link_libraries(cilreg ${EXTRA_LIBRARIES} ${GLIB2_LIBRARIES} ${GTHREAD2_LI
 include_directories(cilreg PUBLIC
   ${GLIB2_INCLUDE_DIR}
   ${LIBRARY_INC}/include
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/
-  ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/ )
+  ${CMAKE_CURRENT_SOURCE_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR}/regularisers_CPU/
+  ${CMAKE_CURRENT_SOURCE_DIR}/inpainters_CPU/ )

 ## Install
From 10c3df58c5f4c0649c1c23607f430f8a420b6f6f Mon Sep 17 00:00:00 2001
From: Edoardo Pasca
Date: Mon, 6 Apr 2020 13:44:06 +0100
Subject: [PATCH 09/10] fix build with conda packages

---
 CMakeLists.txt        | 19 +++++++++++++++++++
 cmake/FindGLIB2.cmake | 11 +++++++----
 2 files changed, 26 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index afbff2da..0471a017 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,5 +56,24 @@ endif()
 message(STATUS "Python wrappers will be installed in " ${PYTHON_DEST})
+
+# include(ExternalProject)
+# ExternalProject_Add(
+#   GLIB2
+#   #URL http://download.dre.vanderbilt.edu/previous_versions/ACE-${ACE_VERSION}.zip
+#   GIT_REPOSITORY https://github.com/GNOME/glib.git
+#   GIT_TAG 2.62.6
+#   CONFIGURE_COMMAND meson _build
+#   BUILD_COMMAND ninja -C _build
+#   INSTALL_COMMAND ninja -C _build install
+#   LOG_INSTALL 1
+#   BINARY_DIR ${CMAKE_BINARY_DIR}/build/
+#   INSTALL_DIR ${LIBRARY_DIR}
+#   STAMP_DIR ${CMAKE_BINARY_DIR}/stamp
+#   TMP_DIR ${CMAKE_BINARY_DIR}/tmp
+#   DOWNLOAD_DIR ${CMAKE_BINARY_DIR}/download
+#   SOURCE_DIR ${CMAKE_BINARY_DIR}/source
+# )
+
 #add_subdirectory(src/Core)
 add_subdirectory(src)
diff --git a/cmake/FindGLIB2.cmake b/cmake/FindGLIB2.cmake
index 2940d9f2..75297ab6 100644
--- a/cmake/FindGLIB2.cmake
+++ b/cmake/FindGLIB2.cmake
@@ -7,15 +7,18 @@
 # GLIB2_LIBRARIES - glib2 library

 set(GLIB2_DIR GLIB2_DIR-NOTFOUND CACHE PATH "Location of GLIB2 package")
-
-if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+message(STATUS "GLIB2_INCLUDE_DIR > ${GLIB2_INCLUDE_DIR} <")
+message(STATUS "GLIB2_LIBRARIES > ${GLIB2_LIBRARIES} <")
+if(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARY_DIR)
   # Already in cache, be silent
+  message(STATUS "Should have variables set")
   set(GLIB2_FIND_QUIETLY TRUE)
-endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARIES)
+  set (GLIB2_DIR ${GLIB2_LIBRARY_DIR})
+endif(GLIB2_INCLUDE_DIR AND GLIB2_LIBRARY_DIR)

 if (GLIB2_DIR)
   set(PKG_GLIB_LIBRARY_DIRS ${GLIB2_DIR}/lib${CMAKE_BUILD_ARCH} ${GLIB2_DIR}/lib)
-  set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/)
+  set(PKG_GLIB_INCLUDE_DIRS ${GLIB2_DIR}/include/ ${GLIB2_DIR}/lib/glib-2.0/include)
 else (GLIB2_DIR)
   if (NOT WIN32)
     find_package(PkgConfig REQUIRED)
From 8589ac62236b352a0ac3f4ff6cb8f5f824a65fdb Mon Sep 17 00:00:00 2001
From: Edoardo Pasca
Date: Mon, 6 Apr 2020 14:07:35 +0100
Subject: [PATCH 10/10] fix linking and conda recipe

---
 recipe/build.sh         | 2 +-
 recipe/meta.yaml        | 2 ++
 src/Core/CMakeLists.txt | 9 +++++----
 3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/recipe/build.sh b/recipe/build.sh
index 6ba7e275..a5debcd9 100644
--- a/recipe/build.sh
+++ b/recipe/build.sh
@@ -3,6 +3,6 @@
 cp -rv "$RECIPE_DIR/../test" "$SRC_DIR/"
 cd $SRC_DIR

-cmake -G "Unix Makefiles" $RECIPE_DIR/../ -DBUILD_PYTHON_WRAPPER=ON -DCONDA_BUILD=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE="Release" -DLIBRARY_LIB=$CONDA_PREFIX/lib -DLIBRARY_INC=$CONDA_PREFIX -DCMAKE_INSTALL_PREFIX=$PREFIX
+cmake -G "Unix Makefiles" $RECIPE_DIR/../ -DBUILD_PYTHON_WRAPPER=ON -DCONDA_BUILD=ON -DBUILD_CUDA=ON -DCMAKE_BUILD_TYPE="Release" -DLIBRARY_LIB=$CONDA_PREFIX/lib -DLIBRARY_INC=$CONDA_PREFIX -DCMAKE_INSTALL_PREFIX=$PREFIX -DGLIB2_INCLUDE_DIR=$CONDA_PREFIX -DGLIB2_LIBRARY_DIR=$CONDA_PREFIX

 make install
diff --git a/recipe/meta.yaml b/recipe/meta.yaml
index 10901029..7d805948 100644
--- a/recipe/meta.yaml
+++ b/recipe/meta.yaml
@@ -28,6 +28,7 @@ requirements:
     - vc 9 # [win and py27]
     - cmake
     - ripgrep
+    - glib # [unix]

   run:
     - {{ pin_compatible('numpy', max_pin='x.x') }}
@@ -37,6 +38,7 @@ requirements:
     - vc 14 # [win and py35]
     - vc 9 # [win and py27]
     - libgcc-ng # [unix]
+    - glib # [unix]

 about:
   home: http://www.ccpi.ac.uk
diff --git a/src/Core/CMakeLists.txt b/src/Core/CMakeLists.txt
index e639e1f2..2295ea6e 100644
--- a/src/Core/CMakeLists.txt
+++ b/src/Core/CMakeLists.txt
@@ -18,11 +18,12 @@ message("Creating Regularisers as a shared library")
 find_package(GLIB2 REQUIRED)
 find_package(OpenMP REQUIRED)
 if (OPENMP_FOUND)
+  message(STATUS "OpenMP_C_FLAGS ${OpenMP_C_FLAGS}")
   set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-  set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
-  set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
-  set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_CXX_FLAGS}")
+  # set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+  # set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+  # set (CMAKE_STATIC_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_STATIC_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
 endif()

 message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -fno-omit-frame-pointer")
@@ -45,7 +46,7 @@
 elseif(UNIX)
   set(EXTRA_LIBRARIES "m")
 endif()
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
-set (CMAKE_C_FLAGS "${CMAKE_CXX_FLAGS} ${FLAGS}")
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${FLAGS}")
 message("CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}")

 ## Build the regularisers package as a library
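For reference, the adaptive step-size logic that both TNV variants in this series share can be read in isolation. The sketch below restates it as a standalone helper: the function name and signature are invented for this illustration, while the update rules and the parameter names (tau, sigma, alpha, beta, gamma, delta, eta, s) are taken from TNV_CPU_main above.

/* Hypothetical helper (not part of the patch): one balancing-principle
 * update of the PDHG step sizes, as performed once per outer iteration. */
static void pdhg_adapt_steps(float *tau, float *sigma, float *alpha,
                             float alpha0, float beta, float gamma,
                             float delta, float eta, float s,
                             float resprimal, float resdual,
                             float product, float unorm, float qnorm)
{
    /* b > 1 signals that the proximal backtracking condition is violated. */
    float b = (2.0f * (*tau) * (*sigma) * product)
            / (gamma * (*sigma) * unorm + gamma * (*tau) * qnorm);

    if (b > 1.0f) {
        /* Shrink both step sizes and reset alpha; the caller must also
         * restore the previous iterate (the actual "back-track"). */
        *tau   = (beta * (*tau)) / b;
        *sigma = (beta * (*sigma)) / b;
        *alpha = alpha0;
    } else if (resprimal > resdual * s * delta) {
        /* Primal residual dominates: larger primal, smaller dual steps. */
        *tau   /= (1.0f - *alpha);
        *sigma *= (1.0f - *alpha);
        *alpha *= eta;
    } else if (resprimal < (resdual * s) / delta) {
        /* Dual residual dominates: smaller primal, larger dual steps. */
        *tau   *= (1.0f - *alpha);
        *sigma /= (1.0f - *alpha);
        *alpha *= eta;
    }
}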