diff --git a/.gitlab/pnnl/base.gitlab-ci.yml b/.gitlab/pnnl/base.gitlab-ci.yml index af0dc039..9606393a 100644 --- a/.gitlab/pnnl/base.gitlab-ci.yml +++ b/.gitlab/pnnl/base.gitlab-ci.yml @@ -369,4 +369,4 @@ stages: variables: WORKDIR_SUFFIX: "x86_64-clang-hip-build" MY_CLUSTER: "incline" - SLURM_ARGS: " --exclusive --ntasks=3 " + SLURM_ARGS: " -N 1 --ntasks=3 " diff --git a/.gitlab/pnnl/incline.gitlab-ci.yml b/.gitlab/pnnl/incline.gitlab-ci.yml index f309d7c9..67614de8 100644 --- a/.gitlab/pnnl/incline.gitlab-ci.yml +++ b/.gitlab/pnnl/incline.gitlab-ci.yml @@ -3,15 +3,15 @@ Incline Build: - .cluster_build - .incline variables: - SCRIPT_ARGS: " --build-only " #--job=clang-hip " + SCRIPT_ARGS: " --build-only " Incline Test: extends: - .cluster_test - .incline variables: - SCRIPT_ARGS: " --test-only " #--job=clang-hip " - CTESTARGS: " --timeout 240 --output-on-failure -LE incline-skip " + SCRIPT_ARGS: " --test-only " + CTESTARGS: " --timeout 240 --output-on-failure " needs: ['Incline Build'] Incline Module Rebuild: diff --git a/CMakeLists.txt b/CMakeLists.txt index 13d65cfa..cd99f931 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,39 +23,31 @@ endif() option(RESOLVE_TEST_WITH_BSUB "Use `jsrun` instead of `mpirun` commands when running tests" OFF) option(RESOLVE_USE_KLU "Use KLU, AMD and COLAMD libraries from SuiteSparse" ON) -option(RESOLVE_USE_GPU "Use GPU device for computations" ON) -option(RESOLVE_USE_CUDA "Use CUDA language and SDK" ON) -set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") +option(RESOLVE_USE_CUDA "Use CUDA language and SDK" OFF) +option(RESOLVE_USE_HIP "Use HIP language and ROCm library" OFF) + +option(RESOLVE_USE_GPU "Use GPU device for computations" OFF) +mark_as_advanced(FORCE RESOLVE_USE_GPU) if(RESOLVE_USE_CUDA) - set(RESOLVE_USE_GPU On CACHE BOOL "Using GPU!" FORCE) -else() - set(RESOLVE_USE_GPU Off CACHE BOOL "Using GPU!" FORCE) + set(RESOLVE_USE_GPU ON CACHE BOOL "Using CUDA GPU!" FORCE) endif() +if(RESOLVE_USE_HIP) + set(RESOLVE_USE_GPU ON CACHE BOOL "Using HIP GPU!" FORCE) +endif() +# MacOS specific things set(CMAKE_MACOSX_RPATH 1) -# set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") -# https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling#always-full-rpath -# use, i.e. 
don't skip the full RPATH for the build tree -#set(CMAKE_SKIP_BUILD_RPATH FALSE) -# when building, don't use the install RPATH already -# (but later on when installing) +# Install with RPATH but do not build with it set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) - set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) - set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) -#list(APPEND CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib) # Add CMake sources from `cmake` dir list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake) -# Including clang-format cmake files to do automatic checking of formating -# TODO: Set up clang-format -#include(./cmake/clang-format) - if (RESOLVE_USE_KLU) include(FindKLU) if(NOT KLU_LIBRARY) @@ -89,22 +81,25 @@ else() message(STATUS "Not using CUDA") endif() +if(RESOLVE_USE_HIP) + enable_language(HIP) + check_language(HIP) + include(ReSolveFindHipLibraries) +else() + message(STATUS "Not using HIP") +endif(RESOLVE_USE_HIP) + + # The binary dir is already a global include directory configure_file( ${CMAKE_SOURCE_DIR}/resolve/resolve_defs.hpp.in ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp) - -# include build directory for Fortran name mangling header -include_directories(${CMAKE_BINARY_DIR}) - install( FILES ${CMAKE_BINARY_DIR}/resolve/resolve_defs.hpp DESTINATION include/resolve ) -include_directories(${CMAKE_SOURCE_DIR}) - # Enable testing enable_testing() @@ -141,4 +136,5 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/ReSolveConfig.cmake" add_subdirectory(examples) # Add tests +set(RESOLVE_CTEST_OUTPUT_DIR ${PROJECT_BINARY_DIR} CACHE PATH "Directory where CTest outputs are saved") add_subdirectory(tests) diff --git a/CMakePresets.json b/CMakePresets.json index e4784095..4809aca5 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -12,21 +12,31 @@ "description": "Base config to build with CUDA", "binaryDir": "${sourceDir}/build", "installDir": "${sourceDir}/install", - "generator": "Unix Makefiles" + "generator": "Unix Makefiles", + "cacheVariables": { + "RESOLVE_USE_CUDA": "ON" + } }, { - "name": "cpu", - "displayName": "CPU only build", - "description": "Base config to build without GPUs", + "name": "rocm", + "displayName": "ROCM build", + "description": "Base config to build with ROCM", "binaryDir": "${sourceDir}/build", "installDir": "${sourceDir}/install", "generator": "Unix Makefiles", "cacheVariables": { - "RESOLVE_USE_CUDA": "OFF", - "RESOLVE_USE_GPU": "OFF" + "RESOLVE_USE_HIP": "ON" } }, - { + { + "name": "cpu", + "displayName": "CPU only build", + "description": "Base config to build without GPUs", + "binaryDir": "${sourceDir}/build", + "installDir": "${sourceDir}/install", + "generator": "Unix Makefiles" + }, + { "name": "ascent", "inherits": "cuda", "displayName": "Ascent Build", @@ -44,9 +54,18 @@ }, { "name": "incline", - "inherits": "cpu", - "displayName": "Incline CPU only Build", - "description": "Custom changes specific for Incline" + "inherits": "rocm", + "displayName": "Incline Build with rocm", + "description": "Custom changes specific for Incline", + "cacheVariables": { + "CMAKE_HIP_ARCHITECTURES" : "gfx908", + "CMAKE_BUILD_TYPE" : "Debug" + }, + "environment": { + "CC" : "clang", + "CXX" : "clang++", + "FC" : "gfortran" + } } ] diff --git a/buildsystem/incline-env.sh b/buildsystem/incline-env.sh index 3c4e2194..348139ff 100644 --- a/buildsystem/incline-env.sh +++ b/buildsystem/incline-env.sh @@ -1,5 +1,15 @@ +#!/bin/bash + +# Load system rocm source /etc/profile.d/modules.sh module purge module load gcc/8.4.0 -module load rocm/5.3.0 + +# These are 
necessary in order to see GPUs with sbatch +unset ROCR_VISIBLE_DEVICES +unset CUDA_VISIBLE_DEVICES +unset GPU_DEVICE_ORDINAL + +# Load spack generated modules source ./buildsystem/spack/incline/modules/dependencies.sh + diff --git a/buildsystem/spack/incline/env.sh b/buildsystem/spack/incline/env.sh index 31d03fa4..757cc090 100644 --- a/buildsystem/spack/incline/env.sh +++ b/buildsystem/spack/incline/env.sh @@ -3,19 +3,22 @@ source /etc/profile.d/modules.sh module purge -# Load system python +# Load system python and gcc module load python/miniconda4.12 source /share/apps/python/miniconda4.12/etc/profile.d/conda.sh +module load gcc/8.4.0 # Define environment variables for where spack stores key files # For now, SPACK_INSTALL is the path where everything spack related is installed # If you want to modify the module install path, edit the spack.yaml manually BASE=/qfs/projects/exasgd/resolve/spack-ci export SPACK_INSTALL=$BASE/install +export SPACK_MIRROR=$BASE/../$(whoami)/spack-mirror export SPACK_CACHE=$BASE/../$(whoami)/spack-cache export SPACK_DISABLE_LOCAL_CONFIG=1 -export SPACK_PYTHON=$(which python) +export SPACK_PYTHON=$(which python3) export tempdir=$SPACK_CACHE export TMP=$SPACK_CACHE export TMPDIR=$SPACK_CACHE + diff --git a/buildsystem/spack/incline/install.sh b/buildsystem/spack/incline/install.sh index afb98bc1..d591950b 100755 --- a/buildsystem/spack/incline/install.sh +++ b/buildsystem/spack/incline/install.sh @@ -9,6 +9,9 @@ #SBATCH -e spack_install.%J.output #SBTACH -t 240 +export HTTPS_PROXY=http://proxy01.pnl.gov:3128 +export https_proxy=http://proxy01.pnl.gov:3128 + exit() { # Clear all trap handlers so this isn't echo'ed multiple times, potentially # throwing off the CI script watching for this output @@ -44,6 +47,9 @@ cleanup() { export MY_CLUSTER=incline . 
buildsystem/load-spack.sh && spack develop --no-clone --path=$(pwd) resolve@develop && +spack concretize -f && +spack install -j 64 llvm-amdgpu && +spack load llvm-amdgpu && ./buildsystem/configure-modules.sh 64 diff --git a/buildsystem/spack/incline/modules/dependencies.sh b/buildsystem/spack/incline/modules/dependencies.sh index 33b8b322..75cf6209 100644 --- a/buildsystem/spack/incline/modules/dependencies.sh +++ b/buildsystem/spack/incline/modules/dependencies.sh @@ -1,24 +1,170 @@ module use -a /qfs/projects/exasgd/resolve/spack-ci/install/modules/linux-centos7-zen +# curl@=7.29.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen +module load curl/7.29.0-gcc-8.4.0-3emq5yx +# gmake@=4.4.1%gcc@=8.4.0~guile build_system=generic arch=linux-centos7-zen +module load gmake/4.4.1-gcc-8.4.0-l7nyr34 # pkgconf@=1.9.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen -module load pkgconf/1.9.5-gcc-8.4.0-kl4sdjo -# nghttp2@=1.52.0%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen -module load nghttp2/1.52.0-gcc-8.4.0-pqmjl5g -# ca-certificates-mozilla@=2023-05-30%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen -module load ca-certificates-mozilla/2023-05-30-gcc-8.4.0-txgcsig -# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen -module load perl/5.26.0-gcc-8.4.0-h324qox -# zlib-ng@=2.1.3%gcc@=8.4.0+compat+opt build_system=autotools patches=299b958,ae9077a,b692621 arch=linux-centos7-zen -module load zlib-ng/2.1.3-gcc-8.4.0-44tydhr -# openssl@=3.1.3%gcc@=8.4.0~docs+shared build_system=generic certs=mozilla arch=linux-centos7-zen -module load openssl/3.1.3-gcc-8.4.0-46yttzm -# curl@=8.4.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen -module load curl/8.4.0-gcc-8.4.0-g2rrs23 +module load pkgconf/1.9.5-gcc-8.4.0-733ltud # ncurses@=6.4%gcc@=8.4.0~symlinks+termlib abi=none build_system=autotools arch=linux-centos7-zen -module load ncurses/6.4-gcc-8.4.0-jt7rpqq +module load ncurses/6.4-gcc-8.4.0-gwo76of +# zlib-ng@=2.1.4%gcc@=8.4.0+compat+opt build_system=autotools arch=linux-centos7-zen +module load zlib-ng/2.1.4-gcc-8.4.0-feah6zt # cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen -module load cmake/3.27.7-gcc-8.4.0-tu2rruq -# gmake@=4.4.1%gcc@=8.4.0~guile build_system=generic arch=linux-centos7-zen -module load gmake/4.4.1-gcc-8.4.0-l7nyr34 +module load cmake/3.27.7-gcc-8.4.0-rmou7zf +# gmake@=4.4.1%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~guile build_system=generic arch=linux-centos7-zen +module load gmake/4.4.1-clang-16.0.0-rocm5.6.0-6c7b35p +# python@=3.9.12%gcc@=8.4.0+bz2+crypt+ctypes+dbm~debug+libxml2+lzma~nis~optimizations+pic+pyexpat+pythoncmd+readline+shared+sqlite3+ssl~tkinter+uuid+zlib build_system=generic patches=0d98e93,4c24573,ebdca64,f2fd060 arch=linux-centos7-zen +module load python/3.9.12-gcc-8.4.0-ob2n5zs +# re2c@=2.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load re2c/2.2-gcc-8.4.0-zmj4cst +# ninja@=1.11.1%gcc@=8.4.0+re2c build_system=generic arch=linux-centos7-zen +module load ninja/1.11.1-gcc-8.4.0-ofxvwff +# z3@=4.11.2%gcc@=8.4.0~gmp~ipo~python build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load z3/4.11.2-gcc-8.4.0-363odap +# 
llvm-amdgpu@=5.6.1%gcc@=8.4.0~ipo~link_llvm_dylib~llvm_dylib~openmp+rocm-device-libs build_system=cmake build_type=Release generator=ninja patches=a08bbe1,b66529f,d35aec9 arch=linux-centos7-zen +module load llvm-amdgpu/5.6.1-gcc-8.4.0-vy3wrnq +# rocm-core@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocm-core/5.6.1-gcc-8.4.0-llv2yv4 +# rocm-cmake@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocm-cmake/5.6.1-gcc-8.4.0-klwq5kk +# comgr@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load comgr/5.6.1-gcc-8.4.0-yl7z2re +# mesa@=23.0.2%gcc@=8.4.0+glx+llvm+opengl~opengles+osmesa~strip build_system=meson buildtype=release default_library=shared arch=linux-centos7-zen +module load mesa/23.0.2-gcc-8.4.0-xffioaq +# glx@=1.4%gcc@=8.4.0 build_system=bundle arch=linux-centos7-zen +module load glx/1.4-gcc-8.4.0-vh5g6sx +# hipify-clang@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make patches=54b8b39 arch=linux-centos7-zen +module load hipify-clang/5.6.1-gcc-8.4.0-e3jea5v +# libiconv@=1.17%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen +module load libiconv/1.17-gcc-8.4.0-o2hwfiz +# diffutils@=3.9%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load diffutils/3.9-gcc-8.4.0-7ceszkk +# bzip2@=1.0.8%gcc@=8.4.0~debug~pic+shared build_system=generic arch=linux-centos7-zen +module load bzip2/1.0.8-gcc-8.4.0-on73m5o +# xz@=5.4.1%gcc@=8.4.0~pic build_system=autotools libs=shared,static arch=linux-centos7-zen +module load xz/5.4.1-gcc-8.4.0-v5kymdq +# libxml2@=2.10.3%gcc@=8.4.0+pic~python+shared build_system=autotools arch=linux-centos7-zen +module load libxml2/2.10.3-gcc-8.4.0-6mgqxiy +# pigz@=2.7%gcc@=8.4.0 build_system=makefile arch=linux-centos7-zen +module load pigz/2.7-gcc-8.4.0-btbzuey +# zstd@=1.5.5%gcc@=8.4.0+programs build_system=makefile compression=none libs=shared,static arch=linux-centos7-zen +module load zstd/1.5.5-gcc-8.4.0-3ets7dy +# tar@=1.34%gcc@=8.4.0 build_system=autotools zip=pigz arch=linux-centos7-zen +module load tar/1.34-gcc-8.4.0-atzwdgy +# gettext@=0.22.3%gcc@=8.4.0+bzip2+curses+git~libunistring+libxml2+pic+shared+tar+xz build_system=autotools arch=linux-centos7-zen +module load gettext/0.22.3-gcc-8.4.0-m33ujza +# libsigsegv@=2.14%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libsigsegv/2.14-gcc-8.4.0-gzna4n3 +# m4@=1.4.19%gcc@=8.4.0+sigsegv build_system=autotools patches=9dc5fbd,bfdffa7 arch=linux-centos7-zen +module load m4/1.4.19-gcc-8.4.0-bwzchwl +# elfutils@=0.189%gcc@=8.4.0~debuginfod+exeprefix+nls build_system=autotools arch=linux-centos7-zen +module load elfutils/0.189-gcc-8.4.0-23kjwto +# libtool@=2.4.7%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libtool/2.4.7-gcc-8.4.0-2bmpsy4 +# util-macros@=1.19.3%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load util-macros/1.19.3-gcc-8.4.0-64inrmm +# libpciaccess@=0.17%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libpciaccess/0.17-gcc-8.4.0-sh2c4la +# libpthread-stubs@=0.4%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libpthread-stubs/0.4-gcc-8.4.0-kcav646 +# py-pip@=23.1.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load py-pip/23.1.2-gcc-8.4.0-yajovh7 +# py-wheel@=0.41.2%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load 
py-wheel/0.41.2-gcc-8.4.0-dkkw2va +# py-setuptools@=68.0.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-setuptools/68.0.0-gcc-8.4.0-ihu4sfq +# meson@=1.2.2%gcc@=8.4.0 build_system=python_pip patches=0f0b1bd,ae59765 arch=linux-centos7-zen +module load meson/1.2.2-gcc-8.4.0-vcdwjmb +# libdrm@=2.4.115%gcc@=8.4.0~docs build_system=generic arch=linux-centos7-zen +module load libdrm/2.4.115-gcc-8.4.0-6h77lxh +# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen +module load perl/5.26.0-gcc-8.4.0-6tdzqfd +# autoconf@=2.69%gcc@=8.4.0 build_system=autotools patches=35c4492,7793209,a49dd5b arch=linux-centos7-zen +module load autoconf/2.69-gcc-8.4.0-dcrbb7h +# automake@=1.16.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load automake/1.16.5-gcc-8.4.0-tvi3cks +# numactl@=2.0.14%gcc@=8.4.0 build_system=autotools patches=4e1d78c,62fc8a8,ff37630 arch=linux-centos7-zen +module load numactl/2.0.14-gcc-8.4.0-7mpcwqq +# hsakmt-roct@=5.6.1%gcc@=8.4.0~ipo+shared build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load hsakmt-roct/5.6.1-gcc-8.4.0-4on3xib +# hsa-rocr-dev@=5.6.1%gcc@=8.4.0~image~ipo+shared build_system=cmake build_type=Release generator=make patches=9267179 arch=linux-centos7-zen +module load hsa-rocr-dev/5.6.1-gcc-8.4.0-tdlpv7w +# perl-file-which@=1.27%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen +module load perl-file-which/1.27-gcc-8.4.0-nix64yx +# perl-module-build@=0.4232%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen +module load perl-module-build/0.4232-gcc-8.4.0-ayed35p +# perl-uri-encode@=1.1.1%gcc@=8.4.0 build_system=perl arch=linux-centos7-zen +module load perl-uri-encode/1.1.1-gcc-8.4.0-biqataj +# py-ply@=3.11%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-ply/3.11-gcc-8.4.0-creftnl +# py-cppheaderparser@=2.7.4%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-cppheaderparser/2.7.4-gcc-8.4.0-nw7554i +# rocminfo@=5.6.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocminfo/5.6.1-gcc-8.4.0-5shaxxj +# roctracer-dev-api@=5.6.1%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load roctracer-dev-api/5.6.1-gcc-8.4.0-gbaoh25 +# hip@=5.6.1%gcc@=8.4.0~cuda~ipo+rocm build_system=cmake build_type=Release generator=make patches=aee7249,c2ee21c,e73e91b arch=linux-centos7-zen +module load hip/5.6.1-gcc-8.4.0-zpa2j7f +# msgpack-c@=3.1.1%gcc@=8.4.0~ipo build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load msgpack-c/3.1.1-gcc-8.4.0-buxbznu +# procps@=4.0.4%gcc@=8.4.0+nls build_system=autotools arch=linux-centos7-zen +module load procps/4.0.4-gcc-8.4.0-gyn6his +# py-joblib@=1.2.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-joblib/1.2.0-gcc-8.4.0-ukcd432 +# py-cython@=0.29.36%gcc@=8.4.0 build_system=python_pip patches=c4369ad arch=linux-centos7-zen +module load py-cython/0.29.36-gcc-8.4.0-5f4zyzb +# py-msgpack@=1.0.5%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-msgpack/1.0.5-gcc-8.4.0-2xh5udm +# libyaml@=0.2.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load libyaml/0.2.5-gcc-8.4.0-hidc7bw +# py-pyyaml@=6.0%gcc@=8.4.0+libyaml build_system=python_pip arch=linux-centos7-zen +module load py-pyyaml/6.0-gcc-8.4.0-4mdsdw2 +# py-distlib@=0.3.7%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load 
py-distlib/0.3.7-gcc-8.4.0-f25ay4b +# py-editables@=0.3%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-editables/0.3-gcc-8.4.0-hrmamrk +# py-flit-core@=3.9.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-flit-core/3.9.0-gcc-8.4.0-q3yng6k +# py-packaging@=23.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-packaging/23.1-gcc-8.4.0-7krugqt +# py-pathspec@=0.11.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-pathspec/0.11.1-gcc-8.4.0-vm5freh +# git@=2.42.0%gcc@=8.4.0+man+nls+perl+subtree~svn~tcltk build_system=autotools arch=linux-centos7-zen +module load git/2.42.0-gcc-8.4.0-k5crf2q +# py-tomli@=2.0.1%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-tomli/2.0.1-gcc-8.4.0-m4gh2nb +# py-typing-extensions@=4.8.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-typing-extensions/4.8.0-gcc-8.4.0-ovqdpbs +# py-setuptools-scm@=7.1.0%gcc@=8.4.0+toml build_system=python_pip arch=linux-centos7-zen +module load py-setuptools-scm/7.1.0-gcc-8.4.0-hqzn5lb +# py-pluggy@=1.0.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-pluggy/1.0.0-gcc-8.4.0-lqpf66l +# py-calver@=2022.6.26%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-calver/2022.6.26-gcc-8.4.0-pm6rj2c +# py-trove-classifiers@=2023.8.7%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-trove-classifiers/2023.8.7-gcc-8.4.0-iy66qnh +# py-hatchling@=1.18.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-hatchling/1.18.0-gcc-8.4.0-bjpjiiq +# py-hatch-vcs@=0.3.0%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-hatch-vcs/0.3.0-gcc-8.4.0-hc6rq3a +# py-filelock@=3.12.4%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-filelock/3.12.4-gcc-8.4.0-rzqmlrq +# py-platformdirs@=3.10.0%gcc@=8.4.0~wheel build_system=python_pip arch=linux-centos7-zen +module load py-platformdirs/3.10.0-gcc-8.4.0-6hnyp7h +# py-virtualenv@=20.24.5%gcc@=8.4.0 build_system=python_pip arch=linux-centos7-zen +module load py-virtualenv/20.24.5-gcc-8.4.0-h4mzkzl +# rocblas@=5.6.1%gcc@=8.4.0~ipo+tensile amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocblas/5.6.1-gcc-8.4.0-arsno2b +# fmt@=10.1.1%gcc@=8.4.0~ipo+pic~shared build_system=cmake build_type=Release cxxstd=11 generator=make arch=linux-centos7-zen +module load fmt/10.1.1-gcc-8.4.0-4d5ehr5 +# rocprim@=5.6.1%gcc@=8.4.0~ipo amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocprim/5.6.1-gcc-8.4.0-nu465tt +# rocsparse@=5.6.1%gcc@=8.4.0~ipo~test amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocsparse/5.6.1-gcc-8.4.0-wtmfgyn +# rocsolver@=5.6.1%gcc@=8.4.0~ipo+optimal amdgpu_target=auto build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load rocsolver/5.6.1-gcc-8.4.0-wlgpkqj +# roctracer-dev@=5.6.1%gcc@=8.4.0~ipo~rocm build_system=cmake build_type=Release generator=make arch=linux-centos7-zen +module load roctracer-dev/5.6.1-gcc-8.4.0-lilld4h # libiconv@=1.17%gcc@=8.4.0 build_system=autotools libs=shared,static arch=linux-centos7-zen module load libiconv/1.17-gcc-8.4.0-wfdnlg6 # diffutils@=3.9%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen @@ -27,6 +173,8 @@ module load diffutils/3.9-gcc-8.4.0-qh566r6 module load 
libsigsegv/2.14-gcc-8.4.0-iutj4de # m4@=1.4.19%gcc@=8.4.0+sigsegv build_system=autotools patches=9dc5fbd,bfdffa7 arch=linux-centos7-zen module load m4/1.4.19-gcc-8.4.0-x7ktvaf +# perl@=5.26.0%gcc@=8.4.0+cpanm+opcode+open+shared+threads build_system=generic patches=0eac10e,8cf4302 arch=linux-centos7-zen +module load perl/5.26.0-gcc-8.4.0-h324qox # autoconf@=2.69%gcc@=8.4.0 build_system=autotools patches=35c4492,7793209,a49dd5b arch=linux-centos7-zen module load autoconf/2.69-gcc-8.4.0-npluk5j # automake@=1.16.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen @@ -35,6 +183,22 @@ module load automake/1.16.5-gcc-8.4.0-tgloywk module load libtool/2.4.7-gcc-8.4.0-gs6gyy3 # gmp@=6.2.1%gcc@=8.4.0+cxx build_system=autotools libs=shared,static patches=69ad2e2 arch=linux-centos7-zen module load gmp/6.2.1-gcc-8.4.0-ythx4o2 +# pkgconf@=1.9.5%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load pkgconf/1.9.5-gcc-8.4.0-kl4sdjo +# nghttp2@=1.52.0%gcc@=8.4.0 build_system=autotools arch=linux-centos7-zen +module load nghttp2/1.52.0-gcc-8.4.0-pqmjl5g +# ca-certificates-mozilla@=2023-05-30%gcc@=8.4.0 build_system=generic arch=linux-centos7-zen +module load ca-certificates-mozilla/2023-05-30-gcc-8.4.0-txgcsig +# zlib-ng@=2.1.3%gcc@=8.4.0+compat+opt build_system=autotools patches=299b958,ae9077a,b692621 arch=linux-centos7-zen +module load zlib-ng/2.1.3-gcc-8.4.0-44tydhr +# openssl@=3.1.3%gcc@=8.4.0~docs+shared build_system=generic certs=mozilla arch=linux-centos7-zen +module load openssl/3.1.3-gcc-8.4.0-46yttzm +# curl@=8.4.0%gcc@=8.4.0~gssapi~ldap~libidn2~librtmp~libssh~libssh2+nghttp2 build_system=autotools libs=shared,static tls=openssl arch=linux-centos7-zen +module load curl/8.4.0-gcc-8.4.0-g2rrs23 +# ncurses@=6.4%gcc@=8.4.0~symlinks+termlib abi=none build_system=autotools arch=linux-centos7-zen +module load ncurses/6.4-gcc-8.4.0-jt7rpqq +# cmake@=3.27.7%gcc@=8.4.0~doc+ncurses+ownlibs build_system=generic build_type=Release arch=linux-centos7-zen +module load cmake/3.27.7-gcc-8.4.0-tu2rruq # gmake@=4.4.1%gcc@=8.4.0~guile build_system=autotools arch=linux-centos7-zen module load gmake/4.4.1-gcc-8.4.0-f23wik2 # metis@=5.1.0%gcc@=8.4.0~gdb~int64~ipo~real64+shared build_system=cmake build_type=Release generator=make patches=4991da9,93a7903,b1225da arch=linux-centos7-zen @@ -63,5 +227,5 @@ module load mpfr/4.2.0-gcc-8.4.0-cjhi2el module load openblas/0.3.24-gcc-8.4.0-4ei4hpg # suite-sparse@=5.13.0%gcc@=8.4.0~cuda~graphblas~openmp+pic build_system=generic arch=linux-centos7-zen module load suite-sparse/5.13.0-gcc-8.4.0-ivey23b -# resolve@=develop%gcc@=8.4.0~cuda~ipo+klu build_system=cmake build_type=Release dev_path=/people/svcexasgd/gitlab/24143/spack_incline generator=make arch=linux-centos7-zen -## module load resolve/develop-gcc-8.4.0-l7tspub +# resolve@=develop%clang@=16.0.0-rocm5.6.0 cxxflags="--gcc-toolchain=/share/apps/gcc/8.4.0" ~cuda~ipo+klu+rocm amdgpu_target=gfx908 build_system=cmake build_type=Release dev_path=/people/ruth521/projects/resolve generator=make arch=linux-centos7-zen +## module load resolve/develop-clang-16.0.0-rocm5.6.0-6kaaut4 diff --git a/buildsystem/spack/incline/spack.yaml b/buildsystem/spack/incline/spack.yaml index 36234ce0..894daf7c 100644 --- a/buildsystem/spack/incline/spack.yaml +++ b/buildsystem/spack/incline/spack.yaml @@ -1,10 +1,35 @@ spack: specs: - - resolve~cuda%gcc@8.4.0 + - resolve~cuda+rocm%clang@16.0.0-rocm5.6.0 amdgpu_target=gfx908 + ^ llvm-amdgpu%gcc + ^ hsa-rocr-dev~image view: false concretizer: - unify: when_possible reuse: true + 
unify: true
+  compilers:
+  - compiler:
+      spec: gcc@8.4.0
+      paths:
+        cc: /share/apps/gcc/8.4.0/bin/gcc
+        cxx: /share/apps/gcc/8.4.0/bin/g++
+        f77: /share/apps/gcc/8.4.0/bin/gfortran
+        fc: /share/apps/gcc/8.4.0/bin/gfortran
+      operating_system: centos7
+      target: x86_64
+      modules: [gcc/8.4.0]
+  - compiler:
+      spec: clang@16.0.0-rocm5.6.0
+      paths:
+        cc: amdclang
+        cxx: amdclang++
+        f77: /share/apps/gcc/8.4.0/bin/gfortran
+        fc: /share/apps/gcc/8.4.0/bin/gfortran
+      flags:
+        cxxflags: --gcc-toolchain=/share/apps/gcc/8.4.0
+      operating_system: centos7
+      target: x86_64
+      modules: []
   config:
     concretizer: clingo
     install_tree:
@@ -23,20 +48,39 @@ spack:
         write: group
         read: world
         group: exasgd
+    mesa:
+      externals:
+      - spec: mesa@23.0.2+glx
+        prefix: /usr
+      buildable: false
+    curl:
+      externals:
+      - spec: curl@7.29.0
+        prefix: /usr/bin/curl
+      buildable: false
+    git:
+      externals:
+      - spec: git@2.42.0
+        prefix: /share/apps/git/2.42.0
+        modules:
+        - git/2.42.0
+      buildable: false
+    lua:
+      externals:
+      - spec: lua@5.4.2
+        modules:
+        - lua/5.4.2
+      buildable: false
+    python:
+      externals:
+      - spec: python@3.9.12%gcc
+        modules:
+        - python/miniconda4.12
+      buildable: false
     perl:
       externals:
       - spec: perl@5.26.0
         modules:
         - perl/5.26.0
       buildable: false
-  compilers:
-  - compiler:
-      spec: gcc@8.4.0
-      paths:
-        cc: /share/apps/gcc/8.4.0/bin/gcc
-        cxx: /share/apps/gcc/8.4.0/bin/g++
-        f77: /share/apps/gcc/8.4.0/bin/gfortran
-        fc: /share/apps/gcc/8.4.0/bin/gfortran
-      operating_system: centos7
-      target: x86_64
-      modules: [gcc/8.4.0]
+
diff --git a/buildsystem/spack/spack b/buildsystem/spack/spack
index 7e466f7d..f120cada 160000
--- a/buildsystem/spack/spack
+++ b/buildsystem/spack/spack
@@ -1 +1 @@
-Subproject commit 7e466f7d22839f034b1e542daf5d2b6ef8c568c4
+Subproject commit f120cada59dbc5115d94c2fce3cbffc946b72bb0
diff --git a/cmake/ReSolveConfig.cmake.in b/cmake/ReSolveConfig.cmake.in
index 7a162d90..fd73d0c8 100644
--- a/cmake/ReSolveConfig.cmake.in
+++ b/cmake/ReSolveConfig.cmake.in
@@ -4,6 +4,10 @@

 include("${CMAKE_CURRENT_LIST_DIR}/ReSolveTargets.cmake")

+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD @CMAKE_CXX_STANDARD@)
+endif()
+
 include(CheckLanguage)
 # This must come before enable_language(CUDA)
 if(@RESOLVE_USE_CUDA@)
@@ -12,6 +16,23 @@ if(@RESOLVE_USE_CUDA@)
   check_language(CUDA)
   set(CMAKE_CUDA_FLAGS "@CMAKE_CUDA_FLAGS@")
   find_package(CUDAToolkit REQUIRED)
+  add_library(ReSolve::CUDA ALIAS ReSolve::resolve_backend_cuda)
+endif()
+if(@RESOLVE_USE_HIP@)
+  # TODO - This is a bit heavy-handed, but otherwise you get gcc which is not ideal
+  #      - if(NOT CMAKE_C_COMPILER) wasn't working at top of file...
+  set(CMAKE_C_COMPILER @CMAKE_C_COMPILER@)
+  set(CMAKE_CXX_COMPILER @CMAKE_CXX_COMPILER@)
+  enable_language(HIP)
+  check_language(HIP)
+  find_package(hip REQUIRED)
+  find_package(rocblas REQUIRED)
+  find_package(rocsparse REQUIRED)
+  find_package(rocsolver REQUIRED)
+  # This is just an ugly hack to make the HIP build work
+  get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
+  target_include_directories(ReSolve::resolve_hip INTERFACE $<BUILD_INTERFACE:${hip_includes}>)
+  add_library(ReSolve::HIP ALIAS ReSolve::resolve_backend_hip)
 endif()

 # Compute installation prefix relative to this file.
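Editor's note: the net effect of the ReSolveConfig.cmake.in changes above is that a downstream `find_package(ReSolve)` now re-runs the HIP language check, locates the ROCm packages, and exposes the backend through the `ReSolve::CUDA` / `ReSolve::HIP` alias targets. A minimal consumer sketch follows; the top-level target name `ReSolve` matches what the in-tree examples link against, but the exported name and the install prefix are assumptions, not confirmed by this diff:

    # Hypothetical downstream project. Assumes ReSolve was installed with
    # RESOLVE_USE_HIP=ON and exports a target named `ReSolve`.
    cmake_minimum_required(VERSION 3.21)
    project(resolve_consumer LANGUAGES CXX)

    # Configure with e.g.: cmake -DCMAKE_PREFIX_PATH=/path/to/resolve/install ..
    find_package(ReSolve CONFIG REQUIRED)

    # testKLU_RocSolver.cpp is the HIP consumer app named in examples/CMakeLists.txt
    add_executable(consumer.exe testKLU_RocSolver.cpp)
    # Linking ReSolve transitively pulls in hip::host, hip::device, roc::rocblas,
    # roc::rocsparse, and roc::rocsolver via the resolve_hip interface target.
    target_link_libraries(consumer.exe PRIVATE ReSolve)
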
diff --git a/cmake/ReSolveFindHipLibraries.cmake b/cmake/ReSolveFindHipLibraries.cmake
new file mode 100644
index 00000000..b23d8021
--- /dev/null
+++ b/cmake/ReSolveFindHipLibraries.cmake
@@ -0,0 +1,25 @@
+# Exports target `resolve_hip` which finds all hip libraries needed by resolve.
+
+
+add_library(resolve_hip INTERFACE)
+
+find_package(hip REQUIRED)
+find_package(rocblas REQUIRED)
+find_package(rocsparse REQUIRED)
+find_package(rocsolver REQUIRED)
+
+target_link_libraries(resolve_hip INTERFACE
+  hip::host
+  hip::device
+  roc::rocblas
+  roc::rocsparse
+  roc::rocsolver
+)
+
+get_target_property(hip_includes hip::device INTERFACE_INCLUDE_DIRECTORIES)
+
+target_include_directories(resolve_hip INTERFACE
+  $<BUILD_INTERFACE:${hip_includes}>)
+
+install(TARGETS resolve_hip EXPORT ReSolveTargets)
+
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 8e8a2498..faa53807 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -39,7 +39,16 @@ if(RESOLVE_USE_CUDA)

 endif(RESOLVE_USE_CUDA)

+# Create HIP examples
+if(RESOLVE_USE_HIP)
+  # Build example with KLU factorization and rocsolver Rf refactorization
+  add_executable(klu_rocsolverrf.exe r_KLU_rocsolverrf.cpp)
+  target_link_libraries(klu_rocsolverrf.exe PRIVATE ReSolve)

+  # Build example with KLU factorization, rocsolver Rf refactorization, and FGMRES iterative refinement
+  add_executable(klu_rocsolverrf_fgmres.exe r_KLU_rocSolverRf_FGMRES.cpp)
+  target_link_libraries(klu_rocsolverrf_fgmres.exe PRIVATE ReSolve)
+endif(RESOLVE_USE_HIP)

 # Install all examples in bin directory
 set(installable_executables klu_klu.exe klu_klu_standalone.exe)
@@ -48,6 +57,10 @@ if(RESOLVE_USE_CUDA)
   set(installable_executables ${installable_executables} klu_glu.exe klu_rf.exe klu_rf_fgmres.exe klu_glu_values_update.exe)
 endif(RESOLVE_USE_CUDA)

+if(RESOLVE_USE_HIP)
+  set(installable_executables ${installable_executables} klu_rocsolverrf.exe)
+endif(RESOLVE_USE_HIP)
+
 install(TARGETS ${installable_executables}
         RUNTIME DESTINATION bin)

@@ -58,8 +71,11 @@ set(CONSUMER_PATH ${CMAKE_INSTALL_PREFIX}/share/examples)
 install(PROGRAMS test.sh DESTINATION ${CONSUMER_PATH})

 # Select consumer app
+# TODO - have an outer loop that adds a unique consumer test for each backend supported
 if(RESOLVE_USE_CUDA)
   set(RESOLVE_CONSUMER_APP "testKLU_Rf_FGMRES.cpp")
+elseif(RESOLVE_USE_HIP)
+  set(RESOLVE_CONSUMER_APP "testKLU_RocSolver.cpp")
 else()
   set(RESOLVE_CONSUMER_APP "testKLU.cpp")
 endif()
diff --git a/examples/r_KLU_GLU.cpp b/examples/r_KLU_GLU.cpp
index e2cbfde4..9f271254 100644
--- a/examples/r_KLU_GLU.cpp
+++ b/examples/r_KLU_GLU.cpp
@@ -41,8 +41,8 @@ int main(int argc, char *argv[])
   workspace_CUDA->initializeHandles();
   ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA);
   ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA);
-  real_type* rhs;
-  real_type* x;
+  real_type* rhs = nullptr;
+  real_type* x = nullptr;

   vector_type* vec_rhs;
   vector_type* vec_x;
@@ -93,8 +93,8 @@ int main(int argc, char *argv[])
       x = new real_type[A->getNumRows()];
       vec_rhs = new vector_type(A->getNumRows());
       vec_x = new vector_type(A->getNumRows());
-      vec_x->allocate("cpu");//for KLU
-      vec_x->allocate("cuda");
+      vec_x->allocate(ReSolve::memory::HOST);//for KLU
+      vec_x->allocate(ReSolve::memory::DEVICE);
       vec_r = new vector_type(A->getNumRows());
     } else {
       ReSolve::io::readAndUpdateMatrix(mat_file, A_coo);
@@ -107,11 +107,11 @@ int main(int argc, char *argv[])
     //Now convert to CSR.
if (i < 1) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"CUSOLVER GLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); @@ -159,7 +159,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete GLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/examples/r_KLU_GLU_matrix_values_update.cpp b/examples/r_KLU_GLU_matrix_values_update.cpp index 7d1bb141..ded685ac 100644 --- a/examples/r_KLU_GLU_matrix_values_update.cpp +++ b/examples/r_KLU_GLU_matrix_values_update.cpp @@ -44,8 +44,8 @@ int main(int argc, char *argv[]) workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -96,8 +96,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { if (i==1) { @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) ReSolve::io::readAndUpdateMatrix(mat_file, A_exp_coo); } std::cout<<"Updating values of A_coo!"<updateValues(A_exp_coo->getValues("cpu"), "cpu", "cpu"); + A_coo->updateValues(A_exp_coo->getValues(ReSolve::memory::HOST), ReSolve::memory::HOST, ReSolve::memory::HOST); //ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); } @@ -117,11 +117,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 1) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"CUSOLVER GLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); @@ -170,7 +170,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete GLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/examples/r_KLU_KLU.cpp b/examples/r_KLU_KLU.cpp index 8b0ea59a..901e36a5 100644 --- a/examples/r_KLU_KLU.cpp +++ b/examples/r_KLU_KLU.cpp @@ -40,8 +40,8 @@ int main(int argc, char *argv[]) ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -108,11 +108,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); matrix_handler->setValuesChanged(true, "cpu"); @@ -148,7 +148,8 @@ int main(int argc, char *argv[]) //now DELETE delete A; delete KLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete matrix_handler; diff --git a/examples/r_KLU_KLU_standalone.cpp b/examples/r_KLU_KLU_standalone.cpp index 77e5b97a..3dfaf716 100644 --- a/examples/r_KLU_KLU_standalone.cpp +++ b/examples/r_KLU_KLU_standalone.cpp @@ -36,8 +36,8 @@ int main(int argc, char *argv[]) ReSolve::LinAlgWorkspaceCpu* workspace = new ReSolve::LinAlgWorkspaceCpu(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -83,8 +83,8 @@ int main(int argc, char *argv[]) //Now convert to CSR. matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); std::cout << "COO to CSR completed. 
Expanded NNZ: " << A->getNnzExpanded() << std::endl; //Now call direct solver KLU->setupParameters(1, 0.1, false); @@ -96,7 +96,7 @@ int main(int argc, char *argv[]) std::cout << "KLU factorization status: " << status << std::endl; status = KLU->solve(vec_rhs, vec_x); std::cout << "KLU solve status: " << status << std::endl; - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); matrix_handler->setValuesChanged(true, "cpu"); @@ -111,7 +111,8 @@ int main(int argc, char *argv[]) //now DELETE delete A; delete KLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete matrix_handler; diff --git a/examples/r_KLU_rf.cpp b/examples/r_KLU_rf.cpp index 01fa0f3c..b61029c5 100644 --- a/examples/r_KLU_rf.cpp +++ b/examples/r_KLU_rf.cpp @@ -42,8 +42,8 @@ int main(int argc, char *argv[] ) workspace_CUDA->initializeHandles(); ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_CUDA); ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_CUDA); - real_type* rhs; - real_type* x; + real_type* rhs = nullptr; + real_type* x = nullptr; vector_type* vec_rhs; vector_type* vec_x; @@ -107,11 +107,11 @@ int main(int argc, char *argv[] ) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. Expanded NNZ: "<< A->getNnzExpanded()<getQOrdering(); Rf->setup(A, L, U, P, Q); - delete [] P; - delete [] Q; delete L; - delete L_csc; delete U; - delete U_csc; } } else { //status = KLU->refactorize(); @@ -157,7 +153,7 @@ int main(int argc, char *argv[] ) //status = KLU->solve(vec_rhs, vec_x); //std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); @@ -173,7 +169,8 @@ int main(int argc, char *argv[] ) delete A; delete KLU; delete Rf; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/examples/r_KLU_rf_FGMRES.cpp b/examples/r_KLU_rf_FGMRES.cpp index ee674869..584fcd10 100644 --- a/examples/r_KLU_rf_FGMRES.cpp +++ b/examples/r_KLU_rf_FGMRES.cpp @@ -96,8 +96,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { @@ -111,11 +111,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo,A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); matrix_handler->setValuesChanged(true, "cuda"); @@ -162,13 +162,13 @@ int main(int argc, char *argv[]) status = Rf->solve(vec_rhs, vec_x); std::cout<<"CUSOLVER RF solve status: "<update(rhs, "cpu", "cuda"); - norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); //matrix_handler->setValuesChanged(true, "cuda"); FGMRES->resetMatrix(A); - FGMRES->setupPreconditioner("CuSolverRf", Rf); + FGMRES->setupPreconditioner("LU", Rf); matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cuda"); @@ -176,7 +176,7 @@ int main(int argc, char *argv[]) << std::scientific << std::setprecision(16) << sqrt(vector_handler->dot(vec_r, vec_r, "cuda"))/norm_b << "\n"; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); FGMRES->solve(vec_rhs, vec_x); std::cout << "FGMRES: init nrm: " @@ -189,5 +189,16 @@ int main(int argc, char *argv[]) } // for (int i = 0; i < numSystems; ++i) + delete A; + delete KLU; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_CUDA; + delete matrix_handler; + delete vector_handler; + return 0; } diff --git a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp index 6a520a7a..c4ab285b 100644 --- a/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp +++ b/examples/r_KLU_rf_FGMRES_reuse_factorization.cpp @@ -98,8 +98,8 @@ int main(int argc, char *argv[]) x = new real_type[A->getNumRows()]; vec_rhs = new vector_type(A->getNumRows()); vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vec_r = new vector_type(A->getNumRows()); } else { @@ -113,11 +113,11 @@ int main(int argc, char *argv[]) //Now convert to CSR. if (i < 2) { matrix_handler->coo2csr(A_coo,A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); } else { matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); } std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<solve(vec_rhs, vec_x); std::cout<<"KLU solve status: "<update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); matrix_handler->setValuesChanged(true, "cuda"); @@ -171,20 +171,20 @@ int main(int argc, char *argv[]) status = Rf->refactorize(); std::cout << "CUSOLVER RF, using REAL refactorization, refactorization status: " << status << std::endl; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = Rf->solve(vec_rhs, vec_x); - FGMRES->setupPreconditioner("CuSolverRf", Rf); + FGMRES->setupPreconditioner("LU", Rf); } - //if (i%2!=0) vec_x->setToZero("cuda"); + //if (i%2!=0) vec_x->setToZero(ReSolve::memory::DEVICE); real_type norm_x = vector_handler->dot(vec_x, vec_x, "cuda"); std::cout << "Norm of x (before solve): " << std::scientific << std::setprecision(16) << sqrt(norm_x) << "\n"; std::cout<<"CUSOLVER RF solve status: "<update(rhs, "cpu", "cuda"); - vec_r->update(rhs, "cpu", "cuda"); - norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "cuda"); norm_b = sqrt(norm_b); matrix_handler->setValuesChanged(true, "cuda"); @@ -199,7 +199,7 @@ int main(int argc, char *argv[]) << std::scientific << std::setprecision(16) << norm_b << "\n"; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); FGMRES->solve(vec_rhs, vec_x); std::cout << "FGMRES: init nrm: " @@ -217,5 +217,16 @@ int main(int argc, char *argv[]) } + delete A; + delete KLU; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_CUDA; + delete matrix_handler; + delete vector_handler; + return 0; } diff --git a/examples/r_KLU_rocSolverRf_FGMRES.cpp b/examples/r_KLU_rocSolverRf_FGMRES.cpp new file mode 100644 index 00000000..32d1865f --- /dev/null +++ b/examples/r_KLU_rocSolverRf_FGMRES.cpp @@ -0,0 +1,205 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use the same data types as those you specified in ReSolve build. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + (void) argc; // TODO: Check if the number of input parameters is correct. 
+ std::string matrixFileName = argv[1]; + std::string rhsFileName = argv[2]; + + index_type numSystems = atoi(argv[3]); + std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + real_type* rhs = nullptr; + real_type* x = nullptr; + + vector_type* vec_rhs; + vector_type* vec_x; + vector_type* vec_r; + + ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2); + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS, "hip"); + + for (int i = 0; i < numSystems; ++i) + { + index_type j = 4 + i * 2; + fileId = argv[j]; + rhsId = argv[j + 1]; + + matrixFileNameFull = ""; + rhsFileNameFull = ""; + + // Read matrix first + matrixFileNameFull = matrixFileName + fileId + ".mtx"; + rhsFileNameFull = rhsFileName + rhsId + ".mtx"; + std::cout << std::endl << std::endl << std::endl; + std::cout << "========================================================================================================================"<getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + + rhs = ReSolve::io::readRhsFromFile(rhs_file); + x = new real_type[A->getNumRows()]; + vec_rhs = new vector_type(A->getNumRows()); + vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); + vec_r = new vector_type(A->getNumRows()); + } + else { + ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); + ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); + } + std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + } else { + matrix_handler->coo2csr(A_coo,A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + } + std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<setupParameters(1, 0.1, false); + } + int status; + real_type norm_b; + if (i < 2){ + KLU->setup(A); + matrix_handler->setValuesChanged(true, "hip"); + status = KLU->analyze(); + std::cout<<"KLU analysis status: "<factorize(); + std::cout<<"KLU factorization status: "<solve(vec_rhs, vec_x); + std::cout<<"KLU solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "hip"); + norm_b = sqrt(norm_b); + matrix_handler->setValuesChanged(true, "hip"); + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); + printf("\t 2-Norm of the residual : %16.16e\n", sqrt(vector_handler->dot(vec_r, vec_r, "hip"))/norm_b); + if (i == 1) { + ReSolve::matrix::Csc* L = (ReSolve::matrix::Csc*) KLU->getLFactor(); + ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor(); + if (L == nullptr) {printf("ERROR");} + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + Rf->setSolveMode(1); + Rf->setup(A, L, U, P, Q, vec_rhs); + Rf->refactorize(); + std::cout<<"about to set FGMRES" <setup(A->getNumRows(), FGMRES->getRestart()); + FGMRES->setup(A); + } + } else { + //status = KLU->refactorize(); + std::cout<<"Using ROCSOLVER RF"<refactorize(); + std::cout<<"ROCSOLVER RF refactorization status: "<solve(vec_rhs, vec_x); + std::cout<<"ROCSOLVER RF solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + norm_b = vector_handler->dot(vec_r, vec_r, "hip"); + norm_b = sqrt(norm_b); + + //matrix_handler->setValuesChanged(true, "hip"); + FGMRES->resetMatrix(A); + FGMRES->setupPreconditioner("LU", Rf); + + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); + real_type rnrm = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + std::cout << "\t 2-Norm of the residual (before IR): " + << std::scientific << std::setprecision(16) + << rnrm/norm_b << "\n"; + + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + if(!std::isnan(rnrm) && !std::isinf(rnrm)) { + FGMRES->solve(vec_rhs, vec_x); + + std::cout << "FGMRES: init nrm: " + << std::scientific << std::setprecision(16) + << FGMRES->getInitResidualNorm()/norm_b + << " final nrm: " + << FGMRES->getFinalResidualNorm()/norm_b + << " iter: " << FGMRES->getNumIter() << "\n"; + } + } + + } // for (int i = 0; i < numSystems; ++i) + + delete A; + delete A_coo; + delete KLU; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_HIP; + delete matrix_handler; + delete vector_handler; + + return 0; +} diff --git a/examples/r_KLU_rocsolverrf.cpp b/examples/r_KLU_rocsolverrf.cpp new file mode 100644 index 00000000..5651ed56 --- /dev/null +++ b/examples/r_KLU_rocsolverrf.cpp @@ -0,0 +1,171 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace ReSolve::constants; + +int main(int argc, char *argv[] ) +{ + // Use the same data types as those you specified in ReSolve build. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + + (void) argc; // TODO: Check if the number of input parameters is correct. 
+ std::string matrixFileName = argv[1]; + std::string rhsFileName = argv[2]; + + index_type numSystems = atoi(argv[3]); + std::cout<<"Family mtx file name: "<< matrixFileName << ", total number of matrices: "<initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + real_type* rhs = nullptr; + real_type* x = nullptr; + + vector_type* vec_rhs; + vector_type* vec_x; + vector_type* vec_r; + + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + + for (int i = 0; i < numSystems; ++i) + { + index_type j = 4 + i * 2; + fileId = argv[j]; + rhsId = argv[j + 1]; + + matrixFileNameFull = ""; + rhsFileNameFull = ""; + + // Read matrix first + matrixFileNameFull = matrixFileName + fileId + ".mtx"; + rhsFileNameFull = rhsFileName + rhsId + ".mtx"; + std::cout << std::endl << std::endl << std::endl; + std::cout << "========================================================================================================================"<getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + + rhs = ReSolve::io::readRhsFromFile(rhs_file); + x = new real_type[A->getNumRows()]; + vec_rhs = new vector_type(A->getNumRows()); + vec_x = new vector_type(A->getNumRows()); + vec_r = new vector_type(A->getNumRows()); + } + else { + ReSolve::io::readAndUpdateMatrix(mat_file, A_coo); + ReSolve::io::readAndUpdateRhs(rhs_file, &rhs); + } + std::cout<<"Finished reading the matrix and rhs, size: "<getNumRows()<<" x "<getNumColumns()<< ", nnz: "<< A->getNnz()<< ", symmetric? "<symmetric()<< ", Expanded? "<expanded()<coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + } else { + matrix_handler->coo2csr(A_coo, A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + } + std::cout<<"COO to CSR completed. 
Expanded NNZ: "<< A->getNnzExpanded()<setupParameters(1, 0.1, false); + } + int status; + if (i < 2){ + KLU->setup(A); + status = KLU->analyze(); + std::cout<<"KLU analysis status: "<factorize(); + std::cout<<"KLU factorization status: "<solve(vec_rhs, vec_x); + std::cout<<"KLU solve status: "<getLFactor(); + ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor(); + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + Rf->setup(A, L, U, P, Q, vec_rhs); + Rf->refactorize(); + } + } else { + std::cout<<"Using rocsolver rf"<refactorize(); + std::cout<<"rocsolver rf refactorization status: "<solve(vec_rhs, vec_x); + std::cout<<"rocsolver rf solve status: "<update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + matrix_handler->setValuesChanged(true, "hip"); + + matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "hip"); + + std::cout << "\t 2-Norm of the residual: " + << std::scientific << std::setprecision(16) + << sqrt(vector_handler->dot(vec_r, vec_r, "hip")) << "\n"; + + } // for (int i = 0; i < numSystems; ++i) + + //now DELETE + delete A; + delete A_coo; + delete KLU; + delete Rf; + delete [] x; + delete [] rhs; + delete vec_r; + delete vec_x; + delete workspace_HIP; + delete matrix_handler; + delete vector_handler; + return 0; +} diff --git a/resolve/CMakeLists.txt b/resolve/CMakeLists.txt index 8dbcc467..b98c8234 100644 --- a/resolve/CMakeLists.txt +++ b/resolve/CMakeLists.txt @@ -14,14 +14,21 @@ set(ReSolve_SRC LinSolverDirectKLU.cpp ) +# Temporary until there is CPU-only option for FGMRES +set(ReSolve_GPU_SRC + GramSchmidt.cpp + LinSolverIterativeFGMRES.cpp +) + # C++ code that links to CUDA SDK libraries set(ReSolve_CUDASDK_SRC - LinSolverIterativeFGMRES.cpp - GramSchmidt.cpp LinSolverDirectCuSolverGLU.cpp LinSolverDirectCuSolverRf.cpp ) - +# HIP files +set(ReSolve_ROCM_SRC + LinSolverDirectRocSolverRf.cpp +) # Header files to be installed set(ReSolve_HEADER_INSTALL Common.hpp @@ -29,6 +36,7 @@ set(ReSolve_HEADER_INSTALL LinSolver.hpp LinSolverDirectCuSolverGLU.hpp LinSolverDirectCuSolverRf.hpp + LinSolverDirectRocSolverRf.hpp LinSolverDirectKLU.hpp LinSolverIterativeFGMRES.hpp RefactorizationSolver.hpp @@ -37,17 +45,6 @@ set(ReSolve_HEADER_INSTALL MemoryUtils.hpp ) -# If GPU support is not enabled, add dummy device backend -if(NOT RESOLVE_USE_GPU) - add_subdirectory(cpu) -endif() - -# If CUDA support is enabled, create CUDA backend -# (this should really be CUDA _API_ backend, separate backend will be needed for CUDA SDK) -if(RESOLVE_USE_CUDA) - add_subdirectory(cuda) -endif() - # Now, build workspaces add_subdirectory(workspace) @@ -55,19 +52,13 @@ add_subdirectory(workspace) add_subdirectory(vector) add_subdirectory(matrix) - # Build shared library ReSolve add_library(resolve_tpl INTERFACE) if(RESOLVE_USE_KLU) - target_link_libraries(resolve_tpl INTERFACE KLU) + target_link_libraries(resolve_tpl INTERFACE KLU) endif(RESOLVE_USE_KLU) -if(RESOLVE_USE_CUDA) - target_link_libraries(resolve_tpl INTERFACE resolve_cuda) -endif(RESOLVE_USE_CUDA) - - set(ReSolve_Targets_List resolve_matrix resolve_vector @@ -76,15 +67,31 @@ set(ReSolve_Targets_List resolve_workspace ) +# Temporary until there is CPU-only option for FGMRES +if(RESOLVE_USE_GPU) + set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_GPU_SRC}) +endif() + # If CUDA support is enabled add CUDA SDK specific code and dependencies if(RESOLVE_USE_CUDA) + add_subdirectory(cuda) + target_link_libraries(resolve_tpl 
INTERFACE resolve_cuda) set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_CUDASDK_SRC}) set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cuda) endif() +# If HIP support is enabled add HIP SDK specific code and dependencies +if(RESOLVE_USE_HIP) + add_subdirectory(hip) + target_link_libraries(resolve_tpl INTERFACE resolve_hip) + set(ReSolve_SRC ${ReSolve_SRC} ${ReSolve_ROCM_SRC}) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_hip) +endif() + # If no GPU support is enabled, link to dummy device backend if(NOT RESOLVE_USE_GPU) - set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) + add_subdirectory(cpu) + set(ReSolve_Targets_List ${ReSolve_Targets_List} resolve_backend_cpu) endif() # Set installable targets diff --git a/resolve/GramSchmidt.cpp b/resolve/GramSchmidt.cpp index b6a27b04..8f6f0850 100644 --- a/resolve/GramSchmidt.cpp +++ b/resolve/GramSchmidt.cpp @@ -36,10 +36,10 @@ namespace ReSolve delete h_L_; delete h_rv_; - vec_rv_->setData(nullptr, "cuda"); - vec_rv_->setData(nullptr, "cpu"); - vec_Hcolumn_->setData(nullptr, "cuda"); - vec_Hcolumn_->setData(nullptr, "cpu"); + vec_rv_->setData(nullptr, memory::DEVICE); + vec_rv_->setData(nullptr, memory::HOST); + vec_Hcolumn_->setData(nullptr, memory::DEVICE); + vec_Hcolumn_->setData(nullptr, memory::HOST); delete [] vec_rv_; delete [] vec_Hcolumn_;; @@ -47,18 +47,18 @@ namespace ReSolve if(variant_ == cgs2) { delete h_aux_; - vec_Hcolumn_->setData(nullptr, "cuda"); - // vec_Hcolumn_->setData(nullptr, "cpu"); + vec_Hcolumn_->setData(nullptr, memory::DEVICE); + // vec_Hcolumn_->setData(nullptr, memory::HOST); delete [] vec_Hcolumn_; } if(variant_ == mgs_pm) { delete h_aux_; } - vec_v_->setData(nullptr, "cuda"); - vec_v_->setData(nullptr, "cpu"); - vec_w_->setData(nullptr, "cuda"); - vec_w_->setData(nullptr, "cpu"); + vec_v_->setData(nullptr, memory::DEVICE); + vec_v_->setData(nullptr, memory::HOST); + vec_w_->setData(nullptr, memory::DEVICE); + vec_w_->setData(nullptr, memory::HOST); delete [] vec_w_; delete [] vec_v_; @@ -103,15 +103,15 @@ namespace ReSolve h_rv_ = new real_type[num_vecs_ + 1]; vec_rv_ = new vector_type(num_vecs_ + 1, 2); - vec_rv_->allocate("cuda"); + vec_rv_->allocate(memory::DEVICE); vec_Hcolumn_ = new vector_type(num_vecs_ + 1); - vec_Hcolumn_->allocate("cuda"); + vec_Hcolumn_->allocate(memory::DEVICE); } if(variant_ == cgs2) { h_aux_ = new real_type[num_vecs_ + 1]; vec_Hcolumn_ = new vector_type(num_vecs_ + 1); - vec_Hcolumn_->allocate("cuda"); + vec_Hcolumn_->allocate(memory::DEVICE); } if(variant_ == mgs_pm) { @@ -127,7 +127,7 @@ namespace ReSolve { using namespace constants; - if (memspace == "cuda") { // or hip + if ((memspace == "cuda") || (memspace == "hip")) { // or hip double t; double s; @@ -135,23 +135,23 @@ namespace ReSolve switch (variant_){ case mgs: - vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); + vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); for(int j = 0; j <= i; ++j) { t = 0.0; - vec_v_->setData( V->getVectorData(j, "cuda"), "cuda"); - t = vector_handler_->dot(vec_v_, vec_w_, "cuda"); + vec_v_->setData( V->getVectorData(j, memory::DEVICE), memory::DEVICE); + t = vector_handler_->dot(vec_v_, vec_w_, memspace); H[ idxmap(i, j, num_vecs_ + 1) ] = t; t *= -1.0; - vector_handler_->axpy(&t, vec_v_, vec_w_, "cuda"); + vector_handler_->axpy(&t, vec_v_, vec_w_, memspace); } t = 0.0; - t = vector_handler_->dot(vec_w_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_w_, vec_w_, memspace); //set the last entry in Hessenberg matrix 
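// (Aside: a minimal, hypothetical sketch of the VectorHandler dispatch that this
// hunk now routes through the `memspace` argument instead of a hard-coded "cuda";
// the vector objects and dimension n below are illustrative, not part of the diff.)
//
//   ReSolve::vector::Vector v(n), w(n);
//   real_type t = vector_handler_->dot(&v, &w, memspace);  // "cuda" or "hip"
//   t = -t;
//   vector_handler_->axpy(&t, &v, &w, memspace);           // w <- w - (v'w) v
//   t = vector_handler_->dot(&w, &w, memspace);            // squared norm of w
//
// The same call sites now serve both GPU backends; the normalization below then
// stores sqrt(w'w) as the last entry of the Hessenberg column and rescales w.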
t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; if(fabs(t) > EPSILON) { t = 1.0/t; - vector_handler_->scal(&t, vec_w_, "cuda"); + vector_handler_->scal(&t, vec_w_, memspace); } else { assert(0 && "Gram-Schmidt failed, vector with ZERO norm\n"); return -1; @@ -159,26 +159,25 @@ namespace ReSolve break; case cgs2: - vec_v_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); - vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_,"cuda"); - + vec_v_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); + vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, memspace); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol - vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" ); + vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace ); // copy H_col to aux, we will need it later - vec_Hcolumn_->setDataUpdated("cuda"); + vec_Hcolumn_->setDataUpdated(memory::DEVICE); vec_Hcolumn_->setCurrentSize(i + 1); - vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, "cpu"); + vec_Hcolumn_->deepCopyVectorData(h_aux_, 0, memory::HOST); //Hcol = V(:,1:i)^T*V(:,i+1); - vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_,"cuda"); + vector_handler_->gemv("T", n, i + 1, &ONE, &ZERO, V, vec_v_, vec_Hcolumn_, memspace); // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol - vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, "cuda" ); + vector_handler_->gemv("N", n, i + 1, &ONE, &MINUSONE, V, vec_Hcolumn_, vec_v_, memspace ); // copy H_col to H - vec_Hcolumn_->setDataUpdated("cuda"); - vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, "cpu"); + vec_Hcolumn_->setDataUpdated(memory::DEVICE); + vec_Hcolumn_->deepCopyVectorData(&H[ idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); // add both pieces together (unstable otherwise, careful here!!) 
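// (Reader's note on the cgs2 variant above: in exact arithmetic the two
// projection passes compute
//     h1 = V'*w,   w <- w - V*h1,   h2 = V'*w,   w <- w - V*h2,
// with h1 stashed in h_aux_; the loop below then accumulates the Hessenberg
// column as H(:,i) = h1 + h2. Keeping only one of the two pieces is exactly
// the "unstable otherwise" case the comment above warns about.)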
t = 0.0; @@ -186,13 +185,13 @@ namespace ReSolve H[ idxmap(i, j, num_vecs_ + 1)] += h_aux_[j]; } - t = vector_handler_->dot(vec_v_, vec_v_, "cuda"); + t = vector_handler_->dot(vec_v_, vec_v_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; if(fabs(t) > EPSILON) { t = 1.0/t; - vector_handler_->scal(&t, vec_v_, "cuda"); + vector_handler_->scal(&t, vec_v_, memspace); } else { assert(0 && "Gram-Schmidt failed, vector with ZERO norm\n"); return -1; @@ -201,16 +200,16 @@ namespace ReSolve break; case mgs_two_synch: // V[1:i]^T[V[i] w] - vec_v_->setData(V->getVectorData(i, "cuda"), "cuda"); - vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); + vec_v_->setData(V->getVectorData(i, memory::DEVICE), memory::DEVICE); + vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); vec_rv_->setCurrentSize(i + 1); - vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda"); - vec_rv_->setDataUpdated("cuda"); - vec_rv_->copyData("cuda", "cpu"); + vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, memspace); + vec_rv_->setDataUpdated(memory::DEVICE); + vec_rv_->copyData(memory::DEVICE, memory::HOST); - vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, "cpu"); - h_rv_ = vec_rv_->getVectorData(1, "cpu"); + vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); + h_rv_ = vec_rv_->getVectorData(1, memory::HOST); for(int j=0; j<=i; ++j) { H[ idxmap(i, j, num_vecs_ + 1) ] = 0.0; @@ -225,17 +224,17 @@ namespace ReSolve H[ idxmap(i, j, num_vecs_ + 1) ] -= s; } // for j vec_Hcolumn_->setCurrentSize(i + 1); - vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], "cpu", "cuda"); - vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda"); + vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); + vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, memspace); // normalize (second synch) - t = vector_handler_->dot(vec_w_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_w_, vec_w_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1)] = t; if(fabs(t) > EPSILON) { t = 1.0 / t; - vector_handler_->scal(&t, vec_w_, "cuda"); + vector_handler_->scal(&t, vec_w_, memspace); } else { assert(0 && "Iterative refinement failed, Krylov vector with ZERO norm\n"); return -1; @@ -243,16 +242,16 @@ namespace ReSolve return 0; break; case mgs_pm: - vec_v_->setData(V->getVectorData(i, "cuda"), "cuda"); - vec_w_->setData(V->getVectorData(i + 1, "cuda"), "cuda"); + vec_v_->setData(V->getVectorData(i, memory::DEVICE), memory::DEVICE); + vec_w_->setData(V->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); vec_rv_->setCurrentSize(i + 1); - vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, "cuda"); - vec_rv_->setDataUpdated("cuda"); - vec_rv_->copyData("cuda", "cpu"); + vector_handler_->massDot2Vec(n, V, i, vec_v_, vec_rv_, memspace); + vec_rv_->setDataUpdated(memory::DEVICE); + vec_rv_->copyData(memory::DEVICE, memory::HOST); - vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, "cpu"); - h_rv_ = vec_rv_->getVectorData(1, "cpu"); + vec_rv_->deepCopyVectorData(&h_L_[idxmap(i, 0, num_vecs_ + 1)], 0, memory::HOST); + h_rv_ = vec_rv_->getVectorData(1, memory::HOST); for(int j = 0; j <= i; ++j) { H[ idxmap(i, j, num_vecs_ + 1) ] = 0.0; @@ -295,17 +294,17 @@ namespace ReSolve } vec_Hcolumn_->setCurrentSize(i + 1); - vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], "cpu", "cuda"); + 
vec_Hcolumn_->update(&H[ idxmap(i, 0, num_vecs_ + 1)], memory::HOST, memory::DEVICE); - vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, "cuda"); + vector_handler_->massAxpy(n, vec_Hcolumn_, i, V, vec_w_, memspace); // normalize (second synch) - t = vector_handler_->dot(vec_w_, vec_w_, "cuda"); + t = vector_handler_->dot(vec_w_, vec_w_, memspace); //set the last entry in Hessenberg matrix t = sqrt(t); H[ idxmap(i, i + 1, num_vecs_ + 1) ] = t; if(fabs(t) > EPSILON) { t = 1.0 / t; - vector_handler_->scal(&t, vec_w_, "cuda"); + vector_handler_->scal(&t, vec_w_, memspace); } else { assert(0 && "Iterative refinement failed, Krylov vector with ZERO norm\n"); return -1; diff --git a/resolve/LinSolver.cpp b/resolve/LinSolver.cpp index 558a6500..5682ec40 100644 --- a/resolve/LinSolver.cpp +++ b/resolve/LinSolver.cpp @@ -13,12 +13,6 @@ namespace ReSolve //destroy the matrix and hadlers } - int LinSolver::setup(matrix::Sparse* A) - { - this->A_ = A; - return 0; - } - real_type LinSolver::evaluateResidual() { //to be implemented @@ -42,6 +36,17 @@ namespace ReSolve delete [] Q_; } + int LinSolverDirect::setup(matrix::Sparse* A, + matrix::Sparse* /* L */, + matrix::Sparse* /* U */, + index_type* /* P */, + index_type* /* Q */, + vector_type* /* rhs */) + { + this->A_ = A; + return 0; + } + int LinSolverDirect::analyze() { return 0; @@ -92,6 +97,11 @@ namespace ReSolve { } + int LinSolverIterative::setup(matrix::Sparse* A) + { + this->A_ = A; + return 0; + } int LinSolverIterative::solve(vector_type* /* rhs */, vector_type* /* init_guess */) { diff --git a/resolve/LinSolver.hpp b/resolve/LinSolver.hpp index 8c9ca5c9..a34aeba0 100644 --- a/resolve/LinSolver.hpp +++ b/resolve/LinSolver.hpp @@ -31,7 +31,6 @@ namespace ReSolve LinSolver(); virtual ~LinSolver(); - virtual int setup(matrix::Sparse* A); real_type evaluateResidual(); protected: @@ -49,6 +48,13 @@ namespace ReSolve LinSolverDirect(); virtual ~LinSolverDirect(); //return 0 if successful! 
+ virtual int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs); + virtual int analyze(); //the same as symbolic factorization virtual int factorize(); virtual int refactorize(); @@ -72,6 +78,7 @@ namespace ReSolve public: LinSolverIterative(); ~LinSolverIterative(); + virtual int setup(matrix::Sparse* A); virtual int solve(vector_type* rhs, vector_type* init_guess); }; diff --git a/resolve/LinSolverDirectCuSolverGLU.cpp b/resolve/LinSolverDirectCuSolverGLU.cpp index 75039ff4..65af5812 100644 --- a/resolve/LinSolverDirectCuSolverGLU.cpp +++ b/resolve/LinSolverDirectCuSolverGLU.cpp @@ -8,6 +8,8 @@ namespace ReSolve { + using vector_type = vector::Vector; + LinSolverDirectCuSolverGLU::LinSolverDirectCuSolverGLU(LinAlgWorkspaceCUDA* workspace) { this->workspace_ = workspace; @@ -22,7 +24,12 @@ namespace ReSolve delete M_; } - int LinSolverDirectCuSolverGLU::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q) + int LinSolverDirectCuSolverGLU::setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* /* rhs */) { int error_sum = 0; @@ -50,14 +57,14 @@ namespace ReSolve n, nnz, descr_A_, - A_->getRowData("cpu"), //kRowPtr_, - A_->getColData("cpu"), //jCol_, + A_->getRowData(memory::HOST), //kRowPtr_, + A_->getColData(memory::HOST), //jCol_, P, /* base-0 */ Q, /* base-0 */ M_->getNnz(), /* nnzM */ descr_M_, - M_->getRowData("cpu"), - M_->getColData("cpu"), + M_->getRowData(memory::HOST), + M_->getColData(memory::HOST), info_M_); error_sum += status_cusolver_; //NOW the buffer @@ -77,9 +84,9 @@ namespace ReSolve /* A is original matrix */ nnz, descr_A_, - A_->getValues("cuda"), //da_, - A_->getRowData("cuda"), //kRowPtr_, - A_->getColData("cuda"), //jCol_, + A_->getValues( memory::DEVICE), //da_, + A_->getRowData(memory::DEVICE), //kRowPtr_, + A_->getColData(memory::DEVICE), //jCol_, info_M_); error_sum += status_cusolver_; @@ -93,15 +100,15 @@ namespace ReSolve { // L and U need to be in CSC format index_type n = L->getNumRows(); - index_type* Lp = L->getColData("cpu"); - index_type* Li = L->getRowData("cpu"); - index_type* Up = U->getColData("cpu"); - index_type* Ui = U->getRowData("cpu"); + index_type* Lp = L->getColData(memory::HOST); + index_type* Li = L->getRowData(memory::HOST); + index_type* Up = U->getColData(memory::HOST); + index_type* Ui = U->getRowData(memory::HOST); index_type nnzM = ( L->getNnz() + U->getNnz() - n ); M_ = new matrix::Csr(n, n, nnzM); - M_->allocateMatrixData("cpu"); - index_type* mia = M_->getRowData("cpu"); - index_type* mja = M_->getColData("cpu"); + M_->allocateMatrixData(memory::HOST); + index_type* mia = M_->getRowData(memory::HOST); + index_type* mja = M_->getColData(memory::HOST); index_type row; for(index_type i = 0; i < n; ++i) { // go through EACH COLUMN OF L first @@ -153,9 +160,9 @@ namespace ReSolve /* A is original matrix */ A_->getNnzExpanded(), descr_A_, - A_->getValues("cuda"), //da_, - A_->getRowData("cuda"), //kRowPtr_, - A_->getColData("cuda"), //jCol_, + A_->getValues( memory::DEVICE), //da_, + A_->getRowData(memory::DEVICE), //kRowPtr_, + A_->getColData(memory::DEVICE), //jCol_, info_M_); error_sum += status_cusolver_; @@ -173,11 +180,11 @@ namespace ReSolve /* A is original matrix */ A_->getNnz(), descr_A_, - A_->getValues("cuda"), //da_, - A_->getRowData("cuda"), //kRowPtr_, - A_->getColData("cuda"), //jCol_, - rhs->getData("cuda"),/* right hand side */ - 
x->getData("cuda"),/* left hand side */ + A_->getValues( memory::DEVICE), //da_, + A_->getRowData(memory::DEVICE), //kRowPtr_, + A_->getColData(memory::DEVICE), //jCol_, + rhs->getData(memory::DEVICE),/* right hand side */ + x->getData(memory::DEVICE),/* left hand side */ &ite_refine_succ_, &r_nrminf_, info_M_, diff --git a/resolve/LinSolverDirectCuSolverGLU.hpp b/resolve/LinSolverDirectCuSolverGLU.hpp index a48c8cba..899f52e3 100644 --- a/resolve/LinSolverDirectCuSolverGLU.hpp +++ b/resolve/LinSolverDirectCuSolverGLU.hpp @@ -32,7 +32,12 @@ namespace ReSolve int refactorize(); int solve(vector_type* rhs, vector_type* x); - int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q); + int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs = nullptr); private: void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from sepeate L, U factors diff --git a/resolve/LinSolverDirectCuSolverRf.cpp b/resolve/LinSolverDirectCuSolverRf.cpp index d51218cc..905a0e6e 100644 --- a/resolve/LinSolverDirectCuSolverRf.cpp +++ b/resolve/LinSolverDirectCuSolverRf.cpp @@ -17,7 +17,12 @@ namespace ReSolve mem_.deleteOnDevice(d_T_); } - int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q) + int LinSolverDirectCuSolverRf::setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* /* rhs */) { //remember - P and Q are generally CPU variables int error_sum = 0; @@ -35,17 +40,17 @@ namespace ReSolve error_sum += status_cusolverrf_; status_cusolverrf_ = cusolverRfSetupDevice(n, A_->getNnzExpanded(), - A_->getRowData("cuda"), //dia_, - A_->getColData("cuda"), //dja_, - A_->getValues("cuda"), //da_, + A_->getRowData(memory::DEVICE), //dia_, + A_->getColData(memory::DEVICE), //dja_, + A_->getValues( memory::DEVICE), //da_, L->getNnz(), - L->getRowData("cuda"), - L->getColData("cuda"), - L->getValues("cuda"), + L->getRowData(memory::DEVICE), + L->getColData(memory::DEVICE), + L->getValues( memory::DEVICE), U->getNnz(), - U->getRowData("cuda"), - U->getColData("cuda"), - U->getValues("cuda"), + U->getRowData(memory::DEVICE), + U->getColData(memory::DEVICE), + U->getValues( memory::DEVICE), d_P_, d_Q_, handle_cusolverrf_); @@ -76,9 +81,9 @@ namespace ReSolve int error_sum = 0; status_cusolverrf_ = cusolverRfResetValues(A_->getNumRows(), A_->getNnzExpanded(), - A_->getRowData("cuda"), //dia_, - A_->getColData("cuda"), //dja_, - A_->getValues("cuda"), //da_, + A_->getRowData(memory::DEVICE), //dia_, + A_->getColData(memory::DEVICE), //dja_, + A_->getValues( memory::DEVICE), //da_, d_P_, d_Q_, handle_cusolverrf_); @@ -100,22 +105,22 @@ namespace ReSolve 1, d_T_, A_->getNumRows(), - rhs->getData("cuda"), + rhs->getData(memory::DEVICE), A_->getNumRows()); return status_cusolverrf_; } int LinSolverDirectCuSolverRf::solve(vector_type* rhs, vector_type* x) { - x->update(rhs->getData("cuda"), "cuda", "cuda"); - x->setDataUpdated("cuda"); + x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE); + x->setDataUpdated(memory::DEVICE); status_cusolverrf_ = cusolverRfSolve(handle_cusolverrf_, d_P_, d_Q_, 1, d_T_, A_->getNumRows(), - x->getData("cuda"), + x->getData(memory::DEVICE), A_->getNumRows()); return status_cusolverrf_; } diff --git a/resolve/LinSolverDirectCuSolverRf.hpp b/resolve/LinSolverDirectCuSolverRf.hpp index f0ee755e..77e8b94f 100644 --- 
a/resolve/LinSolverDirectCuSolverRf.hpp +++ b/resolve/LinSolverDirectCuSolverRf.hpp @@ -26,7 +26,12 @@ namespace ReSolve LinSolverDirectCuSolverRf(); ~LinSolverDirectCuSolverRf(); - int setup(matrix::Sparse* A, matrix::Sparse* L, matrix::Sparse* U, index_type* P, index_type* Q); + int setup(matrix::Sparse* A, + matrix::Sparse* L, + matrix::Sparse* U, + index_type* P, + index_type* Q, + vector_type* rhs = nullptr); void setAlgorithms(cusolverRfFactorization_t fact_alg, cusolverRfTriangularSolve_t solve_alg); diff --git a/resolve/LinSolverDirectKLU.cpp b/resolve/LinSolverDirectKLU.cpp index b3f670c4..6336e9e9 100644 --- a/resolve/LinSolverDirectKLU.cpp +++ b/resolve/LinSolverDirectKLU.cpp @@ -18,7 +18,12 @@ namespace ReSolve klu_free_numeric(&Numeric_, &Common_); } - int LinSolverDirectKLU::setup(matrix::Sparse* A) + int LinSolverDirectKLU::setup(matrix::Sparse* A, + matrix::Sparse* /* L */, + matrix::Sparse* /* U */, + index_type* /* P */, + index_type* /* Q */, + vector_type* /* rhs */) { this->A_ = A; return 0; @@ -35,7 +40,7 @@ namespace ReSolve int LinSolverDirectKLU::analyze() { - Symbolic_ = klu_analyze(A_->getNumRows(), A_->getRowData("cpu"), A_->getColData("cpu"), &Common_) ; + Symbolic_ = klu_analyze(A_->getNumRows(), A_->getRowData(memory::HOST), A_->getColData(memory::HOST), &Common_) ; if (Symbolic_ == nullptr){ printf("Symbolic_ factorization crashed withCommon_.status = %d \n", Common_.status); @@ -46,7 +51,7 @@ namespace ReSolve int LinSolverDirectKLU::factorize() { - Numeric_ = klu_factor(A_->getRowData("cpu"), A_->getColData("cpu"),A_->getValues("cpu"), Symbolic_, &Common_); + Numeric_ = klu_factor(A_->getRowData(memory::HOST), A_->getColData(memory::HOST), A_->getValues(memory::HOST), Symbolic_, &Common_); if (Numeric_ == nullptr){ return 1; @@ -56,7 +61,7 @@ namespace ReSolve int LinSolverDirectKLU::refactorize() { - int kluStatus = klu_refactor (A_->getRowData("cpu"), A_->getColData("cpu"), A_->getValues("cpu"), Symbolic_, Numeric_, &Common_); + int kluStatus = klu_refactor (A_->getRowData(memory::HOST), A_->getColData(memory::HOST), A_->getValues(memory::HOST), Symbolic_, Numeric_, &Common_); if (!kluStatus){ //display error @@ -71,10 +76,10 @@ namespace ReSolve // std::memcpy(x, rhs, A->getNumRows() * sizeof(real_type)); - x->update(rhs->getData("cpu"), "cpu", "cpu"); - x->setDataUpdated("cpu"); + x->update(rhs->getData(memory::HOST), memory::HOST, memory::HOST); + x->setDataUpdated(memory::HOST); - int kluStatus = klu_solve(Symbolic_, Numeric_, A_->getNumRows(), 1, x->getData("cpu"), &Common_); + int kluStatus = klu_solve(Symbolic_, Numeric_, A_->getNumRows(), 1, x->getData(memory::HOST), &Common_); if (!kluStatus){ return 1; @@ -90,16 +95,16 @@ namespace ReSolve L_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzL); U_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzU); - L_->allocateMatrixData("cpu"); - U_->allocateMatrixData("cpu"); + L_->allocateMatrixData(memory::HOST); + U_->allocateMatrixData(memory::HOST); int ok = klu_extract(Numeric_, Symbolic_, - L_->getColData("cpu"), - L_->getRowData("cpu"), - L_->getValues("cpu"), - U_->getColData("cpu"), - U_->getRowData("cpu"), - U_->getValues("cpu"), + L_->getColData(memory::HOST), + L_->getRowData(memory::HOST), + L_->getValues( memory::HOST), + U_->getColData(memory::HOST), + U_->getRowData(memory::HOST), + U_->getValues( memory::HOST), nullptr, nullptr, nullptr, @@ -109,8 +114,8 @@ namespace ReSolve nullptr, &Common_); - L_->setUpdated("cpu"); - U_->setUpdated("cpu"); + 
+    L_->setUpdated(memory::HOST);
+    U_->setUpdated(memory::HOST);
     (void) ok; // TODO: Check status in ok before setting `factors_extracted_`
     factors_extracted_ = true;
   }
@@ -125,16 +130,16 @@
       L_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzL);
       U_ = new matrix::Csc(A_->getNumRows(), A_->getNumColumns(), nnzU);
-      L_->allocateMatrixData("cpu");
-      U_->allocateMatrixData("cpu");
+      L_->allocateMatrixData(memory::HOST);
+      U_->allocateMatrixData(memory::HOST);
       int ok = klu_extract(Numeric_, Symbolic_,
-                           L_->getColData("cpu"),
-                           L_->getRowData("cpu"),
-                           L_->getValues("cpu"),
-                           U_->getColData("cpu"),
-                           U_->getRowData("cpu"),
-                           U_->getValues("cpu"),
+                           L_->getColData(memory::HOST),
+                           L_->getRowData(memory::HOST),
+                           L_->getValues( memory::HOST),
+                           U_->getColData(memory::HOST),
+                           U_->getRowData(memory::HOST),
+                           U_->getValues( memory::HOST),
                            nullptr,
                            nullptr,
                            nullptr,
@@ -144,8 +149,8 @@
                            nullptr,
                            &Common_);
-      L_->setUpdated("cpu");
-      U_->setUpdated("cpu");
+      L_->setUpdated(memory::HOST);
+      U_->setUpdated(memory::HOST);
       (void) ok; // TODO: Check status in ok before setting `factors_extracted_`
       factors_extracted_ = true;
@@ -157,7 +162,8 @@
   {
     if (Numeric_ != nullptr){
       P_ = new index_type[A_->getNumRows()];
-      std::memcpy(P_, Numeric_->Pnum, A_->getNumRows() * sizeof(index_type));
+      size_t nrows = static_cast<size_t>(A_->getNumRows());
+      std::memcpy(P_, Numeric_->Pnum, nrows * sizeof(index_type));
       return P_;
     } else {
       return nullptr;
@@ -169,7 +175,8 @@
   {
     if (Numeric_ != nullptr){
      Q_ = new index_type[A_->getNumRows()];
-      std::memcpy(Q_, Symbolic_->Q, A_->getNumRows() * sizeof(index_type));
+      size_t nrows = static_cast<size_t>(A_->getNumRows());
+      std::memcpy(Q_, Symbolic_->Q, nrows * sizeof(index_type));
       return Q_;
     } else {
       return nullptr;
diff --git a/resolve/LinSolverDirectKLU.hpp b/resolve/LinSolverDirectKLU.hpp
index 13e27b47..b4edadb1 100644
--- a/resolve/LinSolverDirectKLU.hpp
+++ b/resolve/LinSolverDirectKLU.hpp
@@ -24,7 +24,13 @@
     public:
       LinSolverDirectKLU();
       ~LinSolverDirectKLU();
-      int setup(matrix::Sparse* A);
+
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L = nullptr,
+                matrix::Sparse* U = nullptr,
+                index_type* P = nullptr,
+                index_type* Q = nullptr,
+                vector_type* rhs = nullptr);
 
       void setupParameters(int ordering, double KLU_threshold, bool halt_if_singular);
 
diff --git a/resolve/LinSolverDirectRocSolverRf.cpp b/resolve/LinSolverDirectRocSolverRf.cpp
new file mode 100644
index 00000000..96d1da79
--- /dev/null
+++ b/resolve/LinSolverDirectRocSolverRf.cpp
@@ -0,0 +1,412 @@
+#include
+#include
+#include "LinSolverDirectRocSolverRf.hpp"
+#include
+
+namespace ReSolve
+{
+  LinSolverDirectRocSolverRf::LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace)
+  {
+    workspace_ = workspace;
+    infoM_ = nullptr;
+    solve_mode_ = 0; //solve mode - slow mode is default
+  }
+
+  LinSolverDirectRocSolverRf::~LinSolverDirectRocSolverRf()
+  {
+    mem_.deleteOnDevice(d_P_);
+    mem_.deleteOnDevice(d_Q_);
+
+    mem_.deleteOnDevice(d_aux1_);
+    mem_.deleteOnDevice(d_aux2_);
+
+    delete L_csr_;
+    delete U_csr_;
+  }
+
+  int LinSolverDirectRocSolverRf::setup(matrix::Sparse* A,
+                                        matrix::Sparse* L,
+                                        matrix::Sparse* U,
+                                        index_type* P,
+                                        index_type* Q,
+                                        vector_type* rhs)
+  {
+    //remember - P and Q are generally CPU variables
+    int error_sum = 0;
+    this->A_ = (matrix::Csr*) A;
+    index_type n = A_->getNumRows();
+    //set matrix info
+    rocsolver_create_rfinfo(&infoM_, workspace_->getRocblasHandle());
+    //create combined factor
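// (Note: addFactors() below assembles one CSR matrix M that carries both
// triangular factors: the strict lower triangle of M holds L, whose unit
// diagonal is implicit and therefore dropped, and the upper triangle holds U,
// hence nnz(M) = nnz(L) + nnz(U) - n. The rocSOLVER csrrf routines used here
// consume the factors bundled this way.)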
addFactors(L,U); + M_->setUpdated(ReSolve::memory::HOST); + M_->copyData(ReSolve::memory::DEVICE); + mem_.allocateArrayOnDevice(&d_P_, n); + mem_.allocateArrayOnDevice(&d_Q_, n); + + mem_.copyArrayHostToDevice(d_P_, P, n); + mem_.copyArrayHostToDevice(d_Q_, Q, n); + + mem_.deviceSynchronize(); + status_rocblas_ = rocsolver_dcsrrf_analysis(workspace_->getRocblasHandle(), + n, + 1, + A_->getNnzExpanded(), + A_->getRowData(ReSolve::memory::DEVICE), //kRowPtr_, + A_->getColData(ReSolve::memory::DEVICE), //jCol_, + A_->getValues(ReSolve::memory::DEVICE), //vals_, + M_->getNnzExpanded(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), //vals_, + d_P_, + d_Q_, + rhs->getData(ReSolve::memory::DEVICE), + n, + infoM_); + + mem_.deviceSynchronize(); + error_sum += status_rocblas_; + + // tri solve setup + if (solve_mode_ == 1) { // fast mode + L_csr_ = new ReSolve::matrix::Csr(L->getNumRows(), L->getNumColumns(), L->getNnz()); + U_csr_ = new ReSolve::matrix::Csr(U->getNumRows(), U->getNumColumns(), U->getNnz()); + + L_csr_->allocateMatrixData(ReSolve::memory::DEVICE); + U_csr_->allocateMatrixData(ReSolve::memory::DEVICE); + + rocsparse_create_mat_descr(&(descr_L_)); + rocsparse_set_mat_fill_mode(descr_L_, rocsparse_fill_mode_lower); + rocsparse_set_mat_index_base(descr_L_, rocsparse_index_base_zero); + + rocsparse_create_mat_descr(&(descr_U_)); + rocsparse_set_mat_index_base(descr_U_, rocsparse_index_base_zero); + rocsparse_set_mat_fill_mode(descr_U_, rocsparse_fill_mode_upper); + + rocsparse_create_mat_info(&info_L_); + rocsparse_create_mat_info(&info_U_); + + // local variables + size_t L_buffer_size; + size_t U_buffer_size; + + status_rocblas_ = rocsolver_dcsrrf_splitlu(workspace_->getRocblasHandle(), + n, + M_->getNnzExpanded(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + U_csr_->getValues(ReSolve::memory::DEVICE)); + + error_sum += status_rocblas_; + + status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + L_csr_->getNnz(), + descr_L_, + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + &L_buffer_size); + error_sum += status_rocsparse_; + + mem_.allocateBufferOnDevice(&L_buffer_, L_buffer_size); + status_rocsparse_ = rocsparse_dcsrsv_buffer_size(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + U_csr_->getNnz(), + descr_U_, + U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + &U_buffer_size); + error_sum += status_rocsparse_; + mem_.allocateBufferOnDevice(&U_buffer_, U_buffer_size); + + status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + n, + L_csr_->getNnz(), + descr_L_, + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + rocsparse_analysis_policy_force, + rocsparse_solve_policy_auto, + L_buffer_); + error_sum += 
status_rocsparse_;
+      if (status_rocsparse_ != 0) printf("status after analysis 1 %d \n", status_rocsparse_);
+      status_rocsparse_ = rocsparse_dcsrsv_analysis(workspace_->getRocsparseHandle(),
+                                                    rocsparse_operation_none,
+                                                    n,
+                                                    U_csr_->getNnz(),
+                                                    descr_U_,
+                                                    U_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                    U_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                    U_csr_->getColData(ReSolve::memory::DEVICE),
+                                                    info_U_,
+                                                    rocsparse_analysis_policy_force,
+                                                    rocsparse_solve_policy_auto,
+                                                    U_buffer_);
+      error_sum += status_rocsparse_;
+      if (status_rocsparse_ != 0) printf("status after analysis 2 %d \n", status_rocsparse_);
+
+      //allocate aux data
+      mem_.allocateArrayOnDevice(&d_aux1_, n);
+      mem_.allocateArrayOnDevice(&d_aux2_, n);
+    }
+    return error_sum;
+  }
+
+  int LinSolverDirectRocSolverRf::refactorize()
+  {
+    int error_sum = 0;
+    mem_.deviceSynchronize();
+    status_rocblas_ = rocsolver_dcsrrf_refactlu(workspace_->getRocblasHandle(),
+                                                A_->getNumRows(),
+                                                A_->getNnzExpanded(),
+                                                A_->getRowData(ReSolve::memory::DEVICE), //kRowPtr_,
+                                                A_->getColData(ReSolve::memory::DEVICE), //jCol_,
+                                                A_->getValues(ReSolve::memory::DEVICE),  //vals_,
+                                                M_->getNnzExpanded(),
+                                                M_->getRowData(ReSolve::memory::DEVICE),
+                                                M_->getColData(ReSolve::memory::DEVICE),
+                                                M_->getValues(ReSolve::memory::DEVICE),  //OUTPUT,
+                                                d_P_,
+                                                d_Q_,
+                                                infoM_);
+
+    mem_.deviceSynchronize();
+    error_sum += status_rocblas_;
+
+    if (solve_mode_ == 1) {
+      //split M, fill L and U with correct values
+      printf("solve mode 1, splitting the factors again \n");
+      status_rocblas_ = rocsolver_dcsrrf_splitlu(workspace_->getRocblasHandle(),
+                                                 A_->getNumRows(),
+                                                 M_->getNnzExpanded(),
+                                                 M_->getRowData(ReSolve::memory::DEVICE),
+                                                 M_->getColData(ReSolve::memory::DEVICE),
+                                                 M_->getValues(ReSolve::memory::DEVICE),  //vals_,
+                                                 L_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 L_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 L_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 U_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 U_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 U_csr_->getValues(ReSolve::memory::DEVICE));
+
+      mem_.deviceSynchronize();
+      error_sum += status_rocblas_;
+    }
+
+    return error_sum;
+  }
+
+  // solution is returned in RHS
+  int LinSolverDirectRocSolverRf::solve(vector_type* rhs)
+  {
+    int error_sum = 0;
+    if (solve_mode_ == 0) {
+      mem_.deviceSynchronize();
+      status_rocblas_ = rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(),
+                                               A_->getNumRows(),
+                                               1,
+                                               M_->getNnz(),
+                                               M_->getRowData(ReSolve::memory::DEVICE),
+                                               M_->getColData(ReSolve::memory::DEVICE),
+                                               M_->getValues(ReSolve::memory::DEVICE),
+                                               d_P_,
+                                               d_Q_,
+                                               rhs->getData(ReSolve::memory::DEVICE),
+                                               A_->getNumRows(),
+                                               infoM_);
+      error_sum += status_rocblas_;
+      mem_.deviceSynchronize();
+    } else {
+      // fast mode: permute by P, two triangular solves, then permute by Q
+      permuteVectorP(A_->getNumRows(), d_P_, rhs->getData(ReSolve::memory::DEVICE), d_aux1_);
+      mem_.deviceSynchronize();
+      status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(),
+                                                 rocsparse_operation_none,
+                                                 A_->getNumRows(),
+                                                 L_csr_->getNnz(),
+                                                 &(constants::ONE),
+                                                 descr_L_,
+                                                 L_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 L_csr_->getRowData(ReSolve::memory::DEVICE),
+                                                 L_csr_->getColData(ReSolve::memory::DEVICE),
+                                                 info_L_,
+                                                 d_aux1_,
+                                                 d_aux2_, //result
+                                                 rocsparse_solve_policy_auto,
+                                                 L_buffer_);
+      error_sum += status_rocsparse_;
+
+      status_rocsparse_ = rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(),
+                                                 rocsparse_operation_none,
+                                                 A_->getNumRows(),
+                                                 U_csr_->getNnz(),
+                                                 &(constants::ONE),
+                                                 descr_U_,
+                                                 U_csr_->getValues(ReSolve::memory::DEVICE), //vals_,
+                                                 U_csr_->getRowData(ReSolve::memory::DEVICE),
U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + d_aux2_, //input + d_aux1_,//result + rocsparse_solve_policy_auto, + U_buffer_); + error_sum += status_rocsparse_; + + permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,rhs->getData(ReSolve::memory::DEVICE)); + mem_.deviceSynchronize(); + } + return error_sum; + } + + int LinSolverDirectRocSolverRf::solve(vector_type* rhs, vector_type* x) + { + x->update(rhs->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::DEVICE); + x->setDataUpdated(ReSolve::memory::DEVICE); + int error_sum = 0; + if (solve_mode_ == 0) { + mem_.deviceSynchronize(); + status_rocblas_ = rocsolver_dcsrrf_solve(workspace_->getRocblasHandle(), + A_->getNumRows(), + 1, + M_->getNnz(), + M_->getRowData(ReSolve::memory::DEVICE), + M_->getColData(ReSolve::memory::DEVICE), + M_->getValues(ReSolve::memory::DEVICE), + d_P_, + d_Q_, + x->getData(ReSolve::memory::DEVICE), + A_->getNumRows(), + infoM_); + error_sum += status_rocblas_; + mem_.deviceSynchronize(); + } else { + // not implemented yet + + permuteVectorP(A_->getNumRows(), d_P_, rhs->getData(ReSolve::memory::DEVICE), d_aux1_); + mem_.deviceSynchronize(); + + rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + L_csr_->getNnz(), + &(constants::ONE), + descr_L_, + L_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + L_csr_->getRowData(ReSolve::memory::DEVICE), + L_csr_->getColData(ReSolve::memory::DEVICE), + info_L_, + d_aux1_, + d_aux2_, //result + rocsparse_solve_policy_auto, + L_buffer_); + error_sum += status_rocsparse_; + + rocsparse_dcsrsv_solve(workspace_->getRocsparseHandle(), + rocsparse_operation_none, + A_->getNumRows(), + U_csr_->getNnz(), + &(constants::ONE), + descr_U_, + U_csr_->getValues(ReSolve::memory::DEVICE), //vals_, + U_csr_->getRowData(ReSolve::memory::DEVICE), + U_csr_->getColData(ReSolve::memory::DEVICE), + info_U_, + d_aux2_, //input + d_aux1_,//result + rocsparse_solve_policy_auto, + U_buffer_); + error_sum += status_rocsparse_; + + permuteVectorQ(A_->getNumRows(), d_Q_,d_aux1_,x->getData(ReSolve::memory::DEVICE)); + mem_.deviceSynchronize(); + } + return error_sum; + } + + int LinSolverDirectRocSolverRf::setSolveMode(int mode) + { + solve_mode_ = mode; + return 0; + } + + int LinSolverDirectRocSolverRf::getSolveMode() + { + return solve_mode_; + } + + void LinSolverDirectRocSolverRf::addFactors(matrix::Sparse* L, matrix::Sparse* U) + { + // L and U need to be in CSC format + index_type n = L->getNumRows(); + index_type* Lp = L->getColData(ReSolve::memory::HOST); + index_type* Li = L->getRowData(ReSolve::memory::HOST); + index_type* Up = U->getColData(ReSolve::memory::HOST); + index_type* Ui = U->getRowData(ReSolve::memory::HOST); + + index_type nnzM = ( L->getNnz() + U->getNnz() - n ); + M_ = new matrix::Csr(n, n, nnzM); + M_->allocateMatrixData(ReSolve::memory::DEVICE); + M_->allocateMatrixData(ReSolve::memory::HOST); + index_type* mia = M_->getRowData(ReSolve::memory::HOST); + index_type* mja = M_->getColData(ReSolve::memory::HOST); + index_type row; + for(index_type i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(index_type j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + // BUT dont count diagonal twice, important + if(row != i) { + mia[row + 1]++; + } + } + // then each column of U + for(index_type j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mia[row + 1]++; + } + } + // then organize mia_; + mia[0] = 0; + for(index_type i = 1; i < n + 1; i++) { + mia[i] += mia[i - 1]; + } + + 
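// (Note on the second pass below: after the prefix sum above, mia[] holds the
// CSR row offsets of M. The loops that follow walk L and U again, column by
// column, and drop each entry's column index into mja[]; Mshifts[r] counts how
// many slots of row r are already taken, so entry (r, i) lands at
// mja[mia[r] + Mshifts[r]]. Only the sparsity pattern is built here; the
// numerical values of M are filled in by the refactorization.)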
+    std::vector<index_type> Mshifts(static_cast<size_t>(n), 0);
+    for(index_type i = 0; i < n; ++i) {
+      // go through EACH COLUMN OF L first
+      for(int j = Lp[i]; j < Lp[i + 1]; ++j) {
+        row = Li[j];
+        if(row != i) {
+          // place (row, i) where it belongs!
+          mja[mia[row] + Mshifts[static_cast<size_t>(row)]] = i;
+          Mshifts[static_cast<size_t>(row)]++;
+        }
+      }
+      // each column of U next
+      for(index_type j = Up[i]; j < Up[i + 1]; ++j) {
+        row = Ui[j];
+        mja[mia[row] + Mshifts[static_cast<size_t>(row)]] = i;
+        Mshifts[static_cast<size_t>(row)]++;
+      }
+    }
+    //Mshifts.~vector();
+  }
+}// namespace ReSolve
diff --git a/resolve/LinSolverDirectRocSolverRf.hpp b/resolve/LinSolverDirectRocSolverRf.hpp
new file mode 100644
index 00000000..97c95526
--- /dev/null
+++ b/resolve/LinSolverDirectRocSolverRf.hpp
@@ -0,0 +1,79 @@
+#pragma once
+#include "Common.hpp"
+#include "LinSolver.hpp"
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace ReSolve
+{
+  // Forward declaration of vector::Vector class
+  namespace vector
+  {
+    class Vector;
+  }
+
+  // Forward declaration of matrix::Sparse class
+  namespace matrix
+  {
+    class Sparse;
+  }
+
+  class LinSolverDirectRocSolverRf : public LinSolverDirect
+  {
+    using vector_type = vector::Vector;
+
+    public:
+      LinSolverDirectRocSolverRf(LinAlgWorkspaceHIP* workspace);
+      ~LinSolverDirectRocSolverRf();
+
+      int setup(matrix::Sparse* A,
+                matrix::Sparse* L,
+                matrix::Sparse* U,
+                index_type* P,
+                index_type* Q,
+                vector_type* rhs);
+
+      int refactorize();
+      int solve(vector_type* rhs, vector_type* x);
+      int solve(vector_type* rhs); // the solution is returned IN RHS (rhs is overwritten)
+
+      int setSolveMode(int mode); // should probably be enum
+      int getSolveMode(); //should be enum too
+
+    private:
+      rocblas_status status_rocblas_;
+      rocsparse_status status_rocsparse_;
+      index_type* d_P_;
+      index_type* d_Q_;
+
+      MemoryHandler mem_; ///< Device memory manager object
+      LinAlgWorkspaceHIP* workspace_;
+
+      // to be exported to the matrix handler at a later time
+      void addFactors(matrix::Sparse* L, matrix::Sparse* U); //create L+U from separate L, U factors
+      rocsolver_rfinfo infoM_;
+      matrix::Sparse* M_; //the matrix that contains added factors
+      int solve_mode_; // 0 is default and 1 is fast
+
+      // not used by default - for fast solve
+      rocsparse_mat_descr descr_L_{nullptr};
+      rocsparse_mat_descr descr_U_{nullptr};
+
+      rocsparse_mat_info info_L_{nullptr};
+      rocsparse_mat_info info_U_{nullptr};
+
+      void* L_buffer_{nullptr};
+      void* U_buffer_{nullptr};
+
+      ReSolve::matrix::Csr* L_csr_;
+      ReSolve::matrix::Csr* U_csr_;
+
+      real_type* d_aux1_{nullptr};
+      real_type* d_aux2_{nullptr};
+  };
+}
diff --git a/resolve/LinSolverIterativeFGMRES.cpp b/resolve/LinSolverIterativeFGMRES.cpp
index fa63f2d5..40fdb22c 100644
--- a/resolve/LinSolverIterativeFGMRES.cpp
+++ b/resolve/LinSolverIterativeFGMRES.cpp
@@ -10,8 +10,9 @@ namespace ReSolve
 {
   using out = io::Logger;
 
-  LinSolverIterativeFGMRES::LinSolverIterativeFGMRES()
+  LinSolverIterativeFGMRES::LinSolverIterativeFGMRES(std::string memspace)
   {
+    memspace_ = memspace;
     this->matrix_handler_ = nullptr;
     this->vector_handler_ = nullptr;
     tol_ = 1e-14; //default
@@ -25,8 +26,10 @@
   LinSolverIterativeFGMRES::LinSolverIterativeFGMRES(MatrixHandler* matrix_handler,
                                                      VectorHandler* vector_handler,
-                                                     GramSchmidt* gs)
+                                                     GramSchmidt* gs,
+                                                     std::string memspace)
   {
+    memspace_ = memspace;
     this->matrix_handler_ = matrix_handler;
     this->vector_handler_ = vector_handler;
     this->GS_ = gs;
@@ -46,8 +49,10 @@
                                                      index_type conv_cond,
                                                      MatrixHandler* matrix_handler,
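// (The new trailing `memspace` constructor parameter defaults to "cuda", so
// existing CUDA call sites keep compiling unchanged, while passing "hip"
// routes every handler call inside the solver to the ROCm backend. A
// hypothetical construction, with the handler and Gram-Schmidt objects
// assumed to exist already:
//
//   ReSolve::LinSolverIterativeFGMRES fgmres(restart, tol, maxit, conv_cond,
//                                            matrix_handler, vector_handler,
//                                            &gs, "hip");
// )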
VectorHandler* vector_handler, - GramSchmidt* gs) + GramSchmidt* gs, + std::string memspace) { + memspace_ = memspace; this->matrix_handler_ = matrix_handler; this->vector_handler_ = vector_handler; this->GS_ = gs; @@ -82,9 +87,9 @@ namespace ReSolve n_ = A_->getNumRows(); d_V_ = new vector_type(n_, restart_ + 1); - d_V_->allocate("cuda"); + d_V_->allocate(memory::DEVICE); d_Z_ = new vector_type(n_, restart_ + 1); - d_Z_->allocate("cuda"); + d_Z_->allocate(memory::DEVICE); h_H_ = new real_type[restart_ * (restart_ + 1)]; h_c_ = new real_type[restart_]; // needed for givens h_s_ = new real_type[restart_]; // same @@ -113,12 +118,15 @@ namespace ReSolve vector_type* vec_v = new vector_type(n_); vector_type* vec_z = new vector_type(n_); //V[0] = b-A*x_0 + //debug + d_Z_->setToZero(memory::DEVICE); + d_V_->setToZero(memory::DEVICE); - rhs->deepCopyVectorData(d_V_->getData("cuda"), 0, "cuda"); - matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", "cuda"); + rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); + matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE, "csr", memspace_); rnorm = 0.0; - bnorm = vector_handler_->dot(rhs, rhs, "cuda"); - rnorm = vector_handler_->dot(d_V_, d_V_, "cuda"); + bnorm = vector_handler_->dot(rhs, rhs, memspace_); + rnorm = vector_handler_->dot(d_V_, d_V_, memspace_); //rnorm = ||V_1|| rnorm = sqrt(rnorm); @@ -154,7 +162,7 @@ namespace ReSolve // normalize first vector t = 1.0 / rnorm; - vector_handler_->scal(&t, d_V_, "cuda"); + vector_handler_->scal(&t, d_V_, memspace_); // initialize norm history h_rs_[0] = rnorm; i = -1; @@ -166,20 +174,20 @@ namespace ReSolve // Z_i = (LU)^{-1}*V_i - vec_v->setData( d_V_->getVectorData(i, "cuda"), "cuda"); - vec_z->setData( d_Z_->getVectorData(i, "cuda"), "cuda"); + vec_v->setData( d_V_->getVectorData(i, memory::DEVICE), memory::DEVICE); + vec_z->setData( d_Z_->getVectorData(i, memory::DEVICE), memory::DEVICE); this->precV(vec_v, vec_z); mem_.deviceSynchronize(); // V_{i+1}=A*Z_i - vec_v->setData( d_V_->getVectorData(i + 1, "cuda"), "cuda"); + vec_v->setData( d_V_->getVectorData(i + 1, memory::DEVICE), memory::DEVICE); - matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", "cuda"); + matrix_handler_->matvec(A_, vec_z, vec_v, &ONE, &ZERO,"csr", memspace_); // orthogonalize V[i+1], form a column of h_H_ - GS_->orthogonalize(n_, d_V_, h_H_, i, "cuda"); ; + GS_->orthogonalize(n_, d_V_, h_H_, i, memspace_); ; if(i != 0) { for(int k = 1; k <= i; k++) { k1 = k - 1; @@ -188,7 +196,6 @@ namespace ReSolve h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k]; } } // if i!=0 - double Hii = h_H_[i * (restart_ + 1) + i]; double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1]; double gam = sqrt(Hii * Hii + Hii1 * Hii1); @@ -228,8 +235,8 @@ namespace ReSolve // get solution for(j = 0; j <= i; j++) { - vec_z->setData( d_Z_->getVectorData(j, "cuda"), "cuda"); - vector_handler_->axpy(&h_rs_[j], vec_z, x, "cuda"); + vec_z->setData( d_Z_->getVectorData(j, memory::DEVICE), memory::DEVICE); + vector_handler_->axpy(&h_rs_[j], vec_z, x, memspace_); } /* test solution */ @@ -239,9 +246,9 @@ namespace ReSolve outer_flag = 0; } - rhs->deepCopyVectorData(d_V_->getData("cuda"), 0, "cuda"); - matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", "cuda"); - rnorm = vector_handler_->dot(d_V_, d_V_, "cuda"); + rhs->deepCopyVectorData(d_V_->getData(memory::DEVICE), 0, memory::DEVICE); + matrix_handler_->matvec(A_, x, d_V_, &MINUSONE, &ONE,"csr", memspace_); + rnorm = 
vector_handler_->dot(d_V_, d_V_, memspace_);
       // rnorm = ||V_1||
       rnorm = sqrt(rnorm);
@@ -253,9 +260,9 @@
     return 0;
   }
 
-  int LinSolverIterativeFGMRES::setupPreconditioner(std::string name, LinSolverDirect* LU_solver)
+  int LinSolverIterativeFGMRES::setupPreconditioner(std::string type, LinSolverDirect* LU_solver)
   {
-    if (name != "CuSolverRf") {
+    if (type != "LU") {
       out::warning() << "Only cusolverRf tri solve can be used as a preconditioner at this time." << std::endl;
       return 1;
     } else {
@@ -308,7 +315,7 @@
   int LinSolverIterativeFGMRES::resetMatrix(matrix::Sparse* new_matrix)
   {
     A_ = new_matrix;
-    matrix_handler_->setValuesChanged(true, "cuda");
+    matrix_handler_->setValuesChanged(true, memspace_);
     return 0;
   }
 
@@ -317,7 +324,7 @@
   void LinSolverIterativeFGMRES::precV(vector_type* rhs, vector_type* x)
   {
     LU_solver_->solve(rhs, x);
-    // x->update(rhs->getData("cuda"), "cuda", "cuda");
+    // x->update(rhs->getData(memory::DEVICE), memory::DEVICE, memory::DEVICE);
   }
 
   real_type LinSolverIterativeFGMRES::getFinalResidualNorm()
diff --git a/resolve/LinSolverIterativeFGMRES.hpp b/resolve/LinSolverIterativeFGMRES.hpp
index 8b2c722d..a9fc5058 100644
--- a/resolve/LinSolverIterativeFGMRES.hpp
+++ b/resolve/LinSolverIterativeFGMRES.hpp
@@ -13,17 +13,19 @@
       using vector_type = vector::Vector;
 
     public:
-      LinSolverIterativeFGMRES();
+      LinSolverIterativeFGMRES(std::string memspace = "cuda");
       LinSolverIterativeFGMRES( MatrixHandler* matrix_handler,
                                 VectorHandler* vector_handler,
-                                GramSchmidt* gs);
+                                GramSchmidt* gs,
+                                std::string memspace = "cuda");
       LinSolverIterativeFGMRES(index_type restart,
                                real_type tol,
                                index_type maxit,
                                index_type conv_cond,
                                MatrixHandler* matrix_handler,
                                VectorHandler* vector_handler,
-                               GramSchmidt* gs);
+                               GramSchmidt* gs,
+                               std::string memspace = "cuda");
       ~LinSolverIterativeFGMRES();
 
       int solve(vector_type* rhs, vector_type* x);
@@ -48,6 +50,8 @@
     private:
       //remember matrix handler and vector handler are inherited.
+      std::string memspace_;
+
       real_type tol_;
       index_type maxit_;
       index_type restart_;
diff --git a/resolve/MemoryUtils.hpp b/resolve/MemoryUtils.hpp
index 00f3d653..d87c621f 100644
--- a/resolve/MemoryUtils.hpp
+++ b/resolve/MemoryUtils.hpp
@@ -2,6 +2,16 @@
 
 #include
 
+
+namespace ReSolve
+{
+  namespace memory
+  {
+    enum MemorySpace{HOST = 0, DEVICE};
+    enum MemoryDirection{HOST_TO_HOST = 0, HOST_TO_DEVICE, DEVICE_TO_HOST, DEVICE_TO_DEVICE};
+  }
+}
+
 namespace ReSolve
 {
   /**
@@ -44,6 +54,15 @@
       template <typename T, typename I>
       int copyArrayHostToDevice(T* dst, const T* src, I n);
+
+      /// Implemented here as it is always needed
+      template <typename T, typename I>
+      int copyArrayHostToHost(T* dst, const T* src, I n)
+      {
+        size_t nelements = static_cast<size_t>(n);
+        memcpy(dst, src, nelements * sizeof(T));
+        return 0;
+      }
   };
 } // namespace ReSolve
 
@@ -55,7 +74,8 @@
 #include <resolve/cuda/CudaMemory.hpp>
 using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Cuda>;
 #elif defined RESOLVE_USE_HIP
-#error HIP support requested, but not available! Probably a bug in CMake configuration.
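// (Aside: a minimal, illustrative sketch of how the new memory::MemorySpace
// enum is meant to be consumed by callers; the helper function below is
// hypothetical, not part of the diff.)
//
//   void copyInto(double* dst, const double* src, int n,
//                 ReSolve::memory::MemorySpace ms, MemoryHandler& mem)
//   {
//     if (ms == ReSolve::memory::HOST) {
//       mem.copyArrayHostToHost(dst, src, n);    // header-only memcpy wrapper
//     } else {
//       mem.copyArrayHostToDevice(dst, src, n);  // backend-specific wrapper
//     }
//   }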
+#include <resolve/hip/HipMemory.hpp>
+using MemoryHandler = ReSolve::MemoryUtils<ReSolve::memory::Hip>;
 #else
 #error Unrecognized device, probably bug in CMake configuration
 #endif
diff --git a/resolve/cpu/CMakeLists.txt b/resolve/cpu/CMakeLists.txt
index 7105655c..16455315 100644
--- a/resolve/cpu/CMakeLists.txt
+++ b/resolve/cpu/CMakeLists.txt
@@ -19,10 +19,5 @@ set(ReSolve_CPU_HEADER_INSTALL
 add_library(resolve_backend_cpu SHARED ${ReSolve_CPU_SRC})
 target_link_libraries(resolve_backend_cpu PRIVATE resolve_logger)
 
-target_include_directories(resolve_backend_cpu INTERFACE
-  $
-  $
-)
-
 # install include headers
 install(FILES ${ReSolve_CPU_HEADER_INSTALL} DESTINATION include/resolve/cpu)
diff --git a/resolve/cuda/CMakeLists.txt b/resolve/cuda/CMakeLists.txt
index f97267bc..225ea3c6 100644
--- a/resolve/cuda/CMakeLists.txt
+++ b/resolve/cuda/CMakeLists.txt
@@ -27,10 +27,7 @@
 set_source_files_properties(${ReSolve_CUDA_SRC} PROPERTIES LANGUAGE CUDA)
 add_library(resolve_backend_cuda SHARED ${ReSolve_CUDA_SRC})
 target_link_libraries(resolve_backend_cuda PRIVATE resolve_logger)
 target_link_libraries(resolve_backend_cuda PUBLIC resolve_cuda)
-target_include_directories(resolve_backend_cuda INTERFACE
-  $
-  $
-)
 
 # install include headers
 install(FILES ${ReSolve_CUDA_HEADER_INSTALL} DESTINATION include/resolve/cuda)
+
diff --git a/resolve/hip/CMakeLists.txt b/resolve/hip/CMakeLists.txt
new file mode 100644
index 00000000..fb71a3bd
--- /dev/null
+++ b/resolve/hip/CMakeLists.txt
@@ -0,0 +1,33 @@
+#[[
+
+@brief Build ReSolve HIP backend
+
+@author Slaven Peles
+
+]]
+
+set(ReSolve_HIP_SRC
+    hipKernels.hip
+    hipVectorKernels.hip
+    MemoryUtils.hip
+)
+
+set(ReSolve_HIP_HEADER_INSTALL
+    hipKernels.h
+    hipVectorKernels.h
+    HipMemory.hpp
+    hip_check_errors.hpp
+)
+
+set_source_files_properties(${ReSolve_HIP_SRC} PROPERTIES LANGUAGE HIP)
+
+# First create HIP backend
+# (this should really be HIP _API_ backend,
+# separate backend will be needed for HIP SDK)
+add_library(resolve_backend_hip SHARED ${ReSolve_HIP_SRC})
+target_link_libraries(resolve_backend_hip PRIVATE resolve_logger)
+target_link_libraries(resolve_backend_hip PUBLIC resolve_hip)
+
+# install include headers
+install(FILES ${ReSolve_HIP_HEADER_INSTALL} DESTINATION include/resolve/hip)
+
diff --git a/resolve/hip/HipMemory.hpp b/resolve/hip/HipMemory.hpp
new file mode 100644
index 00000000..a6a482a5
--- /dev/null
+++ b/resolve/hip/HipMemory.hpp
@@ -0,0 +1,152 @@
+#pragma once
+
+#include
+#include
+
+#include "hip_check_errors.hpp"
+
+namespace ReSolve
+{
+  namespace memory
+  {
+    /**
+     * @brief Class containing wrappers for HIP API functions.
+     *
+     * All wrappers are implemented as static functions returning integer
+     * error code from HIP API functions.
+     *
+     * @author Slaven Peles
+     */
+    struct Hip
+    {
+      static void deviceSynchronize()
+      {
+        hipDeviceSynchronize();
+      }
+
+      static int getLastDeviceError()
+      {
+        return static_cast<int>(hipGetLastError());
+      }
+
+      /**
+       * @brief deletes variable from device
+       *
+       * @param v - a variable on the device
+       *
+       * @post v is freed from the device
+       */
+      static int deleteOnDevice(void* v)
+      {
+        return checkHipErrors(hipFree(v));
+      }
+
+      /**
+       * @brief allocates array v onto device
+       *
+       * @param v - pointer to the array to be allocated on the device
+       * @param n - number of array elements (int, size_t)
+       *
+       * @tparam T - Array element type
+       * @tparam I - Array index type
+       *
+       * @post v is now an array with size n on the device
+       */
+      template <typename T, typename I>
+      static int allocateArrayOnDevice(T** v, I n)
+      {
+        return checkHipErrors(hipMalloc((void**) v, sizeof(T) * n));
+      }
+
+      /**
+       * @brief allocates buffer v onto device.
+       *
+       * The difference from the array is that buffer size is required in bytes,
+       * not number of elements.
+       *
+       * @param v - pointer to the buffer to be allocated on the device
+       * @param n - size of the buffer in bytes
+       *
+       * @tparam T - Buffer element data type (typically void)
+       * @tparam I - Buffer size type (typically size_t)
+       *
+       * @post v is now a buffer of n bytes
+       */
+      template <typename T, typename I>
+      static int allocateBufferOnDevice(T** v, I n)
+      {
+        return checkHipErrors(hipMalloc((void**) v, n));
+      }
+
+      /**
+       * @brief Sets elements of device array v to zero
+       *
+       * @param v - pointer to the array to be allocated on the device
+       * @param n - number of the array elements to be set to zero
+       *
+       * @tparam T - Array element type
+       * @tparam I - Array index type
+       *
+       * @post First n elements of array v are set to zero
+       */
+      template <typename T, typename I>
+      static int setZeroArrayOnDevice(T* v, I n)
+      {
+        return checkHipErrors(hipMemset(v, 0, sizeof(T) * n));
+      }
+
+      /**
+       * @brief Copies array `src` from device to the array `dst` on the host.
+       *
+       * @param[in] n - size of src array
+       * @param[in] src - array on device
+       * @param[out] dst - array on host
+       *
+       * @pre `src` is a pointer to an allocated array on the device
+       * @pre `dst` is allocated to size >= n on the host
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename T, typename I>
+      static int copyArrayDeviceToHost(T* dst, const T* src, I n)
+      {
+        return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyDeviceToHost));
+      }
+
+      /**
+       * @brief Copies array `src` to the array `dst` on the device.
+       *
+       * @param n - size of src array
+       * @param src - array on device to be copied
+       * @param dst - array on device to be copied onto
+       *
+       * @pre `src` is a pointer to an allocated array on the device
+       * @pre `dst` is allocated to size >= n on the device
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename T, typename I>
+      static int copyArrayDeviceToDevice(T* dst, const T* src, I n)
+      {
+        return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyDeviceToDevice));
+      }
+
+      /**
+       * @brief Copies array `src` from the host to the array `dst` on the device.
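// (Usage sketch for the wrappers in this struct, going through the
// MemoryHandler alias defined in MemoryUtils.hpp; n and the host array h_x
// are illustrative.)
//
//   MemoryHandler mem;                        // MemoryUtils<memory::Hip>
//   real_type* d_x = nullptr;
//   mem.allocateArrayOnDevice(&d_x, n);       // hipMalloc
//   mem.copyArrayHostToDevice(d_x, h_x, n);   // hipMemcpy, host to device
//   mem.setZeroArrayOnDevice(d_x, n);         // hipMemset
//   mem.deleteOnDevice(d_x);                  // hipFree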
+       *
+       * @param n - size of src array
+       * @param src - array on the host to be copied
+       * @param dst - array on the device to be copied onto
+       *
+       * @pre `src` is a pointer to an allocated array on the host
+       * @pre `dst` is allocated to size >= n on the device
+       * @post Content of `dst` is overwritten by the content of `src`
+       */
+      template <typename T, typename I>
+      static int copyArrayHostToDevice(T* dst, const T* src, I n)
+      {
+        return checkHipErrors(hipMemcpy(dst, src, sizeof(T) * n, hipMemcpyHostToDevice));
+      }
+
+    };
+  }
+
+} //namespace ReSolve
diff --git a/resolve/hip/MemoryUtils.hip b/resolve/hip/MemoryUtils.hip
new file mode 100644
index 00000000..bd3c666d
--- /dev/null
+++ b/resolve/hip/MemoryUtils.hip
@@ -0,0 +1,40 @@
+/**
+ * @file MemoryUtils.hip
+ *
+ * This file includes MemoryUtils.tpp and specifies what functions to
+ * instantiate from function templates.
+ *
+ * @author Slaven Peles
+ */
+
+
+#include
+
+#include
+#include
+
+#include
+
+namespace ReSolve
+{
+  template void MemoryUtils<memory::Hip>::deviceSynchronize();
+  template int MemoryUtils<memory::Hip>::getLastDeviceError();
+  template int MemoryUtils<memory::Hip>::deleteOnDevice(void*);
+
+  template int MemoryUtils<memory::Hip>::allocateArrayOnDevice( real_type**, index_type);
+  template int MemoryUtils<memory::Hip>::allocateArrayOnDevice(index_type**, index_type);
+
+  template int MemoryUtils<memory::Hip>::allocateBufferOnDevice(void** v, size_t n);
+
+  template int MemoryUtils<memory::Hip>::setZeroArrayOnDevice( real_type*, index_type);
+
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToHost( real_type*, const real_type*, index_type);
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToHost(index_type*, const index_type*, index_type);
+
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToDevice( real_type*, const real_type*, index_type);
+  template int MemoryUtils<memory::Hip>::copyArrayDeviceToDevice(index_type*, const index_type*, index_type);
+
+  template int MemoryUtils<memory::Hip>::copyArrayHostToDevice( real_type*, const real_type*, index_type);
+  template int MemoryUtils<memory::Hip>::copyArrayHostToDevice(index_type*, const index_type*, index_type);
+
+} //namespace ReSolve
diff --git a/resolve/hip/hipKernels.h b/resolve/hip/hipKernels.h
new file mode 100644
index 00000000..986efc84
--- /dev/null
+++ b/resolve/hip/hipKernels.h
@@ -0,0 +1,25 @@
+void mass_inner_product_two_vectors(int n,
+                                    int i,
+                                    double* vec1,
+                                    double* vec2,
+                                    double* mvec,
+                                    double* result);
+void mass_axpy(int n, int i, double* x, double* y, double* alpha);
+
+//needed for matrix inf nrm
+void matrix_row_sums(int n,
+                     int nnz,
+                     int* a_ia,
+                     double* a_val,
+                     double* result);
+
+// needed for triangular solve
+
+void permuteVectorP(int n,
+                    int* perm_vector,
+                    double* vec_in,
+                    double* vec_out);
+void permuteVectorQ(int n,
+                    int* perm_vector,
+                    double* vec_in,
+                    double* vec_out);
diff --git a/resolve/hip/hipKernels.hip b/resolve/hip/hipKernels.hip
new file mode 100644
index 00000000..abad5b39
--- /dev/null
+++ b/resolve/hip/hipKernels.hip
@@ -0,0 +1,211 @@
+#include "hipKernels.h"
+#define maxk 1024
+#define Tv5 1024
+
+#include
+
+//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1
+__global__ void MassIPTwoVec_kernel(const double* __restrict__ u1,
+                                    const double* __restrict__ u2,
+                                    const double* __restrict__ v,
+                                    double* result,
+                                    const int k,
+                                    const int N)
+{
+  int t = threadIdx.x;
+  int bsize = blockDim.x;
+
+  // assume T threads per thread block (and k reductions to be performed)
+  volatile __shared__ double s_tmp1[Tv5];
+  volatile __shared__ double s_tmp2[Tv5];
+
+  // map between thread index space and the problem index space
+  int j = blockIdx.x;
+  s_tmp1[t] = 0.0f;
+  s_tmp2[t] = 0.0f;
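// (Reader's note: each block j of this kernel computes two dot products,
// v_j . u1 and v_j . u2, in a single pass over the data: the strided loop
// below accumulates per-thread partial sums into the two shared arrays, and
// the tree reduction that follows collapses them so thread 0 can write
// result[j] and result[j + k]. Tv5 is the fixed block size of 1024 threads.)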
+ int nn = t; + double can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } +} + + +//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + +__global__ void massAxpy3_kernel(int N, + int k, + const double* x_data, + double* y_data, + const double* alpha) { + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + + unsigned int t = threadIdx.x; + + __shared__ double s_alpha[maxk]; + if(t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + while (i < N){ + double temp = 0.0; + for(int j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + i += (blockDim.x*gridDim.x); + } +} +__global__ void matrixInfNormPart1(const int n, + const int nnz, + const int* a_ia, + const double* a_val, + double* result) { + + // one thread per row, pass through rows + // and sum + // can be done through atomics + //\sum_{j=1}^m abs(a_{ij}) + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n){ + double sum = 0.0f; + for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x*gridDim.x); + } +} + + +__global__ void permuteVectorP_kernel(const int n, + const int* perm_vector, + const double* vec_in, + double* vec_out){ + + //one thread per vector entry, pass through rows + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx + +#include + +//***************************************************************************// +//**** See VectorKernels.hpp for kernel wrapper functions documentation ****// +//***************************************************************************// + +namespace ReSolve { namespace vector { + +namespace kernels { + // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, + // index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*); + + // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, + // index_type*, real_type*, index_type*, index_type*, real_type*); + + // __global__ void add_const(index_type, index_type, index_type*); + + /** + * @brief CUDA kernel that sets values of an array to a constant. 
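// (Reader's note: the implementation in hipVectorKernels.hip below pairs this
// declaration with a grid-stride loop, and its host wrapper picks the launch
// size by ceiling division, num_blocks = (n + block_size - 1) / block_size,
// so the grid covers all n entries even when n is not a multiple of the
// block size.)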
diff --git a/resolve/hip/hipVectorKernels.h b/resolve/hip/hipVectorKernels.h
new file mode 100644
--- /dev/null
+++ b/resolve/hip/hipVectorKernels.h
+#pragma once
+
+#include <resolve/Common.hpp>
+
+//***************************************************************************//
+//**** See VectorKernels.hpp for kernel wrapper functions documentation ****//
+//***************************************************************************//
+
+namespace ReSolve { namespace vector {
+
+namespace kernels {
+  // __global__ void adapt_diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*,
+  //   index_type*, real_type*, index_type*, index_type*, real_type*, real_type*, real_type*, real_type*);
+
+  // __global__ void adapt_row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*,
+  //   index_type*, real_type*, index_type*, index_type*, real_type*);
+
+  // __global__ void add_const(index_type, index_type, index_type*);
+
+  /**
+   * @brief HIP kernel that sets values of an array to a constant.
+   *
+   * @param[in] n - length of the array
+   * @param[in] val - the value the array is set to
+   * @param[out] arr - a pointer to the array
+   *
+   * @pre `arr` is allocated to size `n`
+   * @post `arr` elements are set to `val`
+   */
+  __global__ void set_const(index_type n, real_type val, real_type* arr);
+
+  // __global__ void add_vecs(index_type, real_type*, real_type, real_type*);
+
+  // __global__ void mult_const(index_type, real_type, real_type*);
+
+  // __global__ void add_diag(index_type, real_type, index_type*, index_type*, real_type*);
+
+  // __global__ void inv_vec_scale(index_type, real_type*, real_type*);
+
+  // __global__ void vec_scale(index_type, real_type*, real_type*);
+
+  // __global__ void concatenate(index_type, index_type, index_type, index_type, real_type*, index_type*, index_type*,
+  //   real_type*, index_type*, index_type*, real_type*, index_type*, index_type*);
+
+  // __global__ void row_scale(index_type, real_type*, index_type*, index_type*, real_type*, real_type*,
+  //   real_type*, real_type*);
+
+  // __global__ void diag_scale(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*,
+  //   index_type*, real_type*, real_type*, real_type*, index_type);
+
+  // __global__ void row_max(index_type, index_type, real_type*, index_type*, index_type*, real_type*, index_type*, index_type*,
+  //   real_type* scale);
+} // namespace kernels
+
+}} // namespace ReSolve::vector
\ No newline at end of file
diff --git a/resolve/hip/hipVectorKernels.hip b/resolve/hip/hipVectorKernels.hip
new file mode 100644
index 00000000..5b3ace30
--- /dev/null
+++ b/resolve/hip/hipVectorKernels.hip
@@ -0,0 +1,29 @@
+#include <hip/hip_runtime.h>
+#include <resolve/vector/VectorKernels.hpp>
+#include <resolve/hip/hipVectorKernels.h>
+
+namespace ReSolve { namespace vector {
+
+namespace kernels {
+
+  __global__ void set_const(index_type n, real_type val, real_type* arr)
+  {
+    index_type i = blockIdx.x * blockDim.x + threadIdx.x;
+    while (i < n)
+    {
+      arr[i] = val;
+      i += blockDim.x * gridDim.x;
+    }
+  }
+} // namespace kernels
+
+void set_array_const(index_type n, real_type val, real_type* arr)
+{
+  index_type num_blocks;
+  index_type block_size = 512;
+  num_blocks = (n + block_size - 1) / block_size;
+  hipLaunchKernelGGL( kernels::set_const, dim3(num_blocks), dim3(block_size), 0, 0, n, val, arr);
+}
+
+
+}} // namespace ReSolve::vector
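
set_array_const is the host-callable wrapper, so the HIP launch syntax stays inside the .hip translation unit and vector code can remain plain C++. A usage sketch, inside some function body; `n` and the device array `d_x` are hypothetical:

    // Set all n entries of the device array d_x to zero.
    ReSolve::vector::set_array_const(n, 0.0, d_x);
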
diff --git a/resolve/hip/hip_check_errors.hpp b/resolve/hip/hip_check_errors.hpp
new file mode 100644
index 00000000..1f483d35
--- /dev/null
+++ b/resolve/hip/hip_check_errors.hpp
@@ -0,0 +1,28 @@
+/**
+ * @file hip_check_errors.hpp
+ *
+ * Contains macro to get error code from HIP functions and to stream
+ * appropriate error output to Re::Solve's logger.
+ *
+ * @author Kasia Swirydowicz
+ * @author Slaven Peles
+ */
+#pragma once
+
+#include <resolve/utilities/logger/Logger.hpp>
+
+template <typename T>
+int check(T result, 
+          char const *const func, 
+          const char *const file,
+          int const line) 
+{
+  if (result) {
+    ReSolve::io::Logger::error() << "HIP error in function "
+                                 << func << " at " << file << ":" << line
+                                 << ", error# " << result << "\n";
+    return -1;
+  }
+  return 0;
+}
+#define checkHipErrors(val) check((val), #val, __FILE__, __LINE__)
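
The macro wraps any call that returns a hipError_t (zero on success) and logs the failing expression together with file and line. A minimal sketch, inside some function body; `n` is hypothetical:

    // Allocate a device array of n doubles and surface any HIP error.
    double* d_x = nullptr;
    if (checkHipErrors(hipMalloc((void**)&d_x, sizeof(double) * n)) != 0) {
      // failure was already logged with the offending call, file, and line
    }
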
diff --git a/resolve/matrix/Coo.cpp b/resolve/matrix/Coo.cpp
index c8caebf6..326eba59 100644
--- a/resolve/matrix/Coo.cpp
+++ b/resolve/matrix/Coo.cpp
@@ -27,52 +27,49 @@ namespace ReSolve
   {
   }
 
-  index_type* matrix::Coo::getRowData(std::string memspace)
+  index_type* matrix::Coo::getRowData(memory::MemorySpace memspace)
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_row_data_;
-    } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+    using namespace ReSolve::memory;
+    copyData(memspace);
+    switch (memspace) {
+      case HOST:
+        return this->h_row_data_;
+      case DEVICE:
         return this->d_row_data_;
-      } else {
+      default:
         return nullptr;
-      }
     }
   }
 
-  index_type* matrix::Coo::getColData(std::string memspace)
+  index_type* matrix::Coo::getColData(memory::MemorySpace memspace)
   {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_col_data_;
-    } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+    using namespace ReSolve::memory;
+    copyData(memspace);
+    switch (memspace) {
+      case HOST:
+        return this->h_col_data_;
+      case DEVICE:
        return this->d_col_data_;
-      } else {
+      default:
        return nullptr;
-      }
    }
  }
 
-  real_type* matrix::Coo::getValues(std::string memspace)
+  real_type* matrix::Coo::getValues(memory::MemorySpace memspace)
  {
-    if (memspace == "cpu") {
-      copyData("cpu");
-      return this->h_val_data_;
-    } else {
-      if (memspace == "cuda") {
-        copyData("cuda");
+    using namespace ReSolve::memory;
+    copyData(memspace);
+    switch (memspace) {
+      case HOST:
+        return this->h_val_data_;
+      case DEVICE:
        return this->d_val_data_;
-      } else {
+      default:
        return nullptr;
-      }
    }
  }
 
-  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut)
+  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
  {
 
    //four cases (for now)
@@ -80,12 +77,12 @@ namespace ReSolve
    if (is_expanded_) {nnz_current = nnz_expanded_;}
    setNotUpdated();
    int control=-1;
-    if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;}
-    if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;}
-    if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;}
+    if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)){ control = 0;}
+    if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;}
+    if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)){ control = 2;}
+    if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;}
 
-    if (memspaceOut == "cpu") {
+    if (memspaceOut == memory::HOST) {
      //check if cpu data allocated
      if (h_row_data_ == nullptr) {
        this->h_row_data_ = new index_type[nnz_current];
@@ -98,7 +95,7 @@ namespace ReSolve
      }
    }
 
-    if (memspaceOut == "cuda") {
+    if (memspaceOut == memory::DEVICE) {
      //check if cuda data allocated
      if (d_row_data_ == nullptr) {
        mem_.allocateArrayOnDevice(&d_row_data_, nnz_current);
@@ -113,14 +110,14 @@ namespace ReSolve
    switch(control)  {
      case 0: //cpu->cpu
-        std::memcpy(h_row_data_, row_data, (nnz_current) * sizeof(index_type));
-        std::memcpy(h_col_data_, col_data, (nnz_current) * sizeof(index_type));
-        std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type));
+        mem_.copyArrayHostToHost(h_row_data_, row_data, nnz_current);
+        mem_.copyArrayHostToHost(h_col_data_, col_data, nnz_current);
+        mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current);
        h_data_updated_ = true;
        owns_cpu_data_ = true;
        owns_cpu_vals_ = true;
        break;
-      case 2://cuda->cpu
+      case 2://gpu->cpu
        mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current);
        mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current);
        mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current);
@@ -128,7 +125,7 @@ namespace ReSolve
        owns_cpu_data_ = true;
        owns_cpu_vals_ = true;
        break;
-      case 1://cpu->cuda
+      case 1://cpu->gpu
        mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current);
        mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current);
        mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current);
@@ -136,7 +133,7 @@ namespace ReSolve
        owns_gpu_data_ = true;
        owns_gpu_vals_ = true;
        break;
-      case 3://cuda->cuda
+      case 3://gpu->gpu
        mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current);
        mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current);
        mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current);
@@ -150,7 +147,7 @@ namespace ReSolve
    return 0;
  }
 
-  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut)
+  index_type matrix::Coo::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut)
  {
    this->destroyMatrixData(memspaceOut);
    this->nnz_ = new_nnz;
@@ -158,13 +155,13 @@ namespace ReSolve
    return i;
  }
 
-  index_type matrix::Coo::allocateMatrixData(std::string memspace)
+  index_type matrix::Coo::allocateMatrixData(memory::MemorySpace memspace)
  {
    index_type nnz_current = nnz_;
    if (is_expanded_) {nnz_current = nnz_expanded_;}
    destroyMatrixData(memspace);//just in case
 
-    if (memspace == "cpu") {
+    if (memspace == memory::HOST) {
      this->h_row_data_ = new index_type[nnz_current];
      std::fill(h_row_data_, h_row_data_ + nnz_current, 0);
      this->h_col_data_ = new index_type[nnz_current];
@@ -176,7 +173,7 @@ namespace ReSolve
      return 0;
    }
 
-    if (memspace == "cuda") {
+    if 
(memspace == memory::DEVICE) { mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -187,55 +184,57 @@ namespace ReSolve return -1; } - int matrix::Coo::copyData(std::string memspaceOut) + int matrix::Coo::copyData(memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; index_type nnz_current = nnz_; - if (is_expanded_) {nnz_current = nnz_expanded_;} - - if (memspaceOut == "cpu") { - //check if we need to copy or not - if ((d_data_updated_ == true) && (h_data_updated_ == false)) { - if (h_row_data_ == nullptr) { - h_row_data_ = new index_type[nnz_current]; - } - if (h_col_data_ == nullptr) { - h_col_data_ = new index_type[nnz_current]; - } - if (h_val_data_ == nullptr) { - h_val_data_ = new real_type[nnz_current]; - } - mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); - h_data_updated_ = true; - owns_cpu_data_ = true; - owns_cpu_vals_ = true; - } - return 0; + if (is_expanded_) { + nnz_current = nnz_expanded_; } - if (memspaceOut == "cuda") { - if ((d_data_updated_ == false) && (h_data_updated_ == true)) { - if (d_row_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); - } - if (d_col_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + switch (memspaceOut) { + case HOST: + if ((d_data_updated_ == true) && (h_data_updated_ == false)) { + if (h_row_data_ == nullptr) { + h_row_data_ = new index_type[nnz_current]; + } + if (h_col_data_ == nullptr) { + h_col_data_ = new index_type[nnz_current]; + } + if (h_val_data_ == nullptr) { + h_val_data_ = new real_type[nnz_current]; + } + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + h_data_updated_ = true; + owns_cpu_data_ = true; + owns_cpu_vals_ = true; } - if (d_val_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + return 0; + case DEVICE: + if ((d_data_updated_ == false) && (h_data_updated_ == true)) { + if (d_row_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + } + if (d_col_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + } + if (d_val_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + } + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + d_data_updated_ = true; + owns_gpu_data_ = true; + owns_gpu_vals_ = true; } - mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); - mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); - mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); - d_data_updated_ = true; - owns_gpu_data_ = true; - owns_gpu_vals_ = true; - } - return 0; - } - return -1; + return 0; + default: + return -1; + } // switch } void matrix::Coo::print() diff --git a/resolve/matrix/Coo.hpp b/resolve/matrix/Coo.hpp index 3ec045c3..bc67ceef 100644 --- a/resolve/matrix/Coo.hpp +++ b/resolve/matrix/Coo.hpp @@ -15,18 +15,18 @@ namespace ReSolve { namespace matrix { bool expanded); ~Coo(); - virtual index_type* getRowData(std::string memspace); - 
virtual index_type* getColData(std::string memspace); - virtual real_type* getValues(std::string memspace); + virtual index_type* getRowData(memory::MemorySpace memspace); + virtual index_type* getColData(memory::MemorySpace memspace); + virtual real_type* getValues( memory::MemorySpace memspace); - virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); - virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); + virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + virtual index_type updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); - virtual index_type allocateMatrixData(std::string memspace); + virtual index_type allocateMatrixData(memory::MemorySpace memspace); virtual void print(); - virtual int copyData(std::string memspaceOut); + virtual int copyData(memory::MemorySpace memspaceOut); }; }} // namespace ReSolve::matrix diff --git a/resolve/matrix/Csc.cpp b/resolve/matrix/Csc.cpp index 1a305e03..e6fed07c 100644 --- a/resolve/matrix/Csc.cpp +++ b/resolve/matrix/Csc.cpp @@ -24,64 +24,61 @@ namespace ReSolve { } - index_type* matrix::Csc::getRowData(std::string memspace) + index_type* matrix::Csc::getRowData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_row_data_; - } else { - if (memspace == "cuda") { - copyData("cuda"); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_row_data_; + case DEVICE: return this->d_row_data_; - } else { + default: return nullptr; - } } } - index_type* matrix::Csc::getColData(std::string memspace) + index_type* matrix::Csc::getColData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_col_data_; - } else { - if (memspace == "cuda") { - copyData("cuda"); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_col_data_; + case DEVICE: return this->d_col_data_; - } else { + default: return nullptr; - } } } - real_type* matrix::Csc::getValues(std::string memspace) + real_type* matrix::Csc::getValues(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_val_data_; - } else { - if (memspace == "cuda") { - copyData("cuda"); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_val_data_; + case DEVICE: return this->d_val_data_; - } else { + default: return nullptr; - } } } - int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) + int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} //four cases (for now) int control=-1; setNotUpdated(); - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut 
== "cuda")){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;} + if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)) { control = 2;} + if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == memory::HOST) { //check if cpu data allocated if (h_col_data_ == nullptr) { this->h_col_data_ = new index_type[n_ + 1]; @@ -94,7 +91,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_col_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); @@ -109,14 +106,14 @@ namespace ReSolve switch(control) { case 0: //cpu->cpu - std::memcpy(h_col_data_, col_data, (n_ + 1) * sizeof(index_type)); - std::memcpy(h_row_data_, row_data, (nnz_current) * sizeof(index_type)); - std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_col_data_, col_data, n_ + 1); + mem_.copyArrayHostToHost(h_row_data_, row_data, nnz_current); + mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_col_data_, col_data, n_ + 1); mem_.copyArrayDeviceToHost(h_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -124,7 +121,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_col_data_, col_data, n_ + 1); mem_.copyArrayHostToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -132,7 +129,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_col_data_, col_data, n_ + 1); mem_.copyArrayDeviceToDevice(d_row_data_, row_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -147,7 +144,7 @@ namespace ReSolve } - int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) + int matrix::Csc::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { this->destroyMatrixData(memspaceOut); this->nnz_ = new_nnz; @@ -155,13 +152,13 @@ namespace ReSolve return i; } - int matrix::Csc::allocateMatrixData(std::string memspace) + int matrix::Csc::allocateMatrixData(memory::MemorySpace memspace) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} destroyMatrixData(memspace);//just in case - if (memspace == "cpu") { + if (memspace == memory::HOST) { this->h_col_data_ = new index_type[n_ + 1]; std::fill(h_col_data_, h_col_data_ + n_ + 1, 0); this->h_row_data_ = new index_type[nnz_current]; @@ -173,7 +170,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if (memspace == memory::DEVICE) { mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -184,54 +181,56 @@ namespace ReSolve return -1; } - int matrix::Csc::copyData(std::string memspaceOut) + int 
matrix::Csc::copyData(memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; index_type nnz_current = nnz_; - if (is_expanded_) {nnz_current = nnz_expanded_;} - - if (memspaceOut == "cpu") { - //check if we need to copy or not - if ((d_data_updated_ == true) && (h_data_updated_ == false)) { - if (h_col_data_ == nullptr) { - h_col_data_ = new index_type[n_ + 1]; - } - if (h_row_data_ == nullptr) { - h_row_data_ = new index_type[nnz_current]; - } - if (h_val_data_ == nullptr) { - h_val_data_ = new real_type[nnz_current]; - } - mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, n_ + 1); - mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); - h_data_updated_ = true; - owns_cpu_data_ = true; - owns_cpu_vals_ = true; - } - return 0; + if (is_expanded_) { + nnz_current = nnz_expanded_; } - if (memspaceOut == "cuda") { - if ((d_data_updated_ == false) && (h_data_updated_ == true)) { - if (d_col_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); + switch(memspaceOut) { + case HOST: + if ((d_data_updated_ == true) && (h_data_updated_ == false)) { + if (h_col_data_ == nullptr) { + h_col_data_ = new index_type[n_ + 1]; + } + if (h_row_data_ == nullptr) { + h_row_data_ = new index_type[nnz_current]; + } + if (h_val_data_ == nullptr) { + h_val_data_ = new real_type[nnz_current]; + } + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, n_ + 1); + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + h_data_updated_ = true; + owns_cpu_data_ = true; + owns_cpu_vals_ = true; } - if (d_row_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + return 0; + case DEVICE: + if ((d_data_updated_ == false) && (h_data_updated_ == true)) { + if (d_col_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_col_data_, n_ + 1); + } + if (d_row_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_row_data_, nnz_current); + } + if (d_val_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + } + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, n_ + 1); + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + d_data_updated_ = true; + owns_gpu_data_ = true; + owns_gpu_vals_ = true; } - if (d_val_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); - } - mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, n_ + 1); - mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, nnz_current); - mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); - d_data_updated_ = true; - owns_gpu_data_ = true; - owns_gpu_vals_ = true; - } - return 0; - } - return -1; + return 0; + default: + return -1; + } // switch } } diff --git a/resolve/matrix/Csc.hpp b/resolve/matrix/Csc.hpp index f0598314..8a5dc551 100644 --- a/resolve/matrix/Csc.hpp +++ b/resolve/matrix/Csc.hpp @@ -15,18 +15,18 @@ namespace ReSolve { namespace matrix { bool expanded); ~Csc(); - virtual index_type* getRowData(std::string memspace); - virtual index_type* getColData(std::string memspace); - virtual real_type* getValues(std::string memspace); + virtual index_type* getRowData(memory::MemorySpace memspace); + virtual index_type* getColData(memory::MemorySpace memspace); + virtual real_type* getValues( memory::MemorySpace memspace); - virtual int updateData(index_type* row_data, index_type* col_data, 
real_type* val_data, std::string memspaceIn, std::string memspaceOut); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); - virtual int allocateMatrixData(std::string memspace); + virtual int allocateMatrixData(memory::MemorySpace memspace); virtual void print() {return;} - virtual int copyData(std::string memspaceOut); + virtual int copyData(memory::MemorySpace memspaceOut); }; diff --git a/resolve/matrix/Csr.cpp b/resolve/matrix/Csr.cpp index f1ddd31f..0c08b641 100644 --- a/resolve/matrix/Csr.cpp +++ b/resolve/matrix/Csr.cpp @@ -24,64 +24,61 @@ namespace ReSolve { } - index_type* matrix::Csr::getRowData(std::string memspace) + index_type* matrix::Csr::getRowData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_row_data_; - } else { - if (memspace == "cuda") { - copyData("cuda"); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_row_data_; + case DEVICE: return this->d_row_data_; - } else { + default: return nullptr; - } } } - index_type* matrix::Csr::getColData(std::string memspace) + index_type* matrix::Csr::getColData(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_col_data_; - } else { - if (memspace == "cuda") { - copyData("cuda"); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_col_data_; + case DEVICE: return this->d_col_data_; - } else { + default: return nullptr; - } } } - real_type* matrix::Csr::getValues(std::string memspace) + real_type* matrix::Csr::getValues(memory::MemorySpace memspace) { - if (memspace == "cpu") { - copyData("cpu"); - return this->h_val_data_; - } else { - if (memspace == "cuda") { - copyData("cuda"); + using namespace ReSolve::memory; + copyData(memspace); + switch (memspace) { + case HOST: + return this->h_val_data_; + case DEVICE: return this->d_val_data_; - } else { + default: return nullptr; - } } } - int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) + int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { //four cases (for now) index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} setNotUpdated(); int control = -1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && ((memspaceOut == memory::DEVICE))){ control = 1;} + if (((memspaceIn == memory::DEVICE)) && (memspaceOut == memory::HOST)) { control = 2;} + if (((memspaceIn == memory::DEVICE)) && ((memspaceOut == memory::DEVICE))){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == 
memory::HOST) { //check if cpu data allocated if (h_row_data_ == nullptr) { this->h_row_data_ = new index_type[n_ + 1]; @@ -94,7 +91,7 @@ namespace ReSolve } } - if (memspaceOut == "cuda") { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_row_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); @@ -111,14 +108,14 @@ namespace ReSolve //copy switch(control) { case 0: //cpu->cpu - std::memcpy(h_row_data_, row_data, (n_ + 1) * sizeof(index_type)); - std::memcpy(h_col_data_, col_data, (nnz_current) * sizeof(index_type)); - std::memcpy(h_val_data_, val_data, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_row_data_, row_data, n_ + 1); + mem_.copyArrayHostToHost(h_col_data_, col_data, nnz_current); + mem_.copyArrayHostToHost(h_val_data_, val_data, nnz_current); h_data_updated_ = true; owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 2://cuda->cpu + case 2://gpu->cpu mem_.copyArrayDeviceToHost(h_row_data_, row_data, n_ + 1); mem_.copyArrayDeviceToHost(h_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToHost(h_val_data_, val_data, nnz_current); @@ -126,7 +123,7 @@ namespace ReSolve owns_cpu_data_ = true; owns_cpu_vals_ = true; break; - case 1://cpu->cuda + case 1://cpu->gpu mem_.copyArrayHostToDevice(d_row_data_, row_data, n_ + 1); mem_.copyArrayHostToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayHostToDevice(d_val_data_, val_data, nnz_current); @@ -134,7 +131,7 @@ namespace ReSolve owns_gpu_data_ = true; owns_gpu_vals_ = true; break; - case 3://cuda->cuda + case 3://gpu->gpu mem_.copyArrayDeviceToDevice(d_row_data_, row_data, n_ + 1); mem_.copyArrayDeviceToDevice(d_col_data_, col_data, nnz_current); mem_.copyArrayDeviceToDevice(d_val_data_, val_data, nnz_current); @@ -148,7 +145,7 @@ namespace ReSolve return 0; } - int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) + int matrix::Csr::updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { this->destroyMatrixData(memspaceOut); this->nnz_ = new_nnz; @@ -156,13 +153,13 @@ namespace ReSolve return i; } - int matrix::Csr::allocateMatrixData(std::string memspace) + int matrix::Csr::allocateMatrixData(memory::MemorySpace memspace) { index_type nnz_current = nnz_; if (is_expanded_) {nnz_current = nnz_expanded_;} destroyMatrixData(memspace);//just in case - if (memspace == "cpu") { + if (memspace == memory::HOST) { this->h_row_data_ = new index_type[n_ + 1]; std::fill(h_row_data_, h_row_data_ + n_ + 1, 0); this->h_col_data_ = new index_type[nnz_current]; @@ -174,7 +171,7 @@ namespace ReSolve return 0; } - if (memspace == "cuda") { + if (memspace == memory::DEVICE) { mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -185,54 +182,58 @@ namespace ReSolve return -1; } - int matrix::Csr::copyData(std::string memspaceOut) + int matrix::Csr::copyData(memory::MemorySpace memspaceOut) { - index_type nnz_current = nnz_; - if (is_expanded_) {nnz_current = nnz_expanded_;} + using namespace ReSolve::memory; - if (memspaceOut == "cpu") { - //check if we need to copy or not - if ((d_data_updated_ == true) && (h_data_updated_ == false)) { - if (h_row_data_ == nullptr) { - h_row_data_ = new index_type[n_ + 1]; - } - if (h_col_data_ == nullptr) { - 
h_col_data_ = new index_type[nnz_current]; - } - if (h_val_data_ == nullptr) { - h_val_data_ = new real_type[nnz_current]; - } - mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, n_ + 1); - mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); - mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); - h_data_updated_ = true; - owns_cpu_data_ = true; - owns_cpu_vals_ = true; - } - return 0; + index_type nnz_current = nnz_; + if (is_expanded_) { + nnz_current = nnz_expanded_; } - if (memspaceOut == "cuda") { - if ((d_data_updated_ == false) && (h_data_updated_ == true)) { - if (d_row_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); - } - if (d_col_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + switch (memspaceOut) { + case HOST: + //check if we need to copy or not + if ((d_data_updated_ == true) && (h_data_updated_ == false)) { + if (h_row_data_ == nullptr) { + h_row_data_ = new index_type[n_ + 1]; + } + if (h_col_data_ == nullptr) { + h_col_data_ = new index_type[nnz_current]; + } + if (h_val_data_ == nullptr) { + h_val_data_ = new real_type[nnz_current]; + } + mem_.copyArrayDeviceToHost(h_row_data_, d_row_data_, n_ + 1); + mem_.copyArrayDeviceToHost(h_col_data_, d_col_data_, nnz_current); + mem_.copyArrayDeviceToHost(h_val_data_, d_val_data_, nnz_current); + h_data_updated_ = true; + owns_cpu_data_ = true; + owns_cpu_vals_ = true; } - if (d_val_data_ == nullptr) { - mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + return 0; + case DEVICE: + if ((d_data_updated_ == false) && (h_data_updated_ == true)) { + if (d_row_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_row_data_, n_ + 1); + } + if (d_col_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_col_data_, nnz_current); + } + if (d_val_data_ == nullptr) { + mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); + } + mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, n_ + 1); + mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); + mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); + d_data_updated_ = true; + owns_gpu_data_ = true; + owns_gpu_vals_ = true; } - mem_.copyArrayHostToDevice(d_row_data_, h_row_data_, n_ + 1); - mem_.copyArrayHostToDevice(d_col_data_, h_col_data_, nnz_current); - mem_.copyArrayHostToDevice(d_val_data_, h_val_data_, nnz_current); - d_data_updated_ = true; - owns_gpu_data_ = true; - owns_gpu_vals_ = true; - } - return 0; - } - return -1; + return 0; + default: + return -1; + } // switch } } // namespace ReSolve diff --git a/resolve/matrix/Csr.hpp b/resolve/matrix/Csr.hpp index 43c317de..a5d8f682 100644 --- a/resolve/matrix/Csr.hpp +++ b/resolve/matrix/Csr.hpp @@ -18,18 +18,18 @@ namespace ReSolve { namespace matrix { ~Csr(); - virtual index_type* getRowData(std::string memspace); - virtual index_type* getColData(std::string memspace); - virtual real_type* getValues(std::string memspace); + virtual index_type* getRowData(memory::MemorySpace memspace); + virtual index_type* getColData(memory::MemorySpace memspace); + virtual real_type* getValues( memory::MemorySpace memspace); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut); - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut); + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace 
memspaceIn, memory::MemorySpace memspaceOut);
+      virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut);
 
-      virtual int allocateMatrixData(std::string memspace);
+      virtual int allocateMatrixData(memory::MemorySpace memspace);
 
       virtual void print() {return;}
 
-      virtual int copyData(std::string memspaceOut);
+      virtual int copyData(memory::MemorySpace memspaceOut);
   };
 
 }} // namespace ReSolve::matrix
diff --git a/resolve/matrix/MatrixHandler.cpp b/resolve/matrix/MatrixHandler.cpp
index 8bf4302c..b2d4339f 100644
--- a/resolve/matrix/MatrixHandler.cpp
+++ b/resolve/matrix/MatrixHandler.cpp
@@ -13,6 +13,9 @@
 #ifdef RESOLVE_USE_CUDA
 #include "MatrixHandlerCuda.hpp"
 #endif
+#ifdef RESOLVE_USE_HIP
+#include "MatrixHandlerHip.hpp"
+#endif
 
 namespace ReSolve {
   // Create a shortcut name for Logger static class
@@ -41,6 +44,7 @@ namespace ReSolve {
   {
     if (isCpuEnabled_) delete cpuImpl_;
     if (isCudaEnabled_) delete cudaImpl_;
+    if (isHipEnabled_) delete hipImpl_;
   }
 
   /**
@@ -74,12 +78,31 @@ namespace ReSolve {
   }
 #endif
 
+#ifdef RESOLVE_USE_HIP
+  /**
+   * @brief Constructor taking pointer to the HIP workspace as its parameter.
+   *
+   * @post A CPU implementation instance is created because it is cheap and
+   * it does not require a workspace.
+   *
+   * @post A HIP implementation instance is created with supplied workspace.
+   */
+  MatrixHandler::MatrixHandler(LinAlgWorkspaceHIP* new_workspace)
+  {
+    cpuImpl_ = new MatrixHandlerCpu();
+    hipImpl_ = new MatrixHandlerHip(new_workspace);
+    isCpuEnabled_ = true;
+    isHipEnabled_ = true;
+  }
+#endif
 
   void MatrixHandler::setValuesChanged(bool isValuesChanged, std::string memspace)
   {
     if (memspace == "cpu") {
       cpuImpl_->setValuesChanged(isValuesChanged);
     } else if (memspace == "cuda") {
       cudaImpl_->setValuesChanged(isValuesChanged);
+    } else if (memspace == "hip") {
+      hipImpl_->setValuesChanged(isValuesChanged);
     } else {
       out::error() << "Unsupported device " << memspace << "\n";
     }
@@ -101,9 +124,9 @@ namespace ReSolve {
     index_type* nnz_counts = new index_type[n];
     std::fill_n(nnz_counts, n, 0);
-    index_type* coo_rows = A_coo->getRowData("cpu");
-    index_type* coo_cols = A_coo->getColData("cpu");
-    real_type* coo_vals = A_coo->getValues("cpu");
+    index_type* coo_rows = A_coo->getRowData(memory::HOST);
+    index_type* coo_cols = A_coo->getColData(memory::HOST);
+    real_type* coo_vals = A_coo->getValues( memory::HOST);
 
     index_type* diag_control = new index_type[n]; //for DEDUPLICATION of the diagonal
     std::fill_n(diag_control, n, 0);
@@ -226,10 +249,12 @@ namespace ReSolve {
 #endif
     A_csr->setNnz(nnz_no_duplicates);
     if (memspace == "cpu"){
-      A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cpu");
+      A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::HOST);
     } else {
       if (memspace == "cuda"){
-        A_csr->updateData(csr_ia, csr_ja, csr_a, "cpu", "cuda");
+        A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::DEVICE);
+      } else if (memspace == "hip"){
+        A_csr->updateData(csr_ia, csr_ja, csr_a, memory::HOST, memory::DEVICE);
       } else {
         //display error
       }
@@ -269,6 +294,8 @@ namespace ReSolve {
       return cudaImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else if (memspace == "cpu") {
       return cpuImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
+    } else if (memspace == "hip") {
+      return hipImpl_->matvec(A, vec_x, vec_result, alpha, beta, matrixFormat);
     } else {
       out::error() << "Support for device " << memspace << " not implemented (yet)" << 
std::endl; return 1; @@ -280,6 +307,8 @@ namespace ReSolve { { if (memspace == "cuda") { return cudaImpl_->csc2csr(A_csc, A_csr); + } else if (memspace == "hip") { + return hipImpl_->csc2csr(A_csc, A_csr); } else if (memspace == "cpu") { out::warning() << "Using untested csc2csr on CPU ..." << std::endl; return cpuImpl_->csc2csr(A_csc, A_csr); diff --git a/resolve/matrix/MatrixHandler.hpp b/resolve/matrix/MatrixHandler.hpp index 398a8039..cec61085 100644 --- a/resolve/matrix/MatrixHandler.hpp +++ b/resolve/matrix/MatrixHandler.hpp @@ -18,6 +18,7 @@ namespace ReSolve } class LinAlgWorkspaceCpu; class LinAlgWorkspaceCUDA; + class LinAlgWorkspaceHIP; class MatrixHandlerImpl; } @@ -48,6 +49,7 @@ namespace ReSolve { MatrixHandler(); MatrixHandler(LinAlgWorkspaceCpu* workspace); MatrixHandler(LinAlgWorkspaceCUDA* workspace); + MatrixHandler(LinAlgWorkspaceHIP* workspace); ~MatrixHandler(); int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr, std::string memspace); @@ -70,9 +72,11 @@ namespace ReSolve { MemoryHandler mem_; ///< Device memory manager object MatrixHandlerImpl* cpuImpl_{nullptr}; ///< Pointer to CPU implementation MatrixHandlerImpl* cudaImpl_{nullptr}; ///< Pointer to CUDA implementation + MatrixHandlerImpl* hipImpl_{nullptr}; ///< Pointer to HIP implementation bool isCpuEnabled_{false}; ///< true if CPU implementation is instantiated bool isCudaEnabled_{false}; ///< true if CUDA implementation is instantiated + bool isHipEnabled_{false}; ///< true if HIP implementation is instantiated }; } // namespace ReSolve diff --git a/resolve/matrix/MatrixHandlerCpu.cpp b/resolve/matrix/MatrixHandlerCpu.cpp index 2c434dcb..d4799ffd 100644 --- a/resolve/matrix/MatrixHandlerCpu.cpp +++ b/resolve/matrix/MatrixHandlerCpu.cpp @@ -45,12 +45,12 @@ namespace ReSolve { // int error_sum = 0; if (matrixFormat == "csr") { matrix::Csr* A = (matrix::Csr*) Ageneric; - index_type* ia = A->getRowData("cpu"); - index_type* ja = A->getColData("cpu"); - real_type* a = A->getValues("cpu"); + index_type* ia = A->getRowData(memory::HOST); + index_type* ja = A->getColData(memory::HOST); + real_type* a = A->getValues( memory::HOST); - real_type* x_data = vec_x->getData("cpu"); - real_type* result_data = vec_result->getData("cpu"); + real_type* x_data = vec_x->getData(memory::HOST); + real_type* result_data = vec_result->getData(memory::HOST); real_type sum; real_type y; real_type t; @@ -70,7 +70,7 @@ namespace ReSolve { sum *= (*alpha); result_data[i] = result_data[i]*(*beta) + sum; } - vec_result->setDataUpdated("cpu"); + vec_result->setDataUpdated(memory::HOST); return 0; } else { out::error() << "MatVec not implemented (yet) for " @@ -100,13 +100,13 @@ namespace ReSolve { index_type nnz = A_csc->getNnz(); index_type n = A_csc->getNumColumns(); - index_type* rowIdxCsc = A_csc->getRowData("cpu"); - index_type* colPtrCsc = A_csc->getColData("cpu"); - real_type* valuesCsc = A_csc->getValues("cpu"); + index_type* rowIdxCsc = A_csc->getRowData(memory::HOST); + index_type* colPtrCsc = A_csc->getColData(memory::HOST); + real_type* valuesCsc = A_csc->getValues( memory::HOST); - index_type* rowPtrCsr = A_csr->getRowData("cpu"); - index_type* colIdxCsr = A_csr->getColData("cpu"); - real_type* valuesCsr = A_csr->getValues("cpu"); + index_type* rowPtrCsr = A_csr->getRowData(memory::HOST); + index_type* colIdxCsr = A_csr->getColData(memory::HOST); + real_type* valuesCsr = A_csr->getValues( memory::HOST); // Set all CSR row pointers to zero for (index_type i = 0; i <= n; ++i) { diff --git a/resolve/matrix/MatrixHandlerCpu.hpp 
b/resolve/matrix/MatrixHandlerCpu.hpp
index 0b0afbd3..b6e66066 100644
--- a/resolve/matrix/MatrixHandlerCpu.hpp
+++ b/resolve/matrix/MatrixHandlerCpu.hpp
@@ -50,7 +50,7 @@ namespace ReSolve {
     LinAlgWorkspaceCpu* workspace_{nullptr};
     bool values_changed_{true}; ///< needed for matvec
 
-    MemoryHandler mem_; ///< Device memory manager object
+    // MemoryHandler mem_; ///< Device memory manager object not used for now
   };
 
 } // namespace ReSolve
diff --git a/resolve/matrix/MatrixHandlerCuda.cpp b/resolve/matrix/MatrixHandlerCuda.cpp
index 3405ba8d..e0ac7bb4 100644
--- a/resolve/matrix/MatrixHandlerCuda.cpp
+++ b/resolve/matrix/MatrixHandlerCuda.cpp
@@ -42,11 +42,11 @@ namespace ReSolve {
     cusparseStatus_t status;
     LinAlgWorkspaceCUDA* workspaceCUDA = workspace_;
     cusparseDnVecDescr_t vecx = workspaceCUDA->getVecX();
-    cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData("cuda"), CUDA_R_64F);
+    cusparseCreateDnVec(&vecx, A->getNumRows(), vec_x->getData(memory::DEVICE), CUDA_R_64F);
 
     cusparseDnVecDescr_t vecAx = workspaceCUDA->getVecY();
-    cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData("cuda"), CUDA_R_64F);
+    cusparseCreateDnVec(&vecAx, A->getNumRows(), vec_result->getData(memory::DEVICE), CUDA_R_64F);
 
     cusparseSpMatDescr_t matA = workspaceCUDA->getSpmvMatrixDescriptor();
 
@@ -57,9 +57,9 @@ namespace ReSolve {
                         A->getNumRows(),
                         A->getNumColumns(),
                         A->getNnzExpanded(),
-                        A->getRowData("cuda"),
-                        A->getColData("cuda"),
-                        A->getValues("cuda"),
+                        A->getRowData(memory::DEVICE),
+                        A->getColData(memory::DEVICE),
+                        A->getValues( memory::DEVICE),
                         CUSPARSE_INDEX_32I,
                         CUSPARSE_INDEX_32I,
                         CUSPARSE_INDEX_BASE_ZERO,
@@ -105,7 +105,7 @@ namespace ReSolve {
       if (status) out::error() << "Matvec status: " << status
                                << "Last error code: " << mem_.getLastDeviceError() << std::endl;
-      vec_result->setDataUpdated("cuda");
+      vec_result->setDataUpdated(memory::DEVICE);
 
       cusparseDestroyDnVec(vecx);
       cusparseDestroyDnVec(vecAx);
@@ -127,7 +127,7 @@ namespace ReSolve {
     index_type error_sum = 0;
     LinAlgWorkspaceCUDA* workspaceCUDA = (LinAlgWorkspaceCUDA*) workspace_;
 
-    A_csr->allocateMatrixData("cuda");
+    A_csr->allocateMatrixData(memory::DEVICE);
     index_type n = A_csc->getNumRows();
     index_type m = A_csc->getNumRows();
     index_type nnz = A_csc->getNnz();
@@ -137,12 +137,12 @@ namespace ReSolve {
                                     n,
                                     m,
                                     nnz,
-                                    A_csc->getValues("cuda"),
-                                    A_csc->getColData("cuda"),
-                                    A_csc->getRowData("cuda"),
-                                    A_csr->getValues("cuda"),
-                                    A_csr->getRowData("cuda"),
-                                    A_csr->getColData("cuda"),
+                                    A_csc->getValues( memory::DEVICE),
+                                    A_csc->getColData(memory::DEVICE),
+                                    A_csc->getRowData(memory::DEVICE),
+                                    A_csr->getValues( memory::DEVICE),
+                                    A_csr->getRowData(memory::DEVICE),
+                                    A_csr->getColData(memory::DEVICE),
                                     CUDA_R_64F,
                                     CUSPARSE_ACTION_NUMERIC,
                                     CUSPARSE_INDEX_BASE_ZERO,
@@ -154,12 +154,12 @@ namespace ReSolve {
                          n,
                          m,
                          nnz,
-                         A_csc->getValues("cuda"),
-                         A_csc->getColData("cuda"),
-                         A_csc->getRowData("cuda"),
-                         A_csr->getValues("cuda"),
-                         A_csr->getRowData("cuda"),
-                         A_csr->getColData("cuda"),
+                         A_csc->getValues( memory::DEVICE),
+                         A_csc->getColData(memory::DEVICE),
+                         A_csc->getRowData(memory::DEVICE),
+                         A_csr->getValues( memory::DEVICE),
+                         A_csr->getRowData(memory::DEVICE),
+                         A_csr->getColData(memory::DEVICE),
                          CUDA_R_64F,
                          CUSPARSE_ACTION_NUMERIC,
                          CUSPARSE_INDEX_BASE_ZERO,
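
Before the HIP handler itself, a sketch of how these pieces are wired together from user code. This assumes the HIP workspace API mirrors the CUDA one (initializeHandles creating the rocSPARSE handle) and the public matvec signature (A, x, result, alpha, beta, format, memspace) used elsewhere in ReSolve; `A`, `vec_x`, and `vec_r` are hypothetical, already populated objects:

    ReSolve::LinAlgWorkspaceHIP workspace;
    workspace.initializeHandles();                      // assumed to create the rocSPARSE handle
    ReSolve::MatrixHandler matrix_handler(&workspace);  // instantiates HIP and CPU impls
    ReSolve::real_type one  = 1.0;
    ReSolve::real_type zero = 0.0;
    // r = 1.0 * A * x + 0.0 * r, executed on the device through rocsparse_dcsrmv
    matrix_handler.matvec(A, vec_x, vec_r, &one, &zero, "csr", "hip");
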
diff --git a/resolve/matrix/MatrixHandlerHip.cpp b/resolve/matrix/MatrixHandlerHip.cpp
new file mode 100644
index 00000000..ff10e973
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerHip.cpp
@@ -0,0 +1,155 @@
+#include <algorithm>
+
+#include <resolve/utilities/logger/Logger.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/workspace/LinAlgWorkspaceHIP.hpp>
+#include "MatrixHandlerHip.hpp"
+
+namespace ReSolve {
+  // Create a shortcut name for Logger static class
+  using out = io::Logger;
+
+  MatrixHandlerHip::~MatrixHandlerHip()
+  {
+  }
+
+  MatrixHandlerHip::MatrixHandlerHip(LinAlgWorkspaceHIP* new_workspace)
+  {
+    workspace_ = new_workspace;
+  }
+
+  void MatrixHandlerHip::setValuesChanged(bool values_changed)
+  {
+    values_changed_ = values_changed;
+  }
+
+
+  int MatrixHandlerHip::matvec(matrix::Sparse* Ageneric,
+                               vector_type* vec_x,
+                               vector_type* vec_result,
+                               const real_type* alpha,
+                               const real_type* beta,
+                               std::string matrixFormat)
+  {
+    using namespace constants;
+    int error_sum = 0;
+    if (matrixFormat == "csr") {
+      matrix::Csr* A = dynamic_cast<matrix::Csr*>(Ageneric);
+      //result = alpha *A*x + beta * result
+      rocsparse_status status;
+      LinAlgWorkspaceHIP* workspaceHIP = workspace_;
+
+      rocsparse_handle handle_rocsparse = workspaceHIP->getRocsparseHandle();
+
+      rocsparse_mat_info infoA = workspaceHIP->getSpmvMatrixInfo();
+      rocsparse_mat_descr descrA = workspaceHIP->getSpmvMatrixDescriptor();
+
+      if (!workspaceHIP->matvecSetup()) {
+        //setup first, allocate, etc.
+        rocsparse_create_mat_descr(&(descrA));
+        rocsparse_set_mat_index_base(descrA, rocsparse_index_base_zero);
+        rocsparse_set_mat_type(descrA, rocsparse_matrix_type_general);
+
+        rocsparse_create_mat_info(&infoA);
+
+        status = rocsparse_dcsrmv_analysis(handle_rocsparse,
+                                           rocsparse_operation_none,
+                                           A->getNumRows(),
+                                           A->getNumColumns(),
+                                           A->getNnzExpanded(),
+                                           descrA,
+                                           A->getValues( memory::DEVICE),
+                                           A->getRowData(memory::DEVICE),
+                                           A->getColData(memory::DEVICE), // DEVICE here is the HIP GPU
+                                           infoA);
+        error_sum += status;
+        mem_.deviceSynchronize();
+
+        workspaceHIP->setSpmvMatrixDescriptor(descrA);
+        workspaceHIP->setSpmvMatrixInfo(infoA);
+        workspaceHIP->matvecSetupDone();
+      }
+
+      status = rocsparse_dcsrmv(handle_rocsparse,
+                                rocsparse_operation_none,
+                                A->getNumRows(),
+                                A->getNumColumns(),
+                                A->getNnzExpanded(),
+                                alpha,
+                                descrA,
+                                A->getValues( memory::DEVICE),
+                                A->getRowData(memory::DEVICE),
+                                A->getColData(memory::DEVICE),
+                                infoA,
+                                vec_x->getData(memory::DEVICE),
+                                beta,
+                                vec_result->getData(memory::DEVICE));
+
+      error_sum += status;
+      mem_.deviceSynchronize();
+      if (status)
+        out::error() << "Matvec status: " << status
+                     << "Last error code: " << mem_.getLastDeviceError() << std::endl;
+      vec_result->setDataUpdated(memory::DEVICE);
+
+      return error_sum;
+    } else {
+      out::error() << "MatVec not implemented (yet) for "
+                   << matrixFormat << " matrix format." << std::endl;
+      return 1;
+    }
+  }
+
+  int MatrixHandlerHip::Matrix1Norm(matrix::Sparse* /* A */, real_type* /* norm */)
+  {
+    return -1;
+  }
+
+  int MatrixHandlerHip::csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr)
+  {
+    index_type error_sum = 0;
+    LinAlgWorkspaceHIP* workspaceHIP = (LinAlgWorkspaceHIP*) workspace_;
+
+    rocsparse_status status;
+
+    A_csr->allocateMatrixData(memory::DEVICE);
+    index_type n = A_csc->getNumRows();
+    index_type m = A_csc->getNumRows();
+    index_type nnz = A_csc->getNnz();
+    size_t bufferSize;
+    void* d_work;
+
+    status = rocsparse_csr2csc_buffer_size(workspaceHIP->getRocsparseHandle(),
+                                           n,
+                                           m,
+                                           nnz,
+                                           A_csc->getColData(memory::DEVICE),
+                                           A_csc->getRowData(memory::DEVICE),
+                                           rocsparse_action_numeric,
+                                           &bufferSize);
+
+    error_sum += status;
+    mem_.allocateBufferOnDevice(&d_work, bufferSize);
+
+    status = rocsparse_dcsr2csc(workspaceHIP->getRocsparseHandle(),
+                                n,
+                                m,
+                                nnz,
+                                A_csc->getValues( memory::DEVICE),
+                                A_csc->getColData(memory::DEVICE),
+                                A_csc->getRowData(memory::DEVICE),
+                                A_csr->getValues( memory::DEVICE),
+                                A_csr->getRowData(memory::DEVICE),
+                                A_csr->getColData(memory::DEVICE),
+                                rocsparse_action_numeric,
+                                rocsparse_index_base_zero,
+                                d_work);
+    error_sum += status;
+    mem_.deleteOnDevice(d_work);
+    return error_sum;
+  }
+
+} // namespace ReSolve
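
The rocSPARSE calls above return rocsparse_status, which is accumulated into error_sum (rocsparse_status_success is 0, so any nonzero sum flags a failure somewhere in the sequence). A hypothetical helper in the spirit of hip_check_errors.hpp, shown only as a sketch and not part of this patch, would report failures at the call site instead:

    // Hypothetical: log a failed rocSPARSE call and convert its status to 0/1.
    inline int checkRocsparseStatus(rocsparse_status st, const char* what)
    {
      if (st != rocsparse_status_success) {
        ReSolve::io::Logger::error() << what << " failed, rocsparse status = "
                                     << st << "\n";
        return 1;
      }
      return 0;
    }
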
diff --git a/resolve/matrix/MatrixHandlerHip.hpp b/resolve/matrix/MatrixHandlerHip.hpp
new file mode 100644
index 00000000..37f11a7b
--- /dev/null
+++ b/resolve/matrix/MatrixHandlerHip.hpp
@@ -0,0 +1,60 @@
+#pragma once
+#include <resolve/Common.hpp>
+#include <resolve/matrix/MatrixHandlerImpl.hpp>
+#include <resolve/MemoryUtils.hpp>
+
+namespace ReSolve
+{
+  namespace vector
+  {
+    class Vector;
+  }
+  namespace matrix
+  {
+    class Sparse;
+    class Coo;
+    class Csc;
+    class Csr;
+  }
+  class LinAlgWorkspaceHIP;
+}
+
+
+namespace ReSolve {
+  /**
+   * @class MatrixHandlerHip
+   *
+   * @brief HIP implementation of the matrix handler.
+ */ + class MatrixHandlerHip : public MatrixHandlerImpl + { + using vector_type = vector::Vector; + + public: + + MatrixHandlerHip(LinAlgWorkspaceHIP* workspace); + virtual ~MatrixHandlerHip(); + + int csc2csr(matrix::Csc* A_csc, matrix::Csr* A_csr); + + virtual int matvec(matrix::Sparse* A, + vector_type* vec_x, + vector_type* vec_result, + const real_type* alpha, + const real_type* beta, + std::string matrix_type); + + virtual int Matrix1Norm(matrix::Sparse *A, real_type* norm); + + void setValuesChanged(bool isValuesChanged); + + private: + + LinAlgWorkspaceHIP* workspace_{nullptr}; + bool values_changed_{true}; ///< needed for matvec + + MemoryHandler mem_; ///< Device memory manager object + }; + +} // namespace ReSolve + diff --git a/resolve/matrix/Sparse.cpp b/resolve/matrix/Sparse.cpp index 5c866386..faa86e11 100644 --- a/resolve/matrix/Sparse.cpp +++ b/resolve/matrix/Sparse.cpp @@ -73,8 +73,8 @@ namespace ReSolve { namespace matrix { Sparse::~Sparse() { - this->destroyMatrixData("cpu"); - this->destroyMatrixData("cuda"); + this->destroyMatrixData(memory::HOST); + this->destroyMatrixData(memory::DEVICE); } void Sparse::setNotUpdated() @@ -133,58 +133,59 @@ namespace ReSolve { namespace matrix { this->nnz_ = nnz_new; } - int Sparse::setUpdated(std::string what) + int Sparse::setUpdated(memory::MemorySpace memspace) { - if (what == "cpu") - { - h_data_updated_ = true; - d_data_updated_ = false; - } else { - if (what == "cuda"){ + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + h_data_updated_ = true; + d_data_updated_ = false; + break; + case DEVICE: d_data_updated_ = true; h_data_updated_ = false; - } else { - return -1; - } + break; } return 0; } - int Sparse::setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspace) + int Sparse::setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspace) { + using namespace ReSolve::memory; setNotUpdated(); - if (memspace == "cpu"){ - this->h_row_data_ = row_data; - this->h_col_data_ = col_data; - this->h_val_data_ = val_data; - h_data_updated_ = true; - } else { - if (memspace == "cuda"){ + switch (memspace) { + case HOST: + this->h_row_data_ = row_data; + this->h_col_data_ = col_data; + this->h_val_data_ = val_data; + h_data_updated_ = true; + break; + case DEVICE: this->d_row_data_ = row_data; this->d_col_data_ = col_data; this->d_val_data_ = val_data; d_data_updated_ = true; - } else { - return -1; - } + break; } return 0; } - int Sparse::destroyMatrixData(std::string memspace) - { - if (memspace == "cpu"){ - if (owns_cpu_data_) { - delete [] h_row_data_; - delete [] h_col_data_; - } - if (owns_cpu_vals_) { - delete [] h_val_data_; - } - } else { - if (memspace == "cuda"){ + int Sparse::destroyMatrixData(memory::MemorySpace memspace) + { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + if (owns_cpu_data_) { + delete [] h_row_data_; + delete [] h_col_data_; + } + if (owns_cpu_vals_) { + delete [] h_val_data_; + } + return 0; + case DEVICE: if (owns_gpu_data_) { mem_.deleteOnDevice(d_row_data_); mem_.deleteOnDevice(d_col_data_); @@ -192,14 +193,13 @@ namespace ReSolve { namespace matrix { if (owns_gpu_vals_) { mem_.deleteOnDevice(d_val_data_); } - } else { + return 0; + default: return -1; - } } - return 0; } - int Sparse::updateValues(real_type* new_vals, std::string memspaceIn, std::string memspaceOut) + int Sparse::updateValues(real_type* new_vals, memory::MemorySpace memspaceIn, memory::MemorySpace 
memspaceOut) { index_type nnz_current = nnz_; @@ -207,19 +207,19 @@ namespace ReSolve { namespace matrix { //four cases (for now) setNotUpdated(); int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::DEVICE)){ control = 1;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST)) { control = 2;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::DEVICE)){ control = 3;} - if (memspaceOut == "cpu") { + if (memspaceOut == memory::HOST) { //check if cpu data allocated if (h_val_data_ == nullptr) { this->h_val_data_ = new real_type[nnz_current]; } } - if (memspaceOut == "cuda") { + if (memspaceOut == memory::DEVICE) { //check if cuda data allocated if (d_val_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_val_data_, nnz_current); @@ -228,7 +228,7 @@ namespace ReSolve { namespace matrix { switch(control) { case 0: //cpu->cpu - std::memcpy(h_val_data_, new_vals, (nnz_current) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_val_data_, new_vals, nnz_current); h_data_updated_ = true; owns_cpu_vals_ = true; break; @@ -253,21 +253,22 @@ namespace ReSolve { namespace matrix { return 0; } - int Sparse::setNewValues(real_type* new_vals, std::string memspace) + int Sparse::setNewValues(real_type* new_vals, memory::MemorySpace memspace) { - + using namespace ReSolve::memory; setNotUpdated(); - if (memspace == "cpu"){ - this->h_val_data_ = new_vals; - h_data_updated_ = true; - } else { - if (memspace == "cuda"){ + switch (memspace) { + case HOST: + this->h_val_data_ = new_vals; + h_data_updated_ = true; + break; + case DEVICE: this->d_val_data_ = new_vals; d_data_updated_ = true; - } else { + break; + default: return -1; - } } return 0; } diff --git a/resolve/matrix/Sparse.hpp b/resolve/matrix/Sparse.hpp index 1196c38e..96121acb 100644 --- a/resolve/matrix/Sparse.hpp +++ b/resolve/matrix/Sparse.hpp @@ -31,31 +31,31 @@ namespace ReSolve { namespace matrix { void setExpanded(bool expanded); void setNnzExpanded(index_type nnz_expanded_new); void setNnz(index_type nnz_new); // for resetting when removing duplicates - index_type setUpdated(std::string what); + index_type setUpdated(memory::MemorySpace what); - virtual index_type* getRowData(std::string memspace) = 0; - virtual index_type* getColData(std::string memspace) = 0; - virtual real_type* getValues(std::string memspace) = 0; + virtual index_type* getRowData(memory::MemorySpace memspace) = 0; + virtual index_type* getColData(memory::MemorySpace memspace) = 0; + virtual real_type* getValues( memory::MemorySpace memspace) = 0; - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspaceIn, std::string memspaceOut) = 0; - virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, std::string memspaceIn, std::string memspaceOut) = 0; + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) = 0; + virtual int updateData(index_type* row_data, index_type* col_data, real_type* val_data, index_type new_nnz, memory::MemorySpace memspaceIn, 
memory::MemorySpace memspaceOut) = 0; - virtual int allocateMatrixData(std::string memspace) = 0; - int setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, std::string memspace); + virtual int allocateMatrixData(memory::MemorySpace memspace) = 0; + int setMatrixData(index_type* row_data, index_type* col_data, real_type* val_data, memory::MemorySpace memspace); - int destroyMatrixData(std::string memspace); + int destroyMatrixData(memory::MemorySpace memspace); virtual void print() = 0; - virtual int copyData(std::string memspaceOut) = 0; + virtual int copyData(memory::MemorySpace memspaceOut) = 0; //update Values just updates values; it allocates if necessary. //values have the same dimensions between different formats - virtual int updateValues(real_type* new_vals, std::string memspaceIn, std::string memspaceOut); + virtual int updateValues(real_type* new_vals, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); //set new values just sets the pointer, use caution. - virtual int setNewValues(real_type* new_vals, std::string memspace); + virtual int setNewValues(real_type* new_vals, memory::MemorySpace memspace); protected: //size diff --git a/resolve/matrix/io.cpp b/resolve/matrix/io.cpp index 36fb5f1b..0d96a5e1 100644 --- a/resolve/matrix/io.cpp +++ b/resolve/matrix/io.cpp @@ -53,7 +53,7 @@ namespace ReSolve { namespace io { coo_vals[i] = c; i++; } - A->setMatrixData(coo_rows, coo_cols, coo_vals, "cpu"); + A->setMatrixData(coo_rows, coo_cols, coo_vals, memory::HOST); return A; } @@ -116,9 +116,9 @@ namespace ReSolve { namespace io { } A->setNnz(nnz); //create coo arrays - index_type* coo_rows = A->getRowData("cpu"); - index_type* coo_cols = A->getColData("cpu"); - real_type* coo_vals = A->getValues("cpu"); + index_type* coo_rows = A->getRowData(memory::HOST); + index_type* coo_cols = A->getColData(memory::HOST); + real_type* coo_vals = A->getValues( memory::HOST); i = 0; index_type a, b; real_type c; @@ -171,7 +171,7 @@ namespace ReSolve { namespace io { int writeVectorToFile(vector_type* vec_x, std::ostream& file_out) { - real_type* x_data = vec_x->getData("cpu"); + real_type* x_data = vec_x->getData(memory::HOST); // std::ofstream file_out (filename, std::ofstream::out); file_out << "%%MatrixMarket matrix array real general \n"; file_out << "% ID: XXX \n"; diff --git a/resolve/resolve_defs.hpp.in b/resolve/resolve_defs.hpp.in index 9756376c..15cd5791 100644 --- a/resolve/resolve_defs.hpp.in +++ b/resolve/resolve_defs.hpp.in @@ -1,4 +1,7 @@ -#pragma once +// #pragma once + +#ifndef __RESOLVE_DEFINITIONS_HPP__ +#define __RESOLVE_DEFINITIONS_HPP__ #cmakedefine RESOLVE_USE_GPU #cmakedefine RESOLVE_USE_CUDA @@ -14,3 +17,12 @@ // /// Date of build with the format "%Y-%m-%d" // #define RESOLVE_RELEASE_DATE "@RESOLVE_RELEASE_DATE@" + +#ifdef RESOLVE_USE_HIP +#ifndef __HIP_PLATFORM_AMD__ +#define __HIP_PLATFORM_AMD__ +#endif +#endif + + +#endif // __RESOLVE_DEFINITIONS_HPP__ \ No newline at end of file diff --git a/resolve/utilities/logger/CMakeLists.txt b/resolve/utilities/logger/CMakeLists.txt index 91b29dfc..29800942 100644 --- a/resolve/utilities/logger/CMakeLists.txt +++ b/resolve/utilities/logger/CMakeLists.txt @@ -17,8 +17,9 @@ set(Logger_HEADER_INSTALL # Build shared library ReSolve add_library(resolve_logger SHARED ${Logger_SRC}) -target_include_directories(resolve_logger INTERFACE - $ +target_include_directories(resolve_logger PUBLIC + $ + $ $ ) diff --git a/resolve/utilities/logger/Logger.cpp b/resolve/utilities/logger/Logger.cpp index 
f2448179..7369978f 100644 --- a/resolve/utilities/logger/Logger.cpp +++ b/resolve/utilities/logger/Logger.cpp @@ -59,7 +59,7 @@ namespace ReSolve */ void Logger::updateVerbosity(std::vector& output_streams) { - for (int i = NONE; i <= EVERYTHING; ++i) + for (std::size_t i = NONE; i <= EVERYTHING; ++i) { output_streams[i] = i > verbosity_ ? &nullstream_ : logger_; } diff --git a/resolve/vector/CMakeLists.txt b/resolve/vector/CMakeLists.txt index 16d53010..89b1abc8 100644 --- a/resolve/vector/CMakeLists.txt +++ b/resolve/vector/CMakeLists.txt @@ -18,6 +18,13 @@ set(Vector_CUDASDK_SRC VectorHandlerCuda.cpp ) +#and hip + +set(Vector_ROCM_SRC + VectorHandlerHip.cpp +) + + # Header files to be installed set(Vector_HEADER_INSTALL Vector.hpp @@ -30,6 +37,11 @@ if(RESOLVE_USE_CUDA) set(Vector_SRC ${Vector_SRC} ${Vector_CUDASDK_SRC}) endif() +# and hip +if(RESOLVE_USE_HIP) + set(Vector_SRC ${Vector_SRC} ${Vector_ROCM_SRC}) +endif() + add_library(resolve_vector SHARED ${Vector_SRC}) target_link_libraries(resolve_vector PRIVATE resolve_logger) @@ -38,6 +50,10 @@ if (RESOLVE_USE_CUDA) target_link_libraries(resolve_vector PUBLIC resolve_backend_cuda) endif() +if (RESOLVE_USE_HIP) + target_link_libraries(resolve_vector PUBLIC resolve_backend_hip) +endif() + # If no GPU is enabled link to dummy device backend if(NOT RESOLVE_USE_GPU) target_link_libraries(resolve_vector PUBLIC resolve_backend_cpu) diff --git a/resolve/vector/Vector.cpp b/resolve/vector/Vector.cpp index 7934e8b0..3b4f9e72 100644 --- a/resolve/vector/Vector.cpp +++ b/resolve/vector/Vector.cpp @@ -52,76 +52,75 @@ namespace ReSolve { namespace vector { return k_; } - void Vector::setData(real_type* data, std::string memspace) + void Vector::setData(real_type* data, memory::MemorySpace memspace) { - - if (memspace == "cpu") { - h_data_ = data; - cpu_updated_ = true; - gpu_updated_ = false; - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + h_data_ = data; + cpu_updated_ = true; + gpu_updated_ = false; + break; + case DEVICE: d_data_ = data; gpu_updated_ = true; cpu_updated_ = false; - } else { - //error - } + break; } } - void Vector::setDataUpdated(std::string memspace) + void Vector::setDataUpdated(memory::MemorySpace memspace) { - if (memspace == "cpu") { - cpu_updated_ = true; - gpu_updated_ = false; - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + cpu_updated_ = true; + gpu_updated_ = false; + break; + case DEVICE: gpu_updated_ = true; cpu_updated_ = false; - } else { - //error - } + break; } } - int Vector::update(real_type* data, std::string memspaceIn, std::string memspaceOut) + int Vector::update(real_type* data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cpu")){ control = 0;} - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 1;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 2;} - if ((memspaceIn == "cuda") && (memspaceOut == "cuda")){ control = 3;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::HOST)) { control = 0;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::DEVICE)){ control = 1;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST)) { control = 2;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::DEVICE)){ control = 3;} - if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ + if ((memspaceOut == memory::HOST) && (h_data_ == 
nullptr)) { //allocate first h_data_ = new real_type[n_ * k_]; } - if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ + if ((memspaceOut == memory::DEVICE) && (d_data_ == nullptr)) { //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } switch(control) { case 0: //cpu->cpu - std::memcpy(h_data_, data, (n_current_ * k_) * sizeof(real_type)); + mem_.copyArrayHostToHost(h_data_, data, n_current_ * k_); owns_cpu_data_ = true; cpu_updated_ = true; gpu_updated_ = false; break; - case 2: //cuda->cpu + case 2: //gpu->cpu mem_.copyArrayDeviceToHost(h_data_, data, n_current_ * k_); owns_gpu_data_ = true; cpu_updated_ = true; gpu_updated_ = false; break; - case 1: //cpu->cuda + case 1: //cpu->gpu mem_.copyArrayHostToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; cpu_updated_ = false; break; - case 3: //cuda->cuda + case 3: //gpu->gpu mem_.copyArrayDeviceToDevice(d_data_, data, n_current_ * k_); owns_gpu_data_ = true; gpu_updated_ = true; @@ -133,26 +132,27 @@ namespace ReSolve { namespace vector { return 0; } - real_type* Vector::getData(std::string memspace) + real_type* Vector::getData(memory::MemorySpace memspace) { return this->getData(0, memspace); } - real_type* Vector::getData(index_type i, std::string memspace) + real_type* Vector::getData(index_type i, memory::MemorySpace memspace) { - if ((memspace == "cpu") && (cpu_updated_ == false) && (gpu_updated_ == true )) { - copyData("cuda", "cpu"); + if ((memspace == memory::HOST) && (cpu_updated_ == false) && (gpu_updated_ == true )) { + // remember IN FIRST OUT SECOND!!! + copyData(memory::DEVICE, memspace); owns_cpu_data_ = true; } - if ((memspace == "cuda") && (gpu_updated_ == false) && (cpu_updated_ == true )) { - copyData("cpu", "cuda"); + if ((memspace == memory::DEVICE) && (gpu_updated_ == false) && (cpu_updated_ == true )) { + copyData(memory::HOST, memspace); owns_gpu_data_ = true; } - if (memspace == "cpu") { + if (memspace == memory::HOST) { return &h_data_[i * n_current_]; } else { - if (memspace == "cuda"){ + if (memspace == memory::DEVICE){ return &d_data_[i * n_current_]; } else { return nullptr; @@ -161,21 +161,20 @@ namespace ReSolve { namespace vector { } - int Vector::copyData(std::string memspaceIn, std::string memspaceOut) + int Vector::copyData(memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut) { int control=-1; - if ((memspaceIn == "cpu") && (memspaceOut == "cuda")){ control = 0;} - if ((memspaceIn == "cuda") && (memspaceOut == "cpu")){ control = 1;} + if ((memspaceIn == memory::HOST) && (memspaceOut == memory::DEVICE)){ control = 0;} + if ((memspaceIn == memory::DEVICE) && (memspaceOut == memory::HOST)) { control = 1;} - if ((memspaceOut == "cpu") && (h_data_ == nullptr)){ + if ((memspaceOut == memory::HOST) && (h_data_ == nullptr)) { //allocate first h_data_ = new real_type[n_ * k_]; } - if ((memspaceOut == "cuda") && (d_data_ == nullptr)){ + if ((memspaceOut == memory::DEVICE) && (d_data_ == nullptr)) { //allocate first mem_.allocateArrayOnDevice(&d_data_, n_ * k_); } - switch(control) { case 0: //cpu->cuda mem_.copyArrayHostToDevice(d_data_, h_data_, n_current_ * k_); @@ -193,108 +192,118 @@ namespace ReSolve { namespace vector { return 0; } - void Vector::allocate(std::string memspace) + void Vector::allocate(memory::MemorySpace memspace) { - if (memspace == "cpu") { - delete [] h_data_; - h_data_ = new real_type[n_ * k_]; - owns_cpu_data_ = true; - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: 
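+        // Illustrative usage (hypothetical caller; names assumed, mirroring this + // patch's string-to-enum migration): + //   vector::Vector v(n, k); + //   v.allocate(ReSolve::memory::HOST);    // host path below + //   v.allocate(ReSolve::memory::DEVICE);  // device path via mem_ 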
+ delete [] h_data_; + h_data_ = new real_type[n_ * k_]; + owns_cpu_data_ = true; + break; + case DEVICE: mem_.deleteOnDevice(d_data_); mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; - } + break; } } - void Vector::setToZero(std::string memspace) + void Vector::setToZero(memory::MemorySpace memspace) { - if (memspace == "cpu") { - if (h_data_ == nullptr) { - h_data_ = new real_type[n_ * k_]; - owns_cpu_data_ = true; - } - for (int i = 0; i < n_ * k_; ++i){ - h_data_[i] = 0.0; - } - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + if (h_data_ == nullptr) { + h_data_ = new real_type[n_ * k_]; + owns_cpu_data_ = true; + } + for (int i = 0; i < n_ * k_; ++i){ + h_data_[i] = 0.0; + } + break; + case DEVICE: if (d_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } mem_.setZeroArrayOnDevice(d_data_, n_ * k_); - } + break; } } - void Vector::setToZero(index_type j, std::string memspace) + void Vector::setToZero(index_type j, memory::MemorySpace memspace) { - if (memspace == "cpu") { - if (h_data_ == nullptr) { - h_data_ = new real_type[n_ * k_]; - owns_cpu_data_ = true; - } - for (int i = (n_current_) * j; i < n_current_ * (j + 1); ++i) { - h_data_[i] = 0.0; - } - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + if (h_data_ == nullptr) { + h_data_ = new real_type[n_ * k_]; + owns_cpu_data_ = true; + } + for (int i = (n_current_) * j; i < n_current_ * (j + 1); ++i) { + h_data_[i] = 0.0; + } + break; + case DEVICE: if (d_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } // TODO: We should not need to access raw data in this class mem_.setZeroArrayOnDevice(&d_data_[j * n_current_], n_current_); - } + break; } } - void Vector::setToConst(real_type C, std::string memspace) + void Vector::setToConst(real_type C, memory::MemorySpace memspace) { - if (memspace == "cpu") { - if (h_data_ == nullptr) { - h_data_ = new real_type[n_ * k_]; - owns_cpu_data_ = true; - } - for (int i = 0; i < n_ * k_; ++i){ - h_data_[i] = C; - } - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + if (h_data_ == nullptr) { + h_data_ = new real_type[n_ * k_]; + owns_cpu_data_ = true; + } + for (int i = 0; i < n_ * k_; ++i){ + h_data_[i] = C; + } + break; + case DEVICE: if (d_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } set_array_const(n_ * k_, C, d_data_); - } + break; } } - void Vector::setToConst(index_type j, real_type C, std::string memspace) + void Vector::setToConst(index_type j, real_type C, memory::MemorySpace memspace) { - if (memspace == "cpu") { - if (h_data_ == nullptr) { - h_data_ = new real_type[n_ * k_]; - owns_cpu_data_ = true; - } - for (int i = j * n_current_; i < (j + 1 ) * n_current_ * k_; ++i){ - h_data_[i] = C; - } - } else { - if (memspace == "cuda") { + using namespace ReSolve::memory; + switch (memspace) { + case HOST: + if (h_data_ == nullptr) { + h_data_ = new real_type[n_ * k_]; + owns_cpu_data_ = true; + } + for (int i = j * n_current_; i < (j + 1 ) * n_current_ * k_; ++i){ + h_data_[i] = C; + } + break; + case DEVICE: if (d_data_ == nullptr) { mem_.allocateArrayOnDevice(&d_data_, n_ * k_); owns_gpu_data_ = true; } set_array_const(n_current_ * 1, C, &d_data_[n_current_ * j]); - } + break; } } - real_type* Vector::getVectorData(index_type i, std::string memspace) + real_type* 
Vector::getVectorData(index_type i, memory::MemorySpace memspace) { if (this->k_ < i){ return nullptr; @@ -313,38 +322,38 @@ namespace ReSolve { namespace vector { } } - int Vector::deepCopyVectorData(real_type* dest, index_type i, std::string memspaceOut) + int Vector::deepCopyVectorData(real_type* dest, index_type i, memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; if (i > this->k_) { return -1; } else { real_type* data = this->getData(i, memspaceOut); - if (memspaceOut == "cpu") { - std::memcpy(dest, data, n_current_ * sizeof(real_type)); - } else { - if (memspaceOut == "cuda") { + switch (memspaceOut) { + case HOST: + mem_.copyArrayHostToHost(dest, data, n_current_); + break; + case DEVICE: mem_.copyArrayDeviceToDevice(dest, data, n_current_); - } else { - //error - } + break; } return 0; } } - int Vector::deepCopyVectorData(real_type* dest, std::string memspaceOut) + int Vector::deepCopyVectorData(real_type* dest, memory::MemorySpace memspaceOut) { + using namespace ReSolve::memory; real_type* data = this->getData(memspaceOut); - if (memspaceOut == "cpu") { - std::memcpy(dest, data, n_current_ * k_ * sizeof(real_type)); - } else { - if (memspaceOut == "cuda") { + switch (memspaceOut) { + case HOST: + mem_.copyArrayHostToHost(dest, data, n_current_ * k_); + break; + case DEVICE: mem_.copyArrayDeviceToDevice(dest, data, n_current_ * k_); - } else { - //error - } + break; } return 0; - } + }} // namespace ReSolve::vector diff --git a/resolve/vector/Vector.hpp b/resolve/vector/Vector.hpp index 9d1bd452..5f86ef7f 100644 --- a/resolve/vector/Vector.hpp +++ b/resolve/vector/Vector.hpp @@ -11,26 +11,26 @@ namespace ReSolve { namespace vector { Vector(index_type n, index_type k); ~Vector(); - int update(real_type* data, std::string memspaceIn, std::string memspaceOut); - real_type* getData(std::string memspace); - real_type* getData(index_type i, std::string memspace); // get pointer to i-th vector in multivector + int update(real_type* data, memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); + real_type* getData(memory::MemorySpace memspace); + real_type* getData(index_type i, memory::MemorySpace memspace); // get pointer to i-th vector in multivector index_type getSize(); index_type getCurrentSize(); index_type getNumVectors(); - void setDataUpdated(std::string memspace); - void setData(real_type* data, std::string memspace); - void allocate(std::string memspace); - void setToZero(std::string memspace); - void setToZero(index_type i, std::string memspace); // set i-th ivector to 0 - void setToConst(real_type C, std::string memspace); - void setToConst(index_type i, real_type C, std::string memspace); // set i-th vector to C - needed for unit tests, Gram Schmidt tests - int copyData(std::string memspaceIn, std::string memspaceOut); + void setDataUpdated(memory::MemorySpace memspace); + void setData(real_type* data, memory::MemorySpace memspace); + void allocate(memory::MemorySpace memspace); + void setToZero(memory::MemorySpace memspace); + void setToZero(index_type i, memory::MemorySpace memspace); // set i-th vector to 0 + void setToConst(real_type C, memory::MemorySpace memspace); + void setToConst(index_type i, real_type C, memory::MemorySpace memspace); // set i-th vector to C - needed for unit tests, Gram Schmidt tests + int copyData(memory::MemorySpace memspaceIn, memory::MemorySpace memspaceOut); int setCurrentSize(index_type new_n_current); - real_type* getVectorData(index_type i, std::string memspace); // get ith vector data out of multivector - 
int deepCopyVectorData(real_type* dest, index_type i, std::string memspace); - int deepCopyVectorData(real_type* dest, std::string memspace); //copy FULL multivector + real_type* getVectorData(index_type i, memory::MemorySpace memspace); // get ith vector data out of multivector + int deepCopyVectorData(real_type* dest, index_type i, memory::MemorySpace memspace); + int deepCopyVectorData(real_type* dest, memory::MemorySpace memspace); //copy FULL multivector private: index_type n_; ///< size diff --git a/resolve/vector/VectorHandler.cpp b/resolve/vector/VectorHandler.cpp index 8c89cb2f..f797f483 100644 --- a/resolve/vector/VectorHandler.cpp +++ b/resolve/vector/VectorHandler.cpp @@ -11,6 +11,9 @@ #ifdef RESOLVE_USE_CUDA #include #endif +#ifdef RESOLVE_USE_HIP +#include +#endif namespace ReSolve { using out = io::Logger; @@ -50,6 +53,21 @@ isCpuEnabled_ = true; } #endif +#ifdef RESOLVE_USE_HIP + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandler::VectorHandler(LinAlgWorkspaceHIP* new_workspace) + { + hipImpl_ = new VectorHandlerHip(new_workspace); + cpuImpl_ = new VectorHandlerCpu(); + + isHipEnabled_ = true; + isCpuEnabled_ = true; + } +#endif /** * @brief destructor @@ -64,7 +82,7 @@ * * @param[in] x The first vector * @param[in] y The second vector - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containing memspace (cpu or cuda or hip) * * @return dot product (real number) of _x_ and _y_ */ @@ -74,7 +92,9 @@ if (memspace == "cuda" ) { return cudaImpl_->dot(x, y); } else { - if (memspace == "cpu") { + if (memspace == "hip") { + return hipImpl_->dot(x, y); + } else if (memspace == "cpu") { return cpuImpl_->dot(x, y); } else { out::error() << "Not implemented (yet)" << std::endl; @@ -88,13 +108,15 @@ * * @param[in] alpha The constant * @param[in,out] x The vector - * @param memspace string containg memspace (cpu or cuda) + * @param memspace string containing memspace (cpu or cuda or hip) * */ void VectorHandler::scal(const real_type* alpha, vector::Vector* x, std::string memspace) { if (memspace == "cuda" ) { cudaImpl_->scal(alpha, x); + } else if (memspace == "hip") { + hipImpl_->scal(alpha, x); } else { if (memspace == "cpu") { cpuImpl_->scal(alpha, x); @@ -110,7 +132,7 @@ * @param[in] alpha The constant * @param[in] x The first vector * @param[in,out] y The second vector (result is return in y) - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containing memspace (cpu or cuda or hip) * */ void VectorHandler::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y, std::string memspace) @@ -119,10 +141,14 @@ if (memspace == "cuda" ) { cudaImpl_->axpy(alpha, x, y); } else { - if (memspace == "cpu") { - cpuImpl_->axpy(alpha, x, y); + if (memspace == "hip" ) { + hipImpl_->axpy(alpha, x, y); } else { - out::error() <<"Not implemented (yet)" << std::endl; + if (memspace == "cpu") { + cpuImpl_->axpy(alpha, x, y); + } else { + out::error() <<"Not implemented (yet)" << std::endl; + } } } } @@ -139,7 +165,7 @@ * @param[in] V Multivector containing the matrix, organized columnwise * @param[in] y Vector, k x 1 if N and n x 1 if T * @param[in,out] x Vector, n x 1 if N and k x 1 if T - * @param[in] memspace cpu or cuda (for now) + * @param[in] memspace cpu or cuda or hip (for now) * * @pre V is stored colum-wise, _n_ > 0, 
_k_ > 0 * @@ -148,6 +174,8 @@ { if (memspace == "cuda") { cudaImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); + } else if (memspace == "hip") { + hipImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else if (memspace == "cpu") { cpuImpl_->gemv(transpose, n, k, alpha, beta, V, y, x); } else { @@ -162,7 +190,7 @@ * @param[in] alpha vector size k x 1 * @param[in] x (multi)vector size size x k * @param[in,out] y vector size size x 1 (this is where the result is stored) - * @param[in] memspace string containg memspace (cpu or cuda) + * @param[in] memspace string containing memspace (cpu or cuda or hip) * * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() * @@ -172,6 +200,8 @@ using namespace constants; if (memspace == "cuda") { cudaImpl_->massAxpy(size, alpha, k, x, y); + } else if (memspace == "hip") { + hipImpl_->massAxpy(size, alpha, k, x, y); } else if (memspace == "cpu") { cpuImpl_->massAxpy(size, alpha, k, x, y); } else { @@ -188,7 +218,7 @@ * @param[in] k Number of vectors in V * @param[in] x Multivector; 2 vectors size n x 1 each * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) - * @param[in] memspace String containg memspace (cpu or cuda) + * @param[in] memspace String containing memspace (cpu or cuda or hip) * * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated * @@ -197,6 +227,8 @@ { if (memspace == "cuda") { cudaImpl_->massDot2Vec(size, V, k, x, res); + } else if (memspace == "hip") { + hipImpl_->massDot2Vec(size, V, k, x, res); } else if (memspace == "cpu") { cpuImpl_->massDot2Vec(size, V, k, x, res); } else { diff --git a/resolve/vector/VectorHandler.hpp b/resolve/vector/VectorHandler.hpp index c17d4688..02d426b5 100644 --- a/resolve/vector/VectorHandler.hpp +++ b/resolve/vector/VectorHandler.hpp @@ -10,6 +10,7 @@ namespace ReSolve class VectorHandlerImpl; class LinAlgWorkspaceCpu; class LinAlgWorkspaceCUDA; + class LinAlgWorkspaceHIP; } @@ -19,6 +20,7 @@ namespace ReSolve { //namespace vector { VectorHandler(); VectorHandler(LinAlgWorkspaceCpu* new_workspace); VectorHandler(LinAlgWorkspaceCUDA* new_workspace); + VectorHandler(LinAlgWorkspaceHIP* new_workspace); ~VectorHandler(); //y = alpha x + y @@ -55,9 +57,11 @@ private: VectorHandlerImpl* cpuImpl_{nullptr}; VectorHandlerImpl* cudaImpl_{nullptr}; + VectorHandlerImpl* hipImpl_{nullptr}; bool isCpuEnabled_{false}; bool isCudaEnabled_{false}; + bool isHipEnabled_{false}; }; } //} // namespace ReSolve::vector diff --git a/resolve/vector/VectorHandlerCpu.cpp b/resolve/vector/VectorHandlerCpu.cpp index f5cc463d..a8317a89 100644 --- a/resolve/vector/VectorHandlerCpu.cpp +++ b/resolve/vector/VectorHandlerCpu.cpp @@ -47,8 +47,8 @@ namespace ReSolve { real_type VectorHandlerCpu::dot(vector::Vector* x, vector::Vector* y) { - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); + real_type* x_data = x->getData(memory::HOST); + real_type* y_data = y->getData(memory::HOST); real_type sum = 0.0; real_type c = 0.0; // real_type t, y; @@ -72,7 +72,7 @@ */ void VectorHandlerCpu::scal(const real_type* alpha, vector::Vector* x) { - real_type* x_data = x->getData("cpu"); + real_type* x_data = x->getData(memory::HOST); for (int i = 0; i < x->getSize(); ++i){ x_data[i] *= (*alpha); @@ -91,8 +91,8 @@ void VectorHandlerCpu::axpy(const real_type* alpha, vector::Vector* x, 
vector::Vector* y) { //AXPY: y = alpha * x + y - real_type* x_data = x->getData("cpu"); - real_type* y_data = y->getData("cpu"); + real_type* x_data = x->getData(memory::HOST); + real_type* y_data = y->getData(memory::HOST); for (int i = 0; i < x->getSize(); ++i) { y_data[i] = (*alpha) * x_data[i] + y_data[i]; } diff --git a/resolve/vector/VectorHandlerCuda.cpp b/resolve/vector/VectorHandlerCuda.cpp index 3c887e85..5871fd5a 100644 --- a/resolve/vector/VectorHandlerCuda.cpp +++ b/resolve/vector/VectorHandlerCuda.cpp @@ -50,7 +50,7 @@ namespace ReSolve { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); double nrm = 0.0; - cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData("cuda"), 1, y->getData("cuda"), 1, &nrm); + cublasStatus_t st= cublasDdot (handle_cublas, x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm); if (st!=0) {printf("dot product crashed with code %d \n", st);} return nrm; } @@ -67,7 +67,7 @@ namespace ReSolve { { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); - cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData("cuda"), 1); + cublasStatus_t st = cublasDscal(handle_cublas, x->getSize(), alpha, x->getData(memory::DEVICE), 1); if (st!=0) { ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; } @@ -90,9 +90,9 @@ namespace ReSolve { cublasDaxpy(handle_cublas, x->getSize(), alpha, - x->getData("cuda"), + x->getData(memory::DEVICE), 1, - y->getData("cuda"), + y->getData(memory::DEVICE), 1); } @@ -131,12 +131,12 @@ namespace ReSolve { n, k, alpha, - V->getData("cuda"), + V->getData(memory::DEVICE), n, - y->getData("cuda"), + y->getData(memory::DEVICE), 1, beta, - x->getData("cuda"), + x->getData(memory::DEVICE), 1); } else { @@ -145,12 +145,12 @@ namespace ReSolve { n, k, alpha, - V->getData("cuda"), + V->getData(memory::DEVICE), n, - y->getData("cuda"), + y->getData(memory::DEVICE), 1, beta, - x->getData("cuda"), + x->getData(memory::DEVICE), 1); } } @@ -171,7 +171,7 @@ namespace ReSolve { { using namespace constants; if (k < 200) { - mass_axpy(size, k, x->getData("cuda"), y->getData("cuda"),alpha->getData("cuda")); + mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE),alpha->getData(memory::DEVICE)); } else { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); @@ -182,12 +182,12 @@ namespace ReSolve { 1, // n k + 1, // k &MINUSONE, // alpha - x->getData("cuda"), // A + x->getData(memory::DEVICE), // A size, // lda - alpha->getData("cuda"), // B + alpha->getData(memory::DEVICE), // B k + 1, // ldb &ONE, - y->getData("cuda"), // c + y->getData(memory::DEVICE), // c size); // ldc } } @@ -212,7 +212,7 @@ namespace ReSolve { using namespace constants; if (k < 200) { - mass_inner_product_two_vectors(size, k, x->getData("cuda") , x->getData(1, "cuda"), V->getData("cuda"), res->getData("cuda")); + mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE) , x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE)); } else { LinAlgWorkspaceCUDA* workspaceCUDA = workspace_; cublasHandle_t handle_cublas = workspaceCUDA->getCublasHandle(); @@ -223,12 +223,12 @@ namespace ReSolve { 2, //n size, //k &ONE, //alpha - V->getData("cuda"), //A + V->getData(memory::DEVICE), //A size, //lda - x->getData("cuda"), //B + x->getData(memory::DEVICE), //B size, 
//ldb &ZERO, - res->getData("cuda"), //c + res->getData(memory::DEVICE), //c k + 1); //ldc } } diff --git a/resolve/vector/VectorHandlerHip.cpp b/resolve/vector/VectorHandlerHip.cpp new file mode 100644 index 00000000..1e1195fc --- /dev/null +++ b/resolve/vector/VectorHandlerHip.cpp @@ -0,0 +1,236 @@ +#include + +#include +#include +#include +#include +#include +#include "VectorHandlerHip.hpp" + +namespace ReSolve { + using out = io::Logger; + + /** + * @brief empty constructor that does absolutely nothing + */ + VectorHandlerHip::VectorHandlerHip() + { + } + + /** + * @brief constructor + * + * @param new_workspace - workspace to be set + */ + VectorHandlerHip::VectorHandlerHip(LinAlgWorkspaceHIP* new_workspace) + { + workspace_ = new_workspace; + } + + /** + * @brief destructor + */ + VectorHandlerHip::~VectorHandlerHip() + { + //delete the workspace TODO + } + + /** + * @brief dot product of two vectors i.e., a = x^Ty + * + * @param[in] x The first vector + * @param[in] y The second vector + * @param[in] memspace String containing memspace (cpu or hip) + * + * @return dot product (real number) of _x_ and _y_ + */ + + real_type VectorHandlerHip::dot(vector::Vector* x, vector::Vector* y) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + double nrm = 0.0; + rocblas_status st = rocblas_ddot(handle_rocblas, x->getSize(), x->getData(memory::DEVICE), 1, y->getData(memory::DEVICE), 1, &nrm); + if (st!=0) {printf("dot product crashed with code %d \n", st);} + return nrm; + } + + /** + * @brief scale a vector by a constant i.e., x = alpha*x where alpha is a constant + * + * @param[in] alpha The constant + * @param[in,out] x The vector + * @param memspace string containing memspace (cpu or hip) + * + */ + void VectorHandlerHip::scal(const real_type* alpha, vector::Vector* x) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_status st = rocblas_dscal(handle_rocblas, x->getSize(), alpha, x->getData(memory::DEVICE), 1); + if (st!=0) { + ReSolve::io::Logger::error() << "scal crashed with code " << st << "\n"; + } + } + + /** + * @brief axpy i.e., y = alpha*x+y where alpha is a constant + * + * @param[in] alpha The constant + * @param[in] x The first vector + * @param[in,out] y The second vector (result is returned in y) + * @param[in] memspace String containing memspace (cpu or hip) + * + */ + void VectorHandlerHip::axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y) + { + //AXPY: y = alpha * x + y + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_daxpy(handle_rocblas, + x->getSize(), + alpha, + x->getData(memory::DEVICE), + 1, + y->getData(memory::DEVICE), + 1); + } + + /** + * @brief gemv computes matrix-vector product where both matrix and vectors are dense. 
+ * i.e., x = beta*x + alpha*V*y + * + * @param[in] transpose - yes (T) or no (N) + * @param[in] n Number of rows in (non-transposed) matrix + * @param[in] k Number of columns in (non-transposed) matrix + * @param[in] alpha Constant real number + * @param[in] beta Constant real number + * @param[in] V Multivector containing the matrix, organized columnwise + * @param[in] y Vector, k x 1 if N and n x 1 if T + * @param[in,out] x Vector, n x 1 if N and k x 1 if T + * @param[in] memspace cpu or hip (for now) + * + * @pre V is stored column-wise, _n_ > 0, _k_ > 0 + * + */ + void VectorHandlerHip::gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x) + { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + if (transpose == "T") { + + rocblas_dgemv(handle_rocblas, + rocblas_operation_transpose, + n, + k, + alpha, + V->getData(memory::DEVICE), + n, + y->getData(memory::DEVICE), + 1, + beta, + x->getData(memory::DEVICE), + 1); + + } else { + rocblas_dgemv(handle_rocblas, + rocblas_operation_none, + n, + k, + alpha, + V->getData(memory::DEVICE), + n, + y->getData(memory::DEVICE), + 1, + beta, + x->getData(memory::DEVICE), + 1); + } + } + + /** + * @brief mass (bulk) axpy i.e., y = y - x*alpha where alpha is a vector + * + * @param[in] size number of elements in y + * @param[in] alpha vector size k x 1 + * @param[in] x (multi)vector size size x k + * @param[in,out] y vector size size x 1 (this is where the result is stored) + * @param[in] memspace string containing memspace (cpu or hip) + * + * @pre _k_ > 0, _size_ > 0, _size_ = x->getSize() + * + */ + void VectorHandlerHip::massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y) + { + using namespace constants; + if (k < 200) { + mass_axpy(size, k, x->getData(memory::DEVICE), y->getData(memory::DEVICE), alpha->getData(memory::DEVICE)); + } else { + LinAlgWorkspaceHIP* workspaceHIP = 
workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_dgemm(handle_rocblas, + rocblas_operation_none, + rocblas_operation_none, + size, // m + 1, // n + k, // k + &MINUSONE, // alpha + x->getData(memory::DEVICE), // A + size, // lda + alpha->getData(memory::DEVICE), // B + k, // ldb + &ONE, + y->getData(memory::DEVICE), // c + size); // ldc + } + } + + /** + * @brief mass (bulk) dot product i.e., V^T x, where V is n x k dense multivector + * (a dense multivector consisting of k vectors size n) and x is k x 2 dense + * multivector (a multivector consisting of two vectors size n each) + * + * @param[in] size Number of elements in a single vector in V + * @param[in] V Multivector; k vectors size n x 1 each + * @param[in] k Number of vectors in V + * @param[in] x Multivector; 2 vectors size n x 1 each + * @param[out] res Multivector; 2 vectors size k x 1 each (result is returned in res) + * @param[in] memspace String containing memspace (cpu or hip) + * + * @pre _size_ > 0, _k_ > 0, size = x->getSize(), _res_ needs to be allocated + * + */ + void VectorHandlerHip::massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res) + { + using namespace constants; + + if (k < 200) { + mass_inner_product_two_vectors(size, k, x->getData(memory::DEVICE), x->getData(1, memory::DEVICE), V->getData(memory::DEVICE), res->getData(memory::DEVICE)); + } else { + LinAlgWorkspaceHIP* workspaceHIP = workspace_; + rocblas_handle handle_rocblas = workspaceHIP->getRocblasHandle(); + rocblas_dgemm(handle_rocblas, + rocblas_operation_transpose, + rocblas_operation_none, + k + 1, //m + 2, //n + size, //k + &ONE, //alpha + V->getData(memory::DEVICE), //A + size, //lda + x->getData(memory::DEVICE), //B + size, //ldb + &ZERO, + res->getData(memory::DEVICE), //c + k + 1); //ldc + } + } + +} // namespace ReSolve diff --git a/resolve/vector/VectorHandlerHip.hpp b/resolve/vector/VectorHandlerHip.hpp new file mode 100644 index 00000000..7e5085e3 --- /dev/null +++ b/resolve/vector/VectorHandlerHip.hpp @@ -0,0 +1,57 @@ +#pragma once +#include + +namespace ReSolve +{ + namespace vector + { + class Vector; + } + class LinAlgWorkspaceHIP; + class VectorHandlerImpl; +} + + +namespace ReSolve { //namespace vector { + class VectorHandlerHip : public VectorHandlerImpl + { + public: + VectorHandlerHip(); + VectorHandlerHip(LinAlgWorkspaceHIP* workspace); + virtual ~VectorHandlerHip(); + + //y = alpha x + y + virtual void axpy(const real_type* alpha, vector::Vector* x, vector::Vector* y); + + //dot: x \cdot y + virtual real_type dot(vector::Vector* x, vector::Vector* y); + + //scal = alpha * x + virtual void scal(const real_type* alpha, vector::Vector* x); + + //mass axpy: x*alpha + y where x is [n x k] and alpha is [k x 1]; x is stored columnwise + virtual void massAxpy(index_type size, vector::Vector* alpha, index_type k, vector::Vector* x, vector::Vector* y); + + //mass dot: V^T x, where V is [n x k] and x is [k x 2], everything is stored and returned columnwise + //Size = n + virtual void massDot2Vec(index_type size, vector::Vector* V, index_type k, vector::Vector* x, vector::Vector* res); + + /** gemv: + * if `transpose = N` (no), `x = beta*x + alpha*V*y`, + * where `x` is `[n x 1]`, `V` is `[n x k]` and `y` is `[k x 1]`. + * if `transpose = T` (yes), `x = beta*x + alpha*V^T*y`, + * where `x` is `[k x 1]`, `V` is `[n x k]` and `y` is `[n x 1]`. 
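+   * Worked shape example (illustrative only, not part of the interface): with + * n = 3, k = 2 and V = [v1 v2] stored column-wise, `transpose = N` computes + * x (3 x 1) = beta*x + alpha*(y[0]*v1 + y[1]*v2), while `transpose = T` computes + * x (2 x 1) = beta*x + alpha*V^T*y. 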
+ */ + virtual void gemv(std::string transpose, + index_type n, + index_type k, + const real_type* alpha, + const real_type* beta, + vector::Vector* V, + vector::Vector* y, + vector::Vector* x); + private: + LinAlgWorkspaceHIP* workspace_; + }; + +} //} // namespace ReSolve::vector diff --git a/resolve/workspace/CMakeLists.txt b/resolve/workspace/CMakeLists.txt index 673fac4b..a44f74f8 100644 --- a/resolve/workspace/CMakeLists.txt +++ b/resolve/workspace/CMakeLists.txt @@ -16,10 +16,15 @@ set(ReSolve_Workspace_CUDASDK_SRC LinAlgWorkspaceCUDA.cpp ) +set(ReSolve_Workspace_ROCM_SRC + LinAlgWorkspaceHIP.cpp +) + set(ReSolve_Workspace_HEADER_INSTALL LinAlgWorkspace.hpp LinAlgWorkspaceCpu.hpp LinAlgWorkspaceCUDA.hpp + LinAlgWorkspaceHIP.hpp ) # If cuda is enabled, add CUDA SDK workspace files @@ -27,6 +32,10 @@ if(RESOLVE_USE_CUDA) set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_CUDASDK_SRC}) endif() +if(RESOLVE_USE_HIP) + set(ReSolve_Workspace_SRC ${ReSolve_Workspace_SRC} ${ReSolve_Workspace_ROCM_SRC}) +endif() + add_library(resolve_workspace SHARED ${ReSolve_Workspace_SRC}) # If CUDA is enabled, link to ReSolve CUDA backend @@ -34,9 +43,14 @@ if(RESOLVE_USE_CUDA) target_link_libraries(resolve_workspace PUBLIC resolve_backend_cuda) endif(RESOLVE_USE_CUDA) -target_include_directories(resolve_workspace INTERFACE - $ - $ +if(RESOLVE_USE_HIP) + target_link_libraries(resolve_workspace PUBLIC resolve_backend_hip) +endif(RESOLVE_USE_HIP) + +target_include_directories(resolve_workspace PUBLIC + $ + $ + $ ) # install include headers diff --git a/resolve/workspace/LinAlgWorkspace.hpp b/resolve/workspace/LinAlgWorkspace.hpp index 6da58fda..4efe834e 100644 --- a/resolve/workspace/LinAlgWorkspace.hpp +++ b/resolve/workspace/LinAlgWorkspace.hpp @@ -6,3 +6,7 @@ #include #endif +#ifdef RESOLVE_USE_HIP +#include +#endif + diff --git a/resolve/workspace/LinAlgWorkspaceCpu.cpp b/resolve/workspace/LinAlgWorkspaceCpu.cpp index 3ed9aa43..c0f25248 100644 --- a/resolve/workspace/LinAlgWorkspaceCpu.cpp +++ b/resolve/workspace/LinAlgWorkspaceCpu.cpp @@ -1,3 +1,4 @@ +#include #include "LinAlgWorkspaceCpu.hpp" namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceCpu.hpp b/resolve/workspace/LinAlgWorkspaceCpu.hpp index 00e5f38e..3c056b73 100644 --- a/resolve/workspace/LinAlgWorkspaceCpu.hpp +++ b/resolve/workspace/LinAlgWorkspaceCpu.hpp @@ -12,7 +12,7 @@ namespace ReSolve ~LinAlgWorkspaceCpu(); void initializeHandles(); private: - MemoryHandler mem_; + // MemoryHandler mem_; ///< Memory handler not needed for now }; } diff --git a/resolve/workspace/LinAlgWorkspaceHIP.cpp b/resolve/workspace/LinAlgWorkspaceHIP.cpp new file mode 100644 index 00000000..e64dff17 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceHIP.cpp @@ -0,0 +1,75 @@ +#include + +namespace ReSolve +{ + LinAlgWorkspaceHIP::LinAlgWorkspaceHIP() + { + handle_rocsparse_ = nullptr; + handle_rocblas_ = nullptr; + + matvec_setup_done_ = false; + } + + LinAlgWorkspaceHIP::~LinAlgWorkspaceHIP() + { + rocsparse_destroy_handle(handle_rocsparse_); + rocblas_destroy_handle(handle_rocblas_); + rocsparse_destroy_mat_descr(mat_A_); + } + + rocsparse_handle LinAlgWorkspaceHIP::getRocsparseHandle() + { + return handle_rocsparse_; + } + + void LinAlgWorkspaceHIP::setRocsparseHandle(rocsparse_handle handle) + { + handle_rocsparse_ = handle; + } + + rocblas_handle LinAlgWorkspaceHIP::getRocblasHandle() + { + return handle_rocblas_; + } + + void LinAlgWorkspaceHIP::setRocblasHandle(rocblas_handle handle) + { + handle_rocblas_ = handle; + } + + 
rocsparse_mat_descr LinAlgWorkspaceHIP::getSpmvMatrixDescriptor() + { + return mat_A_; + } + + void LinAlgWorkspaceHIP::setSpmvMatrixDescriptor(rocsparse_mat_descr mat) + { + mat_A_ = mat; + } + + rocsparse_mat_info LinAlgWorkspaceHIP::getSpmvMatrixInfo() + { + return info_A_; + } + + void LinAlgWorkspaceHIP::setSpmvMatrixInfo(rocsparse_mat_info info) + { + info_A_ = info; + } + + bool LinAlgWorkspaceHIP::matvecSetup() + { + return matvec_setup_done_; + } + + void LinAlgWorkspaceHIP::matvecSetupDone() + { + matvec_setup_done_ = true; + } + + void LinAlgWorkspaceHIP::initializeHandles() + { + rocsparse_create_handle(&handle_rocsparse_); + rocblas_create_handle(&handle_rocblas_); + } + } // namespace ReSolve diff --git a/resolve/workspace/LinAlgWorkspaceHIP.hpp b/resolve/workspace/LinAlgWorkspaceHIP.hpp new file mode 100644 index 00000000..abdc3e41 --- /dev/null +++ b/resolve/workspace/LinAlgWorkspaceHIP.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include + +#include + +namespace ReSolve +{ + class LinAlgWorkspaceHIP + { + public: + LinAlgWorkspaceHIP(); + ~LinAlgWorkspaceHIP(); + + rocblas_handle getRocblasHandle(); + rocsparse_handle getRocsparseHandle(); + rocsparse_mat_descr getSpmvMatrixDescriptor(); + rocsparse_mat_info getSpmvMatrixInfo(); + + void setRocblasHandle(rocblas_handle handle); + void setRocsparseHandle(rocsparse_handle handle); + void setSpmvMatrixDescriptor(rocsparse_mat_descr mat); + void setSpmvMatrixInfo(rocsparse_mat_info info); + + void initializeHandles(); + + bool matvecSetup(); + void matvecSetupDone(); + + private: + //handles + rocblas_handle handle_rocblas_; + rocsparse_handle handle_rocsparse_; + + //matrix descriptors + rocsparse_mat_descr mat_A_; + + //vector descriptors not needed, rocsparse uses RAW pointers. 
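+  // Hypothetical lifetime sketch (assumed usage, mirroring the CUDA workspace): + //   ReSolve::LinAlgWorkspaceHIP ws; + //   ws.initializeHandles();         // rocsparse_create_handle / rocblas_create_handle + //   ReSolve::VectorHandler vh(&ws); // handlers borrow the rocBLAS handle + //   // ~LinAlgWorkspaceHIP() destroys the handles and the spmv matrix descriptor 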
+ + //buffers + // there is no buffer needed in matvec + bool matvec_setup_done_; //check if setup is done for matvec (note: no buffer but there is analysis) + + //info - but we do need the matrix info + rocsparse_mat_info info_A_; + + // MemoryHandler mem_; ///< Memory handler not needed for now + }; + +} // namespace ReSolve diff --git a/tests/functionality/CMakeLists.txt b/tests/functionality/CMakeLists.txt index a6652c26..85b47fd7 100644 --- a/tests/functionality/CMakeLists.txt +++ b/tests/functionality/CMakeLists.txt @@ -26,6 +26,19 @@ if(RESOLVE_USE_CUDA) endif(RESOLVE_USE_CUDA) + +if(RESOLVE_USE_HIP) + + # Build KLU+rocsolver test + add_executable(rocsolver_rf_test.exe testKLU_RocSolver.cpp) + target_link_libraries(rocsolver_rf_test.exe PRIVATE ReSolve) + + # And another one to test FGMRES version + add_executable(rocsolver_rf_fgmres_test.exe testKLU_RocSolver_FGMRES.cpp) + target_link_libraries(rocsolver_rf_fgmres_test.exe PRIVATE ReSolve) + +endif(RESOLVE_USE_HIP) + # Install tests set(installable_tests klu_klu_test.exe) @@ -36,6 +49,12 @@ if(RESOLVE_USE_CUDA) klu_glu_test.exe) endif(RESOLVE_USE_CUDA) +if(RESOLVE_USE_HIP) + set(installable_tests ${installable_tests} + rocsolver_rf_test.exe + rocsolver_rf_fgmres_test.exe) +endif(RESOLVE_USE_HIP) + install(TARGETS ${installable_tests} RUNTIME DESTINATION bin/resolve/tests/functionality) @@ -50,3 +69,8 @@ if(RESOLVE_USE_CUDA) add_test(NAME klu_rf_fgmres_test COMMAND $ "${test_data_dir}") add_test(NAME klu_glu_test COMMAND $ "${test_data_dir}") endif(RESOLVE_USE_CUDA) + +if(RESOLVE_USE_HIP) + add_test(NAME rocsolver_rf_test COMMAND $ "${test_data_dir}") + add_test(NAME rocsolver_rf_fgmres_test COMMAND $ "${test_data_dir}") +endif(RESOLVE_USE_HIP) diff --git a/tests/functionality/testKLU.cpp b/tests/functionality/testKLU.cpp index f3c1da57..083c11d1 100644 --- a/tests/functionality/testKLU.cpp +++ b/tests/functionality/testKLU.cpp @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); vector_type* vec_r = new vector_type(A->getNumRows()); @@ -74,8 +74,8 @@ // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -100,11 +100,11 @@ x_data[i] = 1.0; } - vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cpu"); - vec_diff->update(x_data, "cpu", "cpu"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::HOST); - // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cpu")); + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::HOST)); matrix_handler->setValuesChanged(true, "cpu"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cpu"); error_sum += status; @@ -123,13 +123,13 @@ real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using 
exact solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -165,7 +165,7 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); // and solve it too status = KLU->refactorize(); @@ -174,7 +174,7 @@ int main(int argc, char *argv[]) status = KLU->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); matrix_handler->setValuesChanged(true, "cpu"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cpu"); @@ -185,13 +185,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cpu")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cpu"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::HOST); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cpu"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cpu")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cpu"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cpu")); @@ -215,7 +215,8 @@ int main(int argc, char *argv[]) //now DELETE delete A; delete KLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete matrix_handler; diff --git a/tests/functionality/testKLU_GLU.cpp b/tests/functionality/testKLU_GLU.cpp index 0e9bb4bd..702141ec 100644 --- a/tests/functionality/testKLU_GLU.cpp +++ b/tests/functionality/testKLU_GLU.cpp @@ -72,18 +72,18 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); - vec_x->allocate("cpu");//for KLU - vec_x->allocate("cuda"); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); vector_type* vec_r = new vector_type(A->getNumRows()); rhs1_file.close(); // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -106,7 +106,7 @@ int main(int argc, char *argv[]) status = GLU->setup(A, L, U, P, Q); error_sum += status; std::cout<<"GLU setup status: "<update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = GLU->solve(vec_rhs, vec_x); error_sum += status; 
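+  // Convention sketch for the new signature (illustrative; `src` stands for any raw array): + //   vec->update(src, ReSolve::memory::HOST, ReSolve::memory::DEVICE); // src lives on host, copy lands on device + //   vec->update(src, ReSolve::memory::DEVICE, ReSolve::memory::HOST); // src lives on device, copy lands on host + //   memspaceIn names where `src` resides, memspaceOut where the vector stores the copy. 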
std::cout<<"GLU solve status: "<setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); matrix_handler->setValuesChanged(true, "cuda"); @@ -145,13 +145,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_x->update(vec_x->getData("cuda"), "cuda", "cpu"); + vec_x->update(vec_x->getData(ReSolve::memory::DEVICE), ReSolve::memory::DEVICE, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -188,7 +188,7 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = GLU->refactorize(); error_sum += status; @@ -197,7 +197,7 @@ int main(int argc, char *argv[]) status = GLU->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); @@ -208,13 +208,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); @@ -239,7 +239,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete GLU; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/tests/functionality/testKLU_Rf.cpp b/tests/functionality/testKLU_Rf.cpp index 729968f5..a136017e 100644 --- a/tests/functionality/testKLU_Rf.cpp +++ b/tests/functionality/testKLU_Rf.cpp @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); vector_type* vec_r = new vector_type(A->getNumRows()); @@ -80,8 +80,8 @@ int main(int argc, char *argv[]) // Convert first matrix to CSR format 
matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -106,9 +106,9 @@ int main(int argc, char *argv[]) x_data[i] = 1.0; } - vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); matrix_handler->setValuesChanged(true, "cuda"); @@ -129,13 +129,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -186,7 +186,7 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = Rf->refactorize(); error_sum += status; @@ -194,7 +194,7 @@ int main(int argc, char *argv[]) status = Rf->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "cuda"); @@ -205,13 +205,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); @@ -243,7 +243,8 @@ int main(int argc, char *argv[]) delete A; delete KLU; delete Rf; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/tests/functionality/testKLU_Rf_FGMRES.cpp b/tests/functionality/testKLU_Rf_FGMRES.cpp index a474e406..2e582e02 100644 --- a/tests/functionality/testKLU_Rf_FGMRES.cpp +++ b/tests/functionality/testKLU_Rf_FGMRES.cpp @@ -77,7 +77,7 @@ int main(int argc, char *argv[]) return -1; } real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); - real_type* x = new real_type[A->getNumRows()]; + 
real_type* x = new real_type[A->getNumRows()]; vector_type* vec_rhs = new vector_type(A->getNumRows()); vector_type* vec_x = new vector_type(A->getNumRows()); vector_type* vec_r = new vector_type(A->getNumRows()); @@ -85,8 +85,8 @@ int main(int argc, char *argv[]) // Convert first matrix to CSR format matrix_handler->coo2csr(A_coo, A, "cpu"); - vec_rhs->update(rhs, "cpu", "cpu"); - vec_rhs->setDataUpdated("cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); // Solve the first system using KLU status = KLU->setup(A); @@ -112,11 +112,11 @@ int main(int argc, char *argv[]) x_data[i] = 1.0; } - vec_test->setData(x_data, "cpu"); - vec_r->update(rhs, "cpu", "cuda"); - vec_diff->update(x_data, "cpu", "cuda"); + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); - // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "cuda")); + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::DEVICE)); matrix_handler->setValuesChanged(true, "cuda"); //evaluate the residual ||b-Ax|| status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","cuda"); @@ -136,13 +136,13 @@ int main(int argc, char *argv[]) real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); //evaluate the residual ON THE CPU using COMPUTED solution - vec_r->update(rhs, "cpu", "cpu"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); error_sum += status; @@ -202,25 +202,25 @@ int main(int argc, char *argv[]) rhs2_file.close(); matrix_handler->coo2csr(A_coo, A, "cuda"); - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); Rf->setNumericalProperties(1e-12, 1e-1); status = Rf->refactorize(); error_sum += status; - vec_x->update(rhs, "cpu", "cuda"); + vec_x->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = Rf->solve(vec_x); error_sum += status; FGMRES->resetMatrix(A); - status = FGMRES->setupPreconditioner("CuSolverRf", Rf); + status = FGMRES->setupPreconditioner("LU", Rf); error_sum += status; - vec_rhs->update(rhs, "cpu", "cuda"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = FGMRES->solve(vec_rhs, vec_x); error_sum += status; - vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); matrix_handler->setValuesChanged(true, "cuda"); //evaluate final residual @@ -233,13 +233,13 @@ int main(int argc, char *argv[]) //for testing only - control real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "cuda")); //compute x-x_true - vec_diff->update(x_data, "cpu", "cuda"); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "cuda"); //evaluate its norm real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "cuda")); //compute the residual using exact solution - 
vec_r->update(rhs, "cpu", "cuda"); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "cuda"); error_sum += status; real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "cuda")); @@ -264,7 +264,8 @@ delete GS; delete FGMRES; delete Rf; - delete x; + delete [] x; + delete [] rhs; delete vec_r; delete vec_x; delete workspace_CUDA; diff --git a/tests/functionality/testKLU_RocSolver.cpp b/tests/functionality/testKLU_RocSolver.cpp new file mode 100644 index 00000000..9fd43ac1 --- /dev/null +++ b/tests/functionality/testKLU_RocSolver.cpp @@ -0,0 +1,251 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +//author: KS +//functionality test to check whether rocsolver_rf works correctly. + +using namespace ReSolve::constants; + +int main(int argc, char *argv[]) +{ + // Use ReSolve data types. + using index_type = ReSolve::index_type; + using real_type = ReSolve::real_type; + using vector_type = ReSolve::vector::Vector; + using matrix_type = ReSolve::matrix::Sparse; + + //we want error sum to be 0 at the end + //that means PASS. + //otherwise it is a FAIL. + int error_sum = 0; + int status = 0; + + ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP(); + workspace_HIP->initializeHandles(); + ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP); + ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP); + + ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU; + KLU->setupParameters(1, 0.1, false); + + ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP); + // Input to this code is location of `data` directory where matrix files are stored + const std::string data_path = (argc == 2) ? 
argv[1] : "./"; + + + std::string matrixFileName1 = data_path + "data/matrix_ACTIVSg200_AC_10.mtx"; + std::string matrixFileName2 = data_path + "data/matrix_ACTIVSg200_AC_11.mtx"; + + std::string rhsFileName1 = data_path + "data/rhs_ACTIVSg200_AC_10.mtx.ones"; + std::string rhsFileName2 = data_path + "data/rhs_ACTIVSg200_AC_11.mtx.ones"; + + // Read first matrix + std::ifstream mat1(matrixFileName1); + if(!mat1.is_open()) + { + std::cout << "Failed to open file " << matrixFileName1 << "\n"; + return -1; + } + ReSolve::matrix::Coo* A_coo = ReSolve::io::readMatrixFromFile(mat1); + ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(A_coo->getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + mat1.close(); + + // Read first rhs vector + std::ifstream rhs1_file(rhsFileName1); + if(!rhs1_file.is_open()) + { + std::cout << "Failed to open file " << rhsFileName1 << "\n"; + return -1; + } + real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); + real_type* x = new real_type[A->getNumRows()]; + vector_type* vec_rhs = new vector_type(A->getNumRows()); + vector_type* vec_x = new vector_type(A->getNumRows()); + vec_x->allocate(ReSolve::memory::HOST);//for KLU + vec_x->allocate(ReSolve::memory::DEVICE); + vector_type* vec_r = new vector_type(A->getNumRows()); + rhs1_file.close(); + + // Convert first matrix to CSR format + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + + // Solve the first system using KLU + status = KLU->setup(A); + error_sum += status; + + status = KLU->analyze(); + error_sum += status; + + status = KLU->factorize(); + error_sum += status; + + status = KLU->solve(vec_rhs, vec_x); + error_sum += status; + + std::cout<<"KLU solve status: "<getLFactor(); + matrix_type* U = KLU->getUFactor(); + if (L == nullptr) {printf("ERROR");} + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_rhs->setDataUpdated(ReSolve::memory::DEVICE); + + status = Rf->setup(A, L, U, P, Q, vec_rhs); + error_sum += status; + std::cout<<"Rf setup status: "<refactorize(); + error_sum += status; + vector_type* vec_test; + vector_type* vec_diff; + vec_test = new vector_type(A->getNumRows()); + vec_diff = new vector_type(A->getNumRows()); + real_type* x_data = new real_type[A->getNumRows()]; + for (int i=0; igetNumRows(); ++i){ + x_data[i] = 1.0; + } + + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, "hip")); + matrix_handler->setValuesChanged(true, "hip"); + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","hip"); + error_sum += status; + + real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + //for testing only - control + + real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "hip")); + real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + + //compute x-x_true + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = 
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "hip");
+  error_sum += status;
+  real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+  //evaluate the residual ON THE CPU using COMPUTED solution
+
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST);
+
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu");
+  error_sum += status;
+
+  real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  std::cout << "Results (first matrix): " << std::endl;
+  std::cout << "\t ||b-A*x||_2 (device) : " << normRmatrix1          << std::endl;
+  std::cout << "\t ||b-A*x||_2 (host)   : " << normRmatrix1CPU       << std::endl;
+  std::cout << "\t ||b-A*x_true||_2     : " << exactSol_normRmatrix1 << std::endl;
+  std::cout << "\t ||x-x_true||_2       : " << normDiffMatrix1       << std::endl;
+  std::cout << "\t ||b||_2              : " << normB1                << std::endl;
+  std::cout << "\t ||x||_2              : " << normXtrue             << std::endl;
+
+  // Load the second matrix
+  std::ifstream mat2(matrixFileName2);
+  if(!mat2.is_open())
+  {
+    std::cout << "Failed to open file " << matrixFileName2 << "\n";
+    return -1;
+  }
+  ReSolve::io::readAndUpdateMatrix(mat2, A_coo);
+  mat2.close();
+
+  // Load the second rhs vector
+  std::ifstream rhs2_file(rhsFileName2);
+  if(!rhs2_file.is_open())
+  {
+    std::cout << "Failed to open file " << rhsFileName2 << "\n";
+    return -1;
+  }
+  ReSolve::io::readAndUpdateRhs(rhs2_file, &rhs);
+  rhs2_file.close();
+
+  matrix_handler->coo2csr(A_coo, A, "hip");
+  vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+
+  // this hangs up
+  status = Rf->refactorize();
+  error_sum += status;
+
+  std::cout << "rocSolverRf refactorization status: " << status << std::endl;
+
+  status = Rf->solve(vec_rhs, vec_x);
+  error_sum += status;
+
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  matrix_handler->setValuesChanged(true, "hip");
+
+  status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "hip");
+  error_sum += status;
+
+  real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  //for testing only - control
+  real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip"));
+  //compute x-x_true
+  vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip");
+  //evaluate its norm
+  real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip"));
+
+  //compute the residual using exact solution
+  vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE);
+  status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "hip");
+  error_sum += status;
+  real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip"));
+
+  std::cout << "Results (second matrix): " << std::endl;
+  std::cout << "\t ||b-A*x||_2      : " << normRmatrix2          << std::endl;
+  std::cout << "\t ||b-A*x_true||_2 : " << exactSol_normRmatrix2 << std::endl;
+  std::cout << "\t ||x-x_true||_2   : " << normDiffMatrix2       << std::endl;
+  std::cout << "\t ||b||_2          : " << normB2                << std::endl;
+
+  if (error_sum == 0) {
+    std::cout << "Test KLU with rocsolver_rf refactorization PASSED" << std::endl;
+  } else {
+    std::cout << "Test KLU with rocsolver_rf refactorization FAILED, error sum: " << error_sum << std::endl;
+  }
+
+  delete A;
+  delete KLU;
+  delete Rf;
+  delete [] x;
+  delete [] rhs;
+  delete vec_r;
+  delete vec_x;
+  delete workspace_HIP;
+  delete [] x_data;
+
+  return error_sum;
+}
diff --git a/tests/functionality/testKLU_RocSolver_FGMRES.cpp b/tests/functionality/testKLU_RocSolver_FGMRES.cpp
new file mode 100644
--- /dev/null
+++ b/tests/functionality/testKLU_RocSolver_FGMRES.cpp
+#include <string>
+#include <iostream>
+#include <fstream>
+
+#include <resolve/matrix/Coo.hpp>
+#include <resolve/matrix/Csr.hpp>
+#include <resolve/matrix/Csc.hpp>
+#include <resolve/matrix/io.hpp>
+#include <resolve/matrix/MatrixHandler.hpp>
+#include <resolve/vector/Vector.hpp>
+#include <resolve/vector/VectorHandler.hpp>
+#include <resolve/GramSchmidt.hpp>
+#include <resolve/LinSolverDirectKLU.hpp>
+#include <resolve/LinSolverDirectRocSolverRf.hpp>
+#include <resolve/LinSolverIterativeFGMRES.hpp>
+//author: KS
+//functionality test to check whether rocsolver_rf/FGMRES works correctly.
+
+using namespace ReSolve::constants;
+
+int main(int argc, char *argv[])
+{
+  // Use ReSolve data types.
+  using index_type = ReSolve::index_type;
+  using real_type = ReSolve::real_type;
+  using vector_type = ReSolve::vector::Vector;
+
+  //we want error sum to be 0 at the end
+  //that means PASS.
+  //otherwise it is a FAIL.
+  int error_sum = 0;
+  int status = 0;
+
+  ReSolve::LinAlgWorkspaceHIP* workspace_HIP = new ReSolve::LinAlgWorkspaceHIP();
+  workspace_HIP->initializeHandles();
+  ReSolve::MatrixHandler* matrix_handler = new ReSolve::MatrixHandler(workspace_HIP);
+  ReSolve::VectorHandler* vector_handler = new ReSolve::VectorHandler(workspace_HIP);
+
+  ReSolve::LinSolverDirectKLU* KLU = new ReSolve::LinSolverDirectKLU;
+  KLU->setupParameters(1, 0.1, false);
+
+  ReSolve::LinSolverDirectRocSolverRf* Rf = new ReSolve::LinSolverDirectRocSolverRf(workspace_HIP);
+  ReSolve::GramSchmidt* GS = new ReSolve::GramSchmidt(vector_handler, ReSolve::GramSchmidt::cgs2);
+  ReSolve::LinSolverIterativeFGMRES* FGMRES = new ReSolve::LinSolverIterativeFGMRES(matrix_handler, vector_handler, GS, "hip");
+  // Input to this code is location of `data` directory where matrix files are stored
+  const std::string data_path = (argc == 2) ? argv[1] : "./";
argv[1] : "./"; + + + std::string matrixFileName1 = data_path + "data/matrix_ACTIVSg2000_AC_00.mtx"; + std::string matrixFileName2 = data_path + "data/matrix_ACTIVSg2000_AC_02.mtx"; + + std::string rhsFileName1 = data_path + "data/rhs_ACTIVSg2000_AC_00.mtx.ones"; + std::string rhsFileName2 = data_path + "data/rhs_ACTIVSg2000_AC_02.mtx.ones"; + + + + // Read first matrix + std::ifstream mat1(matrixFileName1); + if(!mat1.is_open()) + { + std::cout << "Failed to open file " << matrixFileName1 << "\n"; + return -1; + } + ReSolve::matrix::Coo* A_coo = ReSolve::io::readMatrixFromFile(mat1); + ReSolve::matrix::Csr* A = new ReSolve::matrix::Csr(A_coo->getNumRows(), + A_coo->getNumColumns(), + A_coo->getNnz(), + A_coo->symmetric(), + A_coo->expanded()); + mat1.close(); + + // Read first rhs vector + std::ifstream rhs1_file(rhsFileName1); + if(!rhs1_file.is_open()) + { + std::cout << "Failed to open file " << rhsFileName1 << "\n"; + return -1; + } + real_type* rhs = ReSolve::io::readRhsFromFile(rhs1_file); + real_type* x = new real_type[A->getNumRows()]; + vector_type* vec_rhs = new vector_type(A->getNumRows()); + vector_type* vec_x = new vector_type(A->getNumRows()); + vector_type* vec_r = new vector_type(A->getNumRows()); + rhs1_file.close(); + + // Convert first matrix to CSR format + matrix_handler->coo2csr(A_coo, A, "cpu"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + vec_rhs->setDataUpdated(ReSolve::memory::HOST); + + // Solve the first system using KLU + status = KLU->setup(A); + error_sum += status; + + status = KLU->analyze(); + error_sum += status; + + status = KLU->factorize(); + error_sum += status; + + status = KLU->solve(vec_rhs, vec_x); + error_sum += status; + + vector_type* vec_test; + vector_type* vec_diff; + + vec_test = new vector_type(A->getNumRows()); + vec_diff = new vector_type(A->getNumRows()); + real_type* x_data = new real_type[A->getNumRows()]; + + for (int i=0; igetNumRows(); ++i){ + x_data[i] = 1.0; + } + + vec_test->setData(x_data, ReSolve::memory::HOST); + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + // real_type normXmatrix1 = sqrt(vector_handler->dot(vec_test, vec_test, ReSolve::memory::DEVICE)); + matrix_handler->setValuesChanged(true, "hip"); + //evaluate the residual ||b-Ax|| + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr","hip"); + error_sum += status; + + real_type normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + + //for testing only - control + + real_type normXtrue = sqrt(vector_handler->dot(vec_x, vec_x, "hip")); + real_type normB1 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + + //compute x-x_true + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix1 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE,"csr", "hip"); + error_sum += status; + real_type exactSol_normRmatrix1 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + //evaluate the residual ON THE CPU using COMPUTED solution + + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::HOST); + + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE,"csr", "cpu"); + error_sum += status; + + real_type normRmatrix1CPU = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + 
std::cout<<"Results (first matrix): "<getLFactor(); + ReSolve::matrix::Csc* U = (ReSolve::matrix::Csc*) KLU->getUFactor(); + + if (L == nullptr) { + printf("ERROR"); + } + index_type* P = KLU->getPOrdering(); + index_type* Q = KLU->getQOrdering(); + Rf->setSolveMode(1); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + error_sum += Rf->setup(A, L, U, P, Q, vec_rhs); + FGMRES->setMaxit(200); + FGMRES->setRestart(100); + + GS->setup(A->getNumRows(), FGMRES->getRestart()); + status = FGMRES->setup(A); + error_sum += status; + + // Load the second matrix + std::ifstream mat2(matrixFileName2); + if(!mat2.is_open()) + { + std::cout << "Failed to open file " << matrixFileName2 << "\n"; + return -1; + } + ReSolve::io::readAndUpdateMatrix(mat2, A_coo); + mat2.close(); + + // Load the second rhs vector + std::ifstream rhs2_file(rhsFileName2); + if(!rhs2_file.is_open()) + { + std::cout << "Failed to open file " << rhsFileName2 << "\n"; + return -1; + } + ReSolve::io::readAndUpdateRhs(rhs2_file, &rhs); + rhs2_file.close(); + + matrix_handler->coo2csr(A_coo, A, "hip"); + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + + status = Rf->refactorize(); + error_sum += status; + + vec_x->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = Rf->solve(vec_x); + error_sum += status; + + FGMRES->resetMatrix(A); + status = FGMRES->setupPreconditioner("LU", Rf); + error_sum += status; + + vec_rhs->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = FGMRES->solve(vec_rhs, vec_x); + error_sum += status; + + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + matrix_handler->setValuesChanged(true, "hip"); + + //evaluate final residual + status = matrix_handler->matvec(A, vec_x, vec_r, &ONE, &MINUSONE, "csr", "hip"); + error_sum += status; + + real_type normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + + + //for testing only - control + real_type normB2 = sqrt(vector_handler->dot(vec_rhs, vec_rhs, "hip")); + //compute x-x_true + vec_diff->update(x_data, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + vector_handler->axpy(&MINUSONE, vec_x, vec_diff, "hip"); + //evaluate its norm + real_type normDiffMatrix2 = sqrt(vector_handler->dot(vec_diff, vec_diff, "hip")); + + //compute the residual using exact solution + vec_r->update(rhs, ReSolve::memory::HOST, ReSolve::memory::DEVICE); + status = matrix_handler->matvec(A, vec_test, vec_r, &ONE, &MINUSONE, "csr", "hip"); + error_sum += status; + real_type exactSol_normRmatrix2 = sqrt(vector_handler->dot(vec_r, vec_r, "hip")); + std::cout<<"Results (second matrix): "<getNumIter()<<" (max 200, restart 100)"<getInitResidualNorm()<<" "<getFinalResidualNorm()<<" (tol 1e-14)"<) -add_test(NAME matrix_handler_test COMMAND $) \ No newline at end of file +add_test(NAME matrix_handler_test COMMAND $) diff --git a/tests/unit/matrix/MatrixHandlerTests.hpp b/tests/unit/matrix/MatrixHandlerTests.hpp index e203017a..63d2f49b 100644 --- a/tests/unit/matrix/MatrixHandlerTests.hpp +++ b/tests/unit/matrix/MatrixHandlerTests.hpp @@ -42,17 +42,23 @@ class MatrixHandlerTests : TestBase TestOutcome matVec(index_type N) { TestStatus status; + ReSolve::memory::MemorySpace ms; + if (memspace_ == "cpu") + ms = memory::HOST; + else + ms = memory::DEVICE; ReSolve::MatrixHandler* handler = createMatrixHandler(); matrix::Csr* A = createCsrMatrix(N, memspace_); vector::Vector x(N); vector::Vector y(N); - x.allocate(memspace_); - y.allocate(memspace_); + x.allocate(ms); + if (x.getData(ms) == 
NULL) printf("oups we have an issue \n"); + y.allocate(ms); - x.setToConst(1.0, memspace_); - y.setToConst(1.0, memspace_); + x.setToConst(1.0, ms); + y.setToConst(1.0, ms); real_type alpha = 2.0/30.0; real_type beta = 2.0; @@ -80,6 +86,12 @@ class MatrixHandlerTests : TestBase LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA(); workspace->initializeHandles(); return new MatrixHandler(workspace); +#endif +#ifdef RESOLVE_USE_HIP + } else if (memspace_ == "hip") { + LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP(); + workspace->initializeHandles(); + return new MatrixHandler(workspace); #endif } else { std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n"; @@ -91,14 +103,14 @@ class MatrixHandlerTests : TestBase { bool status = true; if (memspace != "cpu") { - x.copyData(memspace, "cpu"); + x.copyData(memory::DEVICE, memory::HOST); } for (index_type i = 0; i < x.getSize(); ++i) { - // std::cout << x.getData("cpu")[i] << "\n"; - if (!isEqual(x.getData("cpu")[i], answer)) { + // std::cout << x.getData(memory::HOST)[i] << "\n"; + if (!isEqual(x.getData(memory::HOST)[i], answer)) { status = false; - std::cout << "Solution vector element x[" << i << "] = " << x.getData("cpu")[i] + std::cout << "Solution vector element x[" << i << "] = " << x.getData(memory::HOST)[i] << ", expected: " << answer << "\n"; break; } @@ -118,42 +130,42 @@ class MatrixHandlerTests : TestBase // std::cout << N << "\n"; + // First compute number of nonzeros index_type NNZ = 0; for (index_type i = 0; i < N; ++i) { - NNZ += static_cast(data[i%5].size()); + size_t reminder = static_cast(i%5); + NNZ += static_cast(data[reminder].size()); } - // std::cout << NNZ << "\n"; + // Allocate NxN CSR matrix with NNZ nonzeros matrix::Csr* A = new matrix::Csr(N, N, NNZ); - A->allocateMatrixData("cpu"); + A->allocateMatrixData(memory::HOST); - index_type* rowptr = A->getRowData("cpu"); - index_type* colidx = A->getColData("cpu"); - real_type* val = A->getValues("cpu"); + index_type* rowptr = A->getRowData(memory::HOST); + index_type* colidx = A->getColData(memory::HOST); + real_type* val = A->getValues( memory::HOST); + // Populate CSR matrix using same row pattern as for NNZ calculation rowptr[0] = 0; - index_type i = 0; - for (i=0; i < N; ++i) + for (index_type i=0; i < N; ++i) { - const std::vector& row_sample = data[i%5]; + size_t reminder = static_cast(i%5); + const std::vector& row_sample = data[reminder]; index_type nnz_per_row = static_cast(row_sample.size()); - // std::cout << nnz_per_row << "\n"; rowptr[i+1] = rowptr[i] + nnz_per_row; for (index_type j = rowptr[i]; j < rowptr[i+1]; ++j) { colidx[j] = (j - rowptr[i]) * N/nnz_per_row + (N%(N/nnz_per_row)); // evenly distribute nonzeros ^^^^ ^^^^^^^^ perturb offset - val[j] = row_sample[j - rowptr[i]]; - // std::cout << i << " " << colidx[j] << " " << val[j] << "\n"; + val[j] = row_sample[static_cast(j - rowptr[i])]; } } - A->setUpdated("cpu"); - // std::cout << rowptr[i] << "\n"; + A->setUpdated(memory::HOST); - if (memspace == "cuda") { - A->copyData(memspace); + if ((memspace == "cuda") || (memspace == "hip")) { + A->copyData(memory::DEVICE); } return A; diff --git a/tests/unit/matrix/MatrixIoTests.hpp b/tests/unit/matrix/MatrixIoTests.hpp index ad14f0a7..1ce23ae2 100644 --- a/tests/unit/matrix/MatrixIoTests.hpp +++ b/tests/unit/matrix/MatrixIoTests.hpp @@ -78,7 +78,7 @@ class MatrixIoTests : TestBase // Create a 5x5 COO matrix with 10 nonzeros ReSolve::matrix::Coo A(5, 5, 10); - A.allocateMatrixData("cpu"); + 
     // Read string into istream and pass it to `readMatrixFromFile` function.
     std::istringstream file2(symmetric_coo_matrix_file_);
@@ -176,9 +176,9 @@ class MatrixIoTests : TestBase
                        const std::vector<real_type>& val_data)
   {
     for (size_t i = 0; i < val_data.size(); ++i) {
-      if ((answer.getRowData("cpu")[i] != row_data[i]) ||
-          (answer.getColData("cpu")[i] != col_data[i]) ||
-          (!isEqual(answer.getValues("cpu")[i], val_data[i])))
+      if ((answer.getRowData(memory::HOST)[i] != row_data[i]) ||
+          (answer.getColData(memory::HOST)[i] != col_data[i]) ||
+          (!isEqual(answer.getValues(memory::HOST)[i], val_data[i])))
       {
         std::cout << "Incorrect matrix value at storage element " << i << ".\n";
         return false;
diff --git a/tests/unit/matrix/runMatrixHandlerTests.cpp b/tests/unit/matrix/runMatrixHandlerTests.cpp
index 6eee90d5..26ad70b0 100644
--- a/tests/unit/matrix/runMatrixHandlerTests.cpp
+++ b/tests/unit/matrix/runMatrixHandlerTests.cpp
@@ -33,5 +33,17 @@ int main(int, char**)
   }
 #endif

+#ifdef RESOLVE_USE_HIP
+  {
+    std::cout << "Running tests with HIP backend:\n";
+    ReSolve::tests::MatrixHandlerTests test("hip");
+
+    result += test.matrixHandlerConstructor();
+    result += test.matrixOneNorm();
+    result += test.matVec(50);
+
+    std::cout << "\n";
+  }
+#endif
   return result.summary();
 }
diff --git a/tests/unit/memory/CMakeLists.txt b/tests/unit/memory/CMakeLists.txt
new file mode 100644
index 00000000..01313e33
--- /dev/null
+++ b/tests/unit/memory/CMakeLists.txt
@@ -0,0 +1,21 @@
+#[[
+
+@brief Build ReSolve memory utilities unit tests
+
+@author Slaven Peles
+
+]]
+
+# Build memory utilities tests
+add_executable(runMemoryUtilsTests.exe runMemoryUtilsTests.cpp)
+target_link_libraries(runMemoryUtilsTests.exe PRIVATE ReSolve)
+message(STATUS "Resolve libraries: ${resolve_backend_hip}")
+
+# Install tests
+set(installable_tests runMemoryUtilsTests.exe)
+install(TARGETS ${installable_tests}
+        RUNTIME DESTINATION bin/resolve/tests/unit)
+
+# Add tests to run
+add_test(NAME memory_test COMMAND $<TARGET_FILE:runMemoryUtilsTests.exe>)
diff --git a/tests/unit/memory/MemoryUtilsTests.hpp b/tests/unit/memory/MemoryUtilsTests.hpp
new file mode 100644
index 00000000..4cc1ace8
--- /dev/null
+++ b/tests/unit/memory/MemoryUtilsTests.hpp
@@ -0,0 +1,110 @@
+#pragma once
+#include <string>
+#include <iostream>
+#include <algorithm>
+#include <cmath>
+#include <resolve/Common.hpp>
+#include <resolve/MemoryUtils.hpp>
+#include <tests/unit/TestBase.hpp>
+
+namespace ReSolve { namespace tests {
+
+/**
+ * @brief Unit tests for memory handler class
+ */
+class MemoryUtilsTests : TestBase
+{
+public:
+  MemoryUtilsTests(std::string memspace) : memspace_(memspace)
+  {}
+  virtual ~MemoryUtilsTests()
+  {}
+
+  TestOutcome allocateAndDelete()
+  {
+    TestStatus status;
+    status = true;
+
+    MemoryHandler mh;
+
+    index_type n = 1000;
+    size_t m = 8000;
+    index_type* i = nullptr;
+    real_type* r = nullptr;
+
+    mh.allocateArrayOnDevice(&i, n);
+    mh.allocateBufferOnDevice((void**) &r, m);
+
+    status *= (i != nullptr);
+    status *= (r != nullptr);
+
+    mh.deleteOnDevice(i);
+    mh.deleteOnDevice(r);
+
+    return status.report(__func__);
+  }
+
+  TestOutcome memsetAndMemcpy()
+  {
+    TestStatus status;
+    status = true;
+
+    MemoryHandler mh;
+
+    index_type n = 10;
+
+    real_type zero = 0.0;
+    real_type minusone = -1.0;
+
+    // Create raw arrays on the host and set their elements to -1
+    real_type* array1 = new real_type[n]{0};
+    real_type* array2 = new real_type[n]{0};
+    std::fill_n(array1, n, minusone);
+    std::fill_n(array2, n, minusone);
+
+    // Allocate arrays of size n on the device
+    real_type* devarray1 = nullptr;
+    real_type* devarray2 = nullptr;
+    mh.allocateArrayOnDevice(&devarray1, n);
+    mh.allocateArrayOnDevice(&devarray2, n);
+
+    // Set devarray1 elements to 0 and copy it to array1
+    mh.setZeroArrayOnDevice(devarray1, n);
+    mh.copyArrayDeviceToHost(array1, devarray1, n);
+    status *= verifyAnswer(array1, zero, n);
+
+    // Copy array2 (values -1) to devarray2 and then devarray2 to array1
+    mh.copyArrayHostToDevice(devarray2, array2, n);
+    mh.copyArrayDeviceToHost(array1, devarray2, n);
+    status *= verifyAnswer(array1, minusone, n);
+
+    // Copy devarray1 (values 0) to devarray2 and then to array2
+    mh.copyArrayDeviceToDevice(devarray2, devarray1, n);
+    mh.copyArrayDeviceToHost(array2, devarray2, n);
+    status *= verifyAnswer(array2, zero, n);
+
+    return status.report(__func__);
+  }
+
+private:
+  std::string memspace_{"cpu"};
+
+  bool verifyAnswer(real_type* x, real_type answer, index_type n)
+  {
+    bool status = true;
+
+    for (index_type i = 0; i < n; ++i) {
+      if (!isEqual(x[i], answer)) {
+        status = false;
+        std::cout << "Solution vector element x[" << i << "] = " << x[i]
+                  << ", expected: " << answer << "\n";
+        break;
+      }
+    }
+    return status;
+  }
+
+}; // class MemoryUtilsTests
+
+}} // namespace ReSolve::tests
diff --git a/tests/unit/memory/runMemoryUtilsTests.cpp b/tests/unit/memory/runMemoryUtilsTests.cpp
new file mode 100644
index 00000000..00349c7c
--- /dev/null
+++ b/tests/unit/memory/runMemoryUtilsTests.cpp
@@ -0,0 +1,36 @@
+#include <string>
+#include <iostream>
+#include <iomanip>
+
+#include "MemoryUtilsTests.hpp"
+
+int main(int, char**)
+{
+  ReSolve::tests::TestingResults result;
+
+#ifdef RESOLVE_USE_HIP
+  {
+    std::cout << "Running memory tests with HIP backend:\n";
+    ReSolve::tests::MemoryUtilsTests test("hip");
+
+    result += test.allocateAndDelete();
+    result += test.memsetAndMemcpy();
+
+    std::cout << "\n";
+  }
+#endif
+
+#ifdef RESOLVE_USE_CUDA
+  {
+    std::cout << "Running memory tests with CUDA backend:\n";
+    ReSolve::tests::MemoryUtilsTests test("cuda");
+
+    result += test.allocateAndDelete();
+    result += test.memsetAndMemcpy();
+
+    std::cout << "\n";
+  }
+#endif
+
+  return result.summary();
+}
diff --git a/tests/unit/vector/GramSchmidtTests.hpp b/tests/unit/vector/GramSchmidtTests.hpp
index 9981ea48..4837b57b 100644
--- a/tests/unit/vector/GramSchmidtTests.hpp
+++ b/tests/unit/vector/GramSchmidtTests.hpp
@@ -66,15 +66,21 @@ namespace ReSolve {
           break;
       }

+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();
       vector::Vector* V = new vector::Vector(N, 3); // we will be using a space of 3 vectors
       real_type* H = new real_type[6]; //in this case, Hessenberg matrix is 3 x 2
       real_type* aux_data; // needed for setup

-      V->allocate(memspace_);
-      if (memspace_ != "cpu") {
-        V->allocate("cpu");
+      V->allocate(ms);
+      if (ms != memory::HOST) {
+        V->allocate(memory::HOST);
       }
@@ -82,7 +88,7 @@ namespace ReSolve {
       GS->setup(N, 3);

       //fill 2nd and 3rd vector with values
-      aux_data = V->getVectorData(1, "cpu");
+      aux_data = V->getVectorData(1, memory::HOST);
       for (int i = 0; i < N; ++i) {
         if ( i % 2 == 0) {
           aux_data[i] = constants::ONE;
@@ -90,7 +96,7 @@ namespace ReSolve {
           aux_data[i] = var1;
         }
       }
-      aux_data = V->getVectorData(2, "cpu");
+      aux_data = V->getVectorData(2, memory::HOST);
       for (int i = 0; i < N; ++i) {
         if ( i % 3 > 0) {
           aux_data[i] = constants::ZERO;
@@ -98,11 +104,11 @@ namespace ReSolve {
           aux_data[i] = var2;
         }
       }
-      V->setDataUpdated("cpu");
-      V->copyData("cpu", memspace_);
+      V->setDataUpdated(memory::HOST);
+      V->copyData(memory::HOST, ms);
       //set the first vector to all 1s, normalize
-      V->setToConst(0, 1.0, memspace_);
+      V->setToConst(0, 1.0, ms);
       real_type nrm = handler->dot(V, V, memspace_);
       nrm = sqrt(nrm);
       nrm = 1.0 / nrm;
@@ -144,6 +150,12 @@ namespace ReSolve {
     // x is a multivector containing K vectors
     bool verifyAnswer(vector::Vector* x, index_type K, ReSolve::VectorHandler* handler, std::string memspace)
     {
+      ReSolve::memory::MemorySpace ms;
+      if (memspace == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       vector::Vector* a = new vector::Vector(x->getSize());
       vector::Vector* b = new vector::Vector(x->getSize());
@@ -152,8 +164,8 @@ namespace ReSolve {
       for (index_type i = 0; i < K; ++i) {
         for (index_type j = 0; j < K; ++j) {
-          a->update(x->getVectorData(i, memspace), memspace, "cpu");
-          b->update(x->getVectorData(j, memspace), memspace, "cpu");
+          a->update(x->getVectorData(i, ms), ms, memory::HOST);
+          b->update(x->getVectorData(j, ms), ms, memory::HOST);
           ip = handler->dot(a, b, "cpu");

           if ( (i != j) && (abs(ip) > 1e-14)) {
diff --git a/tests/unit/vector/VectorHandlerTests.hpp b/tests/unit/vector/VectorHandlerTests.hpp
index d2f8c73c..856bb84d 100644
--- a/tests/unit/vector/VectorHandlerTests.hpp
+++ b/tests/unit/vector/VectorHandlerTests.hpp
@@ -1,6 +1,7 @@
 #pragma once
 #include <string>
 #include <vector>
+#include <iomanip>
 #include <resolve/vector/Vector.hpp>
 #include <resolve/vector/VectorHandler.hpp>
 #include <tests/unit/TestBase.hpp>
@@ -38,16 +39,22 @@ namespace ReSolve {
     {
       TestStatus status;

+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();
       vector::Vector* x = new vector::Vector(N);
       vector::Vector* y = new vector::Vector(N);

-      x->allocate(memspace_);
-      y->allocate(memspace_);
+      x->allocate(ms);
+      y->allocate(ms);

-      x->setToConst(3.0, memspace_);
-      y->setToConst(1.0, memspace_);
+      x->setToConst(3.0, ms);
+      y->setToConst(1.0, ms);

       real_type alpha = 0.5;
       //the result is a vector with y[i] = 2.5;
@@ -65,16 +72,22 @@ namespace ReSolve {
     {
       TestStatus status;

+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();
       vector::Vector* x = new vector::Vector(N);
       vector::Vector* y = new vector::Vector(N);

-      x->allocate(memspace_);
-      y->allocate(memspace_);
+      x->allocate(ms);
+      y->allocate(ms);

-      x->setToConst(0.25, memspace_);
-      y->setToConst(4.0, memspace_);
+      x->setToConst(0.25, ms);
+      y->setToConst(4.0, ms);

       real_type ans;
       //the result is N
       ans = handler->dot(x, y, memspace_);
@@ -97,13 +110,19 @@ namespace ReSolve {
     {
       TestStatus status;

+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();
       vector::Vector* x = new vector::Vector(N);

-      x->allocate(memspace_);
+      x->allocate(ms);

-      x->setToConst(1.25, memspace_);
+      x->setToConst(1.25, ms);

       real_type alpha = 3.5;
@@ -121,17 +140,23 @@ namespace ReSolve {
     {
       TestStatus status;

+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();
       vector::Vector* x = new vector::Vector(N, K);
       vector::Vector* y = new vector::Vector(N);
       vector::Vector* alpha = new vector::Vector(K);;

-      x->allocate(memspace_);
-      y->allocate(memspace_);
-      alpha->allocate(memspace_);
+      x->allocate(ms);
+      y->allocate(ms);
+      alpha->allocate(ms);

-      y->setToConst(2.0, memspace_);
-      alpha->setToConst(-1.0, memspace_);
+      y->setToConst(2.0, ms);
+      alpha->setToConst(-1.0, ms);
       for (int ii = 0; ii < K; ++ii) {
         real_type c;
         if (ii % 2 == 0) {
@@ -139,15 +164,15 @@ namespace ReSolve {
         } else {
           c = 0.5;
         }
-        x->setToConst(ii, c, memspace_);
+        x->setToConst(ii, c, ms);
       }
+      index_type r = K % 2;
       real_type res = (real_type) ((floor((real_type) K / 2.0) + r) * 1.0 + floor((real_type) K / 2.0) * (-0.5));

       handler->massAxpy(N, alpha, K, x, y, memspace_);
       status *= verifyAnswer(y, 2.0 - res, memspace_);
-
-
+
       delete handler;
       delete x;
       delete y;
@@ -160,17 +185,23 @@ namespace ReSolve {
     {
       TestStatus status;

+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();
       vector::Vector* x = new vector::Vector(N, K);
       vector::Vector* y = new vector::Vector(N, 2);
       vector::Vector* res = new vector::Vector(K, 2);

-      x->allocate(memspace_);
-      y->allocate(memspace_);
-      res->allocate(memspace_);
+      x->allocate(ms);
+      y->allocate(ms);
+      res->allocate(ms);

-      x->setToConst(1.0, memspace_);
-      y->setToConst(-1.0, memspace_);
+      x->setToConst(1.0, ms);
+      y->setToConst(-1.0, ms);

       handler->massDot2Vec(N, x, K, y, res, memspace_);
       status *= verifyAnswer(res, (-1.0) * (real_type) N, memspace_);
@@ -185,6 +216,13 @@ namespace ReSolve {
     TestOutcome gemv(index_type N, index_type K)
     {
       TestStatus status;
+
+      ReSolve::memory::MemorySpace ms;
+      if (memspace_ == "cpu")
+        ms = memory::HOST;
+      else
+        ms = memory::DEVICE;
+
       ReSolve::VectorHandler* handler = createVectorHandler();

       vector::Vector* V = new vector::Vector(N, K);
       // for the test with NO TRANSPOSE
       vector::Vector* yN = new vector::Vector(N);
       vector::Vector* xN = new vector::Vector(K);
       // for the test with TRANSPOSE
       vector::Vector* yT = new vector::Vector(N);
       vector::Vector* xT = new vector::Vector(K);

-      V->allocate(memspace_);
-      yN->allocate(memspace_);
-      xN->allocate(memspace_);
-      yT->allocate(memspace_);
-      xT->allocate(memspace_);
-
-      V->setToConst(1.0, memspace_);
-      yN->setToConst(-1.0, memspace_);
-      xN->setToConst(.5, memspace_);
-      yT->setToConst(-1.0, memspace_);
-      xT->setToConst(.5, memspace_);
+      V->allocate(ms);
+      yN->allocate(ms);
+      xN->allocate(ms);
+      yT->allocate(ms);
+      xT->allocate(ms);
+
+      V->setToConst(1.0, ms);
+      yN->setToConst(-1.0, ms);
+      xN->setToConst(.5, ms);
+      yT->setToConst(-1.0, ms);
+      xT->setToConst(.5, ms);

       real_type alpha = -1.0;
       real_type beta = 1.0;
@@ -229,6 +267,12 @@ namespace ReSolve {
       LinAlgWorkspaceCUDA* workspace = new LinAlgWorkspaceCUDA();
       workspace->initializeHandles();
       return new VectorHandler(workspace);
+#endif
+#ifdef RESOLVE_USE_HIP
+    } else if (memspace_ == "hip") {
+      LinAlgWorkspaceHIP* workspace = new LinAlgWorkspaceHIP();
+      workspace->initializeHandles();
+      return new VectorHandler(workspace);
 #endif
     } else {
       std::cout << "ReSolve not built with support for memory space " << memspace_ << "\n";
@@ -241,14 +285,15 @@ namespace ReSolve {
     {
       bool status = true;
       if (memspace != "cpu") {
-        x->copyData(memspace, "cpu");
+        x->copyData(memory::DEVICE, memory::HOST);
       }

       for (index_type i = 0; i < x->getSize(); ++i) {
         // std::cout << x->getData("cpu")[i] << "\n";
-        if (!isEqual(x->getData("cpu")[i], answer)) {
+        if (!isEqual(x->getData(memory::HOST)[i], answer)) {
+          std::cout << std::setprecision(16);
           status = false;
-          std::cout << "Solution vector element x[" << i << "] = " << x->getData("cpu")[i]
+          std::cout << "Solution vector element x[" << i << "] = " << x->getData(memory::HOST)[i]
                     << ", expected: " << answer << "\n";
           break;
         }
diff --git a/tests/unit/vector/runVectorHandlerTests.cpp b/tests/unit/vector/runVectorHandlerTests.cpp
index 77e99471..9bb543a5 100644
--- a/tests/unit/vector/runVectorHandlerTests.cpp
+++ b/tests/unit/vector/runVectorHandlerTests.cpp
@@ -37,5 +37,22 @@ int main(int, char**)
   }
 #endif

+#ifdef RESOLVE_USE_HIP
+  {
+    std::cout << "Running tests with HIP backend:\n";
+    ReSolve::tests::VectorHandlerTests test("hip");
+
+    result += test.dot(5000);
+    result += test.axpy(5000);
+    result += test.scal(5000);
+    result += test.gemv(5000, 10);
+    result += test.massAxpy(100, 10);
+    result += test.massAxpy(1000, 300);
+    result += test.massDot(100, 10);
+    result += test.massDot(1000, 30);
+
+    std::cout << "\n";
+  }
+#endif
   return result.summary();
 }
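
A recurring pattern in the test changes above is translating the suites' legacy memory-space strings ("cpu", "cuda", "hip") into the new ReSolve::memory enum before calling vector and matrix methods, via an inline if/else repeated in each test. The sketch below is not part of the patch: the helper name toMemorySpace and the two-value enum stand-in are illustrative assumptions, shown only to condense that repeated conversion into one place.

// Minimal sketch, assuming a HOST/DEVICE enum like the one the patch uses.
#include <string>

namespace ReSolve { namespace memory {
  // Illustrative stand-in for the real ReSolve::memory::MemorySpace enum.
  enum MemorySpace { HOST, DEVICE };
}}

// Hypothetical helper: maps a legacy memory-space string onto the enum.
inline ReSolve::memory::MemorySpace toMemorySpace(const std::string& memspace)
{
  // "cpu" runs on the host; "cuda" and "hip" both map to DEVICE, which is
  // what lets the same unit tests drive either GPU backend unchanged.
  return (memspace == "cpu") ? ReSolve::memory::HOST
                             : ReSolve::memory::DEVICE;
}

With a helper like this, each test body could replace its five-line if/else with a single call such as `ReSolve::memory::MemorySpace ms = toMemorySpace(memspace_);`, keeping the string interface at the test boundary while the library works with enums internally.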