Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Permit INV_TRANS with OpenMP offload #196

Open
wants to merge 28 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
a2f6a28
Remove _OVERLOAD layer
samhatfield Feb 6, 2025
aac11e1
Refresh OpenMP in PRFI1B
samhatfield Dec 10, 2024
21af059
Refresh OpenMP in SPNSDE
samhatfield Dec 10, 2024
efe0636
Refresh OpenMP in VDTUV
samhatfield Dec 10, 2024
e218866
Refresh OpenMP in FSC
samhatfield Dec 11, 2024
d9851d6
Refresh OpenMP in FTINV
samhatfield Dec 11, 2024
ef7ab39
Refresh OpenMP in LEINV
samhatfield Dec 11, 2024
3b18eae
Refresh OpenMP in TPM_HICFFT
samhatfield Dec 11, 2024
9758067
Refresh OpenMP in TRMTOL
samhatfield Dec 11, 2024
7b2868f
Refresh OpenMP in TRMTOL_PACK_UNPACK
samhatfield Dec 11, 2024
c3625c0
Refresh OpenMP in TRLTOG
samhatfield Dec 11, 2024
1b7e709
Add OpenMP to ext_acc
samhatfield Dec 11, 2024
e15b102
Refresh OpenMP in BUFFERED_ALLOCATOR_MOD
samhatfield Jan 23, 2025
e0f8993
Add missing copy-in for D_NUMP (OpenMP)
samhatfield Feb 3, 2025
0851260
Make packing index integers shared, not private
samhatfield Feb 6, 2025
84a14d0
Copy all derived type parameters for OpenMP
samhatfield Feb 6, 2025
7d2deca
Fix handle reusing behaviour for OMPGPU
samhatfield Feb 6, 2025
56f1e17
Delete unused handles
samhatfield Feb 6, 2025
212993a
Update OMPT statements in prfi1b/vdtuv mods
thomasgibson Feb 7, 2025
a44e978
update ompgpu: spnsde mod
thomasgibson Feb 7, 2025
202a367
update ompgpu: leinv_mod (wip)
thomasgibson Feb 7, 2025
48d066a
update trmtol modules
thomasgibson Feb 7, 2025
74ed73e
update ftinv mod
thomasgibson Feb 7, 2025
b8398c3
initial update to trltog mod
thomasgibson Feb 7, 2025
e438cd6
Introduce data explicit scoping to OMP directives
thomasgibson Feb 7, 2025
d20af51
Fixup OMP directive in TRMTOL
samhatfield Feb 7, 2025
b213ea0
Remove accidental mapping of ZACHTE2
samhatfield Feb 7, 2025
ab272a2
Change HIP stream from 1 to 0
samhatfield Feb 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 2 additions & 8 deletions src/trans/gpu/algor/buffered_allocator_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,6 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE
ELSE
#ifdef ACCGPU
SET_STREAM_EFF = ACC_ASYNC_SYNC
#endif
#ifdef OMPGPU
#endif
ENDIF
IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN
Expand All @@ -157,14 +155,13 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE
!$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF)
#endif
#ifdef OMPGPU
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(LENGTH_IN_BYTES,SRC)
#endif
DO J=1_C_SIZE_T,LENGTH_IN_BYTES
SRC(J) = -1
ENDDO
#ifdef ACCGPU
!$ACC END PARALLEL
#endif
#ifdef OMPGPU
#endif
ENDIF
CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, &
Expand Down Expand Up @@ -196,8 +193,6 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU
ELSE
#ifdef ACCGPU
SET_STREAM_EFF = ACC_ASYNC_SYNC
#endif
#ifdef OMPGPU
#endif
ENDIF
IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN
Expand All @@ -208,14 +203,13 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU
!$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF)
#endif
#ifdef OMPGPU
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(LENGTH_IN_BYTES,SRC)
#endif
DO J=1_C_SIZE_T,LENGTH_IN_BYTES
SRC(J) = -1
ENDDO
#ifdef ACCGPU
!$ACC END PARALLEL
#endif
#ifdef OMPGPU
#endif
ENDIF
CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, &
Expand Down
4 changes: 4 additions & 0 deletions src/trans/gpu/algor/ext_acc.F90
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,7 @@ subroutine ext_acc_create(ptrs, stream)
!$acc enter data create(pp) async(stream_act)
#endif
#ifdef OMPGPU
!$omp target enter data map(alloc:pp)
#endif
enddo
end subroutine
Expand Down Expand Up @@ -327,6 +328,7 @@ subroutine ext_acc_copyin(ptrs, stream)
!$acc enter data copyin(pp) async(stream_act)
#endif
#ifdef OMPGPU
!$omp target enter data map(to:pp)
#endif
enddo
end subroutine
Expand Down Expand Up @@ -363,6 +365,7 @@ subroutine ext_acc_copyout(ptrs, stream)
!$acc exit data copyout(pp) async(stream_act)
#endif
#ifdef OMPGPU
!$omp target exit data map(from:pp)
#endif
enddo
end subroutine
Expand Down Expand Up @@ -399,6 +402,7 @@ subroutine ext_acc_delete(ptrs, stream)
!$acc exit data delete(pp) async(stream_act)
#endif
#ifdef OMPGPU
!$omp target exit data map(delete:pp)
#endif
enddo
end subroutine
Expand Down
18 changes: 10 additions & 8 deletions src/trans/gpu/algor/hicblas_gemm.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,7 @@
bool hip_alreadyAllocated_sgemm = false;
bool hip_alreadyAllocated_sgemm_handle = false;

bool hip_alreadyAllocated_dsgemm = false;
bool hip_alreadyAllocated_dgemm_handle = false;

hipblasHandle_t handle_hip_sgemm;
hipblasHandle_t handle_hip_dgemm;

namespace {
struct cache_key {
Expand Down Expand Up @@ -199,15 +195,25 @@ template <typename Real> struct hipblas_gemm_grouped {
void operator()(hipStream_t stream, int m, int n, int k, Real alpha,
const Real *A, int lda, const Real *B, int ldb, Real beta,
Real *C, int ldc) const {
// TODO: sort out this nonsense
#ifdef OMPGPU
hipblasHandle_t handle;
HICBLAS_CHECK(hipblasCreate(&handle));
#endif
#ifdef ACCGPU
hipblasHandle_t handle = get_hipblas_handle();
HICBLAS_CHECK(hipblasSetStream(handle, stream));
#endif

if constexpr (std::is_same<Real, float>::value)
HICBLAS_CHECK(hipblasSgemm(handle, transa_, transb_, m, n, k, &alpha, A,
lda, B, ldb, &beta, C, ldc));
if constexpr (std::is_same<Real, double>::value)
HICBLAS_CHECK(hipblasDgemm(handle, transa_, transb_, m, n, k, &alpha, A,
lda, B, ldb, &beta, C, ldc));
#ifdef OMPGPU
HICBLAS_CHECK(hipblasDestroy(handle));
#endif
}

private:
Expand Down Expand Up @@ -278,10 +284,6 @@ void hipblas_dgemm_wrapper(char transa, char transb, int m, int n, int k,
if (transb == 'T' || transb == 't')
op_t2 = HIPBLAS_OP_T;

if (!hip_alreadyAllocated_dgemm_handle) {
HICBLAS_CHECK(hipblasCreate(&handle_hip_dgemm));
hip_alreadyAllocated_dgemm_handle = true;
}
hipblasHandle_t handle = get_hipblas_handle();
HICBLAS_CHECK(hipblasSetStream(handle, *(hipStream_t *)stream));

Expand Down
233 changes: 0 additions & 233 deletions src/trans/gpu/algor/hicblas_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,8 @@
! nor does it submit to any jurisdiction.
!

#if defined CUDAGPU
#define ACC_GET_HIP_STREAM ACC_GET_CUDA_STREAM
#define OPENACC_LIB OPENACC
#endif

MODULE HICBLAS_MOD

USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD, JPIB
USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE
#ifdef ACCGPU
USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM
#endif
#ifdef OMPGPU
#endif

IMPLICIT NONE

INTERFACE
Expand Down Expand Up @@ -118,224 +105,4 @@ SUBROUTINE HIP_SGEMM_GROUPED( &
END SUBROUTINE HIP_SGEMM_GROUPED
END INTERFACE

CONTAINS

SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( &
& TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, STRIDEA, &
& BARRAY, LDB, STRIDEB, &
& BETA, &
& CARRAY, LDC, STRIDEC, &
& BATCHCOUNT, STREAM, ALLOC)
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_LONG, C_LOC
CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB
INTEGER(KIND=JPIM) :: M
INTEGER(KIND=JPIM) :: N
INTEGER(KIND=JPIM) :: K
REAL(KIND=JPRD) :: ALPHA
REAL(KIND=JPRD), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIM) :: STRIDEA
REAL(KIND=JPRD), DIMENSION(:,:) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
INTEGER(KIND=JPIM) :: STRIDEB
REAL(KIND=JPRD) :: BETA
REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
INTEGER(KIND=JPIM) :: LDC
INTEGER(KIND=JPIM) :: STRIDEC
INTEGER(KIND=JPIM) :: BATCHCOUNT
INTEGER(KIND=C_INT) :: STREAM
TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

#if defined(_CRAYFTN)
#ifdef ACCGPU
!$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY)
#endif
#endif
CALL HIP_DGEMM_BATCHED( &
& TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, STRIDEA, &
& BARRAY, LDB, STRIDEB, &
& BETA, &
& CARRAY, LDC, STRIDEC, &
& BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC))
#if defined(_CRAYFTN)
#ifdef ACCGPU
!$ACC END HOST_DATA
#endif
#endif
END SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD

SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( &
& TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, STRIDEA, &
& BARRAY, LDB, STRIDEB, &
& BETA, &
& CARRAY, LDC, STRIDEC, &
& BATCHCOUNT, STREAM, ALLOC)
USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_LONG, C_LOC
CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB
INTEGER(KIND=JPIM) :: M
INTEGER(KIND=JPIM) :: N
INTEGER(KIND=JPIM) :: K
REAL(KIND=JPRM) :: ALPHA
REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIM) :: STRIDEA
REAL(KIND=JPRM), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB
INTEGER(KIND=JPIM) :: STRIDEB
REAL(KIND=JPRM) :: BETA
REAL(KIND=JPRM), DIMENSION(:) :: CARRAY
INTEGER(KIND=JPIM) :: LDC
INTEGER(KIND=JPIM) :: STRIDEC
INTEGER(KIND=JPIM) :: BATCHCOUNT
INTEGER(KIND=C_INT) :: STREAM
TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

CALL HIP_SGEMM_BATCHED( &
& TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, STRIDEA, &
& BARRAY, LDB, STRIDEB, &
& BETA, &
& CARRAY, LDC, STRIDEC, &
& BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC))
END SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD

SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( &
& RESOL_ID, BLAS_ID, TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, OFFSETA, &
& BARRAY, LDB, OFFSETB, &
& BETA, &
& CARRAY, LDC, OFFSETC, &
& BATCHCOUNT, STREAM, ALLOC)
USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC
INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID
INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID
CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB
INTEGER(KIND=JPIM) :: M
INTEGER(KIND=JPIM) :: N(:)
INTEGER(KIND=JPIM) :: K(:)
REAL(KIND=JPRD) :: ALPHA
REAL(KIND=JPRD), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIB) :: OFFSETA(:)
REAL(KIND=JPRD), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIB) :: OFFSETB(:)
REAL(KIND=JPRD) :: BETA
REAL(KIND=JPRD), DIMENSION(:) :: CARRAY
INTEGER(KIND=JPIM) :: LDC
INTEGER(KIND=JPIB) :: OFFSETC(:)
INTEGER(KIND=JPIM) :: BATCHCOUNT
INTEGER(KIND=C_INT) :: STREAM
TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

CALL HIP_DGEMM_GROUPED( &
& RESOL_ID, BLAS_ID, TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, OFFSETA, &
& BARRAY, LDB, OFFSETB, &
& BETA, &
& CARRAY, LDC, OFFSETC, &
& BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC))

END SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD

SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(&
& RESOL_ID, BLAS_ID, TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, OFFSETA, &
& BARRAY, LDB, OFFSETB, &
& BETA, &
& CARRAY, LDC, OFFSETC, &
& BATCHCOUNT, STREAM, ALLOC)
USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC
INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID
INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID
CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB
INTEGER(KIND=JPIM) :: M
INTEGER(KIND=JPIM) :: N(:)
INTEGER(KIND=JPIM) :: K(:)
REAL(KIND=JPRM) :: ALPHA
REAL(KIND=JPRM), DIMENSION(:) :: AARRAY
INTEGER(KIND=JPIM) :: LDA
INTEGER(KIND=JPIB) :: OFFSETA(:)
REAL(KIND=JPRM), DIMENSION(*) :: BARRAY
INTEGER(KIND=JPIM) :: LDB(:)
INTEGER(KIND=JPIB) :: OFFSETB(:)
REAL(KIND=JPRM) :: BETA
REAL(KIND=JPRM), DIMENSION(:) :: CARRAY
INTEGER(KIND=JPIM) :: LDC
INTEGER(KIND=JPIB) :: OFFSETC(:)
INTEGER(KIND=JPIM) :: BATCHCOUNT
INTEGER(KIND=C_INT) :: STREAM
TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC

INTEGER(KIND=C_LONG) :: HIP_STREAM

#ifdef ACCGPU
HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG)
#endif
#ifdef OMPGPU
#endif

#if defined(_CRAYFTN)
#ifdef ACCGPU
!$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY)
#endif
#endif
CALL HIP_SGEMM_GROUPED( &
& RESOL_ID, BLAS_ID, TRANSA, TRANSB, &
& M, N, K, &
& ALPHA, &
& AARRAY, LDA, OFFSETA, &
& BARRAY, LDB, OFFSETB, &
& BETA, &
& CARRAY, LDC, OFFSETC, &
& BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC))
#if defined(_CRAYFTN)
#ifdef ACCGPU
!$ACC END HOST_DATA
#endif
#endif

END SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD

END MODULE HICBLAS_MOD
12 changes: 6 additions & 6 deletions src/trans/gpu/external/setup_trans.F90
Original file line number Diff line number Diff line change
Expand Up @@ -562,13 +562,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,&
!$ACC WAIT(1)
#endif
#ifdef OMPGPU
!$OMP TARGET ENTER DATA MAP(TO:R)
!$OMP TARGET ENTER DATA MAP(TO:F%RLAPIN,F%RACTHE,F%RW)
!$OMP TARGET ENTER DATA MAP(TO:FG%ZAA,FG%ZAS,FG%ZEPSNM)
!$OMP TARGET ENTER DATA MAP(TO:D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,&
!$OMP TARGET ENTER DATA MAP(TO:R,R%NSMAX,R%NTMAX,R%NDGL,R%NDGNH)
!$OMP TARGET ENTER DATA MAP(TO:F,F%RLAPIN,F%RACTHE,F%RW)
!$OMP TARGET ENTER DATA MAP(TO:FG,FG%ZAA,FG%ZAS,FG%ZEPSNM)
!$OMP TARGET ENTER DATA MAP(TO:D,D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,&
!$OMP& D%NPROCM,D%NPROCL,D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,&
!$OMP& D%OFFSETS_GEMM2)
!$OMP TARGET ENTER DATA MAP(TO:G%NDGLU,G%NMEN,G%NLOEN)
!$OMP& D%OFFSETS_GEMM2,D%NDGL_FS)
!$OMP TARGET ENTER DATA MAP(TO:G,G%NDGLU,G%NMEN,G%NLOEN)
#endif

WRITE(NOUT,*) '===GPU arrays successfully allocated'
Expand Down
Loading
Loading