From a2f6a285297232d4c0662336948375e3639d0c5f Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 6 Feb 2025 19:05:26 +0200 Subject: [PATCH 01/28] Remove _OVERLOAD layer --- src/trans/gpu/algor/hicblas_mod.F90 | 233 --------------------------- src/trans/gpu/internal/ledir_mod.F90 | 39 +++-- src/trans/gpu/internal/leinv_mod.F90 | 39 +++-- 3 files changed, 56 insertions(+), 255 deletions(-) diff --git a/src/trans/gpu/algor/hicblas_mod.F90 b/src/trans/gpu/algor/hicblas_mod.F90 index 21adae46d..cc25e7207 100644 --- a/src/trans/gpu/algor/hicblas_mod.F90 +++ b/src/trans/gpu/algor/hicblas_mod.F90 @@ -7,21 +7,8 @@ ! nor does it submit to any jurisdiction. ! -#if defined CUDAGPU -#define ACC_GET_HIP_STREAM ACC_GET_CUDA_STREAM -#define OPENACC_LIB OPENACC -#endif - MODULE HICBLAS_MOD -USE EC_PARKIND, ONLY: JPIM, JPRM, JPRD, JPIB -USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE -#ifdef ACCGPU -USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM -#endif -#ifdef OMPGPU -#endif - IMPLICIT NONE INTERFACE @@ -118,224 +105,4 @@ SUBROUTINE HIP_SGEMM_GROUPED( & END SUBROUTINE HIP_SGEMM_GROUPED END INTERFACE -CONTAINS - -SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT, STREAM, ALLOC) - USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_LONG, C_LOC - CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB - INTEGER(KIND=JPIM) :: M - INTEGER(KIND=JPIM) :: N - INTEGER(KIND=JPIM) :: K - REAL(KIND=JPRD) :: ALPHA - REAL(KIND=JPRD), DIMENSION(:) :: AARRAY - INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: STRIDEA - REAL(KIND=JPRD), DIMENSION(:,:) :: BARRAY - INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: STRIDEB - REAL(KIND=JPRD) :: BETA - REAL(KIND=JPRD), DIMENSION(:) :: CARRAY - INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: STRIDEC - INTEGER(KIND=JPIM) :: BATCHCOUNT - INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC - - INTEGER(KIND=C_LONG) :: HIP_STREAM - -#ifdef ACCGPU - HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) -#endif -#ifdef OMPGPU -#endif - -#if defined(_CRAYFTN) -#ifdef ACCGPU - !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) -#endif -#endif - CALL HIP_DGEMM_BATCHED( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) -#if defined(_CRAYFTN) -#ifdef ACCGPU - !$ACC END HOST_DATA -#endif -#endif -END SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD - -SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT, STREAM, ALLOC) - USE ISO_C_BINDING, ONLY: C_CHAR, C_INT, C_LONG, C_LOC - CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB - INTEGER(KIND=JPIM) :: M - INTEGER(KIND=JPIM) :: N - INTEGER(KIND=JPIM) :: K - REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(:) :: AARRAY - INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: STRIDEA - REAL(KIND=JPRM), DIMENSION(*) :: BARRAY - INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: STRIDEB - REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(:) :: CARRAY - INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: STRIDEC - INTEGER(KIND=JPIM) :: BATCHCOUNT - INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC - - INTEGER(KIND=C_LONG) :: HIP_STREAM - -#ifdef ACCGPU - HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) -#endif -#ifdef OMPGPU -#endif - - CALL HIP_SGEMM_BATCHED( & - & TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, STRIDEA, & - & BARRAY, LDB, STRIDEB, & - & BETA, & - & CARRAY, LDC, STRIDEC, & - & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) -END SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD - -SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & - & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, OFFSETA, & - & BARRAY, LDB, OFFSETB, & - & BETA, & - & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT, STREAM, ALLOC) - USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC - INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID - INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID - CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB - INTEGER(KIND=JPIM) :: M - INTEGER(KIND=JPIM) :: N(:) - INTEGER(KIND=JPIM) :: K(:) - REAL(KIND=JPRD) :: ALPHA - REAL(KIND=JPRD), DIMENSION(:) :: AARRAY - INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIB) :: OFFSETA(:) - REAL(KIND=JPRD), DIMENSION(*) :: BARRAY - INTEGER(KIND=JPIM) :: LDB(:) - INTEGER(KIND=JPIB) :: OFFSETB(:) - REAL(KIND=JPRD) :: BETA - REAL(KIND=JPRD), DIMENSION(:) :: CARRAY - INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIB) :: OFFSETC(:) - INTEGER(KIND=JPIM) :: BATCHCOUNT - INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC - - INTEGER(KIND=C_LONG) :: HIP_STREAM - -#ifdef ACCGPU - HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) -#endif -#ifdef OMPGPU -#endif - - CALL HIP_DGEMM_GROUPED( & - & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, OFFSETA, & - & BARRAY, LDB, OFFSETB, & - & BETA, & - & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) - -END SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD - -SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& - & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, OFFSETA, & - & BARRAY, LDB, OFFSETB, & - & BETA, & - & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT, STREAM, ALLOC) - USE ISO_C_BINDING, ONLY: C_INT, C_CHAR, C_LONG, C_LOC - INTEGER(KIND=C_INT), INTENT(IN) :: RESOL_ID - INTEGER(KIND=C_INT), INTENT(IN) :: BLAS_ID - CHARACTER(1,C_CHAR), VALUE :: TRANSA, TRANSB - INTEGER(KIND=JPIM) :: M - INTEGER(KIND=JPIM) :: N(:) - INTEGER(KIND=JPIM) :: K(:) - REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(:) :: AARRAY - INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIB) :: OFFSETA(:) - REAL(KIND=JPRM), DIMENSION(*) :: BARRAY - INTEGER(KIND=JPIM) :: LDB(:) - INTEGER(KIND=JPIB) :: OFFSETB(:) - REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(:) :: CARRAY - INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIB) :: OFFSETC(:) - INTEGER(KIND=JPIM) :: BATCHCOUNT - INTEGER(KIND=C_INT) :: STREAM - TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: ALLOC - - INTEGER(KIND=C_LONG) :: HIP_STREAM - -#ifdef ACCGPU - HIP_STREAM = INT(ACC_GET_HIP_STREAM(STREAM), C_LONG) -#endif -#ifdef OMPGPU -#endif - -#if defined(_CRAYFTN) -#ifdef ACCGPU - !$ACC HOST_DATA USE_DEVICE(AARRAY,BARRAY,CARRAY) -#endif -#endif - CALL HIP_SGEMM_GROUPED( & - & RESOL_ID, BLAS_ID, TRANSA, TRANSB, & - & M, N, K, & - & ALPHA, & - & AARRAY, LDA, OFFSETA, & - & BARRAY, LDB, OFFSETB, & - & BETA, & - & CARRAY, LDC, OFFSETC, & - & BATCHCOUNT, HIP_STREAM, C_LOC(ALLOC)) -#if defined(_CRAYFTN) -#ifdef ACCGPU - !$ACC END HOST_DATA -#endif -#endif - -END SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD - END MODULE HICBLAS_MOD diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index e7a1fb18a..4b7f78a86 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -1,4 +1,9 @@ #define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +#if defined CUDAGPU +#define ACC_GET_HIP_STREAM ACC_GET_CUDA_STREAM +#define OPENACC_LIB OPENACC +#endif + ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. ! (C) Copyright 2022- NVIDIA. @@ -106,16 +111,19 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) USE TPM_GEOMETRY, ONLY: G USE TPM_FIELDS_GPU, ONLY: FG USE TPM_DISTR, ONLY: D - USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & - & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD + USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED, & + & HIP_DGEMM_GROUPED, HIP_SGEMM_GROUPED USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT + USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT, C_LONG, C_LOC +#ifdef ACCGPU + USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM +#endif #ifdef TRANS_SINGLE -#define HIP_GEMM HIP_SGEMM_GROUPED_OVERLOAD +#define HIP_GEMM HIP_SGEMM_GROUPED #else -#define HIP_GEMM HIP_DGEMM_GROUPED_OVERLOAD +#define HIP_GEMM HIP_DGEMM_GROUPED #endif IMPLICIT NONE @@ -149,12 +157,21 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS + INTEGER(KIND=C_LONG) :: HIP_STREAM + ASSOCIATE(D_NUMP=>D%NUMP, R_NSMAX=>R%NSMAX, R_NTMAX=>R%NTMAX, G_NDGLU=>G%NDGLU, & & D_MYMS=>D%MYMS, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, & & D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2, & & ZAA=>FG%ZAA, ZAS=>FG%ZAS, ZAA0=>FG%ZAA0, ZAS0=>FG%ZAS0) IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) +#ifdef ACCGPU + HIP_STREAM = INT(ACC_GET_HIP_STREAM(1_C_INT), C_LONG) +#endif +#ifdef OMPGPU + HIP_STREAM = 1_C_LONG +#endif + CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& IOUT0_STRIDES0,IOUT0_STRIDES1,IIN0_STRIDES0,IIN0_STRIDES1) @@ -187,7 +204,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINPA0,ZOUT0) #endif - CALL HIP_DGEMM_BATCHED_OVERLOAD( & + CALL HIP_DGEMM_BATCHED( & & 'N', 'N', & & KF_FS, (R_NSMAX+2)/2, G_NDGLU(0), & & 1.0_JPRD, & @@ -195,7 +212,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & 1, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif @@ -233,7 +250,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & D_NUMP, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif @@ -306,7 +323,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC HOST_DATA USE_DEVICE(ZAS0,ZINPS0,ZOUT0) #endif ! compute m=0 in double precision: - call HIP_DGEMM_BATCHED_OVERLOAD( & + call HIP_DGEMM_BATCHED( & & 'N', 'N', & & KF_FS, (R_NSMAX+3)/2, G_NDGLU(0), & & 1.0_JPRD, & @@ -314,7 +331,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUT0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & 1, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif @@ -353,7 +370,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & D_NUMP, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 8bfc2ac0e..741bb3d01 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -1,4 +1,9 @@ #define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) +#if defined CUDAGPU +#define ACC_GET_HIP_STREAM ACC_GET_CUDA_STREAM +#define OPENACC_LIB OPENACC +#endif + ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. ! (C) Copyright 2022- NVIDIA. @@ -105,15 +110,18 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) USE TPM_GEOMETRY, ONLY: G USE TPM_FIELDS_GPU, ONLY: FG USE TPM_DISTR, ONLY: D - USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED_OVERLOAD, & - & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD - USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT + USE HICBLAS_MOD, ONLY: HIP_DGEMM_BATCHED, & + & HIP_DGEMM_GROUPED, HIP_SGEMM_GROUPED + USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_INT, C_LONG, C_LOC USE MPL_MODULE, ONLY: MPL_BARRIER,MPL_ALL_MS_COMM USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX +#ifdef ACCGPU + USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM +#endif #ifdef TRANS_SINGLE -#define HIP_GEMM HIP_SGEMM_GROUPED_OVERLOAD +#define HIP_GEMM HIP_SGEMM_GROUPED #else -#define HIP_GEMM HIP_DGEMM_GROUPED_OVERLOAD +#define HIP_GEMM HIP_DGEMM_GROUPED #endif IMPLICIT NONE @@ -137,6 +145,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE + INTEGER(KIND=C_LONG) :: HIP_STREAM + ASSOCIATE(D_NUMP=>D%NUMP, R_NSMAX=>R%NSMAX, G_NDGLU=>G%NDGLU, D_MYMS=>D%MYMS, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1,& D_OFFSETS_GEMM2=>D%OFFSETS_GEMM2, & ZAA=>FG%ZAA, ZAS=>FG%ZAS, ZAA0=>FG%ZAA0, ZAS0=>FG%ZAS0) @@ -144,6 +154,13 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !* 1.1 PREPARATIONS. IF (LHOOK) CALL DR_HOOK('LE_DGEMM',0,ZHOOK_HANDLE) +#ifdef ACCGPU + HIP_STREAM = INT(ACC_GET_HIP_STREAM(1_C_INT), C_LONG) +#endif +#ifdef OMPGPU + HIP_STREAM = 1_C_LONG +#endif + ! ------------------------------------------------------------------ !* 1. PERFORM LEGENDRE TRANFORM. @@ -247,7 +264,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAA0,ZINP0,ZOUTA0) #endif - CALL HIP_DGEMM_BATCHED_OVERLOAD( & + CALL HIP_DGEMM_BATCHED( & & 'N', 'T', & & KF_LEG, G_NDGLU(0), (R_NSMAX+2)/2, & & 1.0_JPRD, & @@ -255,7 +272,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & ZAA0, SIZE(ZAA0,1), 0, & & 0.0_JPRD, & & ZOUTA0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & 1, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif @@ -293,7 +310,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & D_NUMP, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif @@ -389,7 +406,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZAS0,ZINP0,ZOUTS0) #endif - CALL HIP_DGEMM_BATCHED_OVERLOAD( & + CALL HIP_DGEMM_BATCHED( & & 'N', 'T', & & KF_LEG, G_NDGLU(0), (R_NSMAX+3)/2, & & 1.0_JPRD, & @@ -397,7 +414,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & ZAS0, SIZE(ZAS0,1), 0, & & 0.0_JPRD, & & ZOUTS0, IOUT0_STRIDES0, 0, & - & 1, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & 1, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif @@ -433,7 +450,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & - & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) + & D_NUMP, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) #ifdef OMPGPU !$OMP END TARGET DATA #endif From aac11e105e636e2ec9f567f8f9886b6a58c74134 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 10 Dec 2024 13:49:55 +0200 Subject: [PATCH 02/28] Refresh OpenMP in PRFI1B --- src/trans/gpu/internal/prfi1b_mod.F90 | 128 +++++++------------------- 1 file changed, 34 insertions(+), 94 deletions(-) diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index 489683f8a..b2220804c 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -71,130 +71,70 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) INTEGER(KIND=JPIM),INTENT(IN),OPTIONAL :: KFLDPTR(:) ! LOCAL INTEGER SCALARS - INTEGER(KIND=JPIM) :: II, INM, IR, JN, JFLD, ILCM, IASM0, IFLD + INTEGER(KIND=JPIM) :: II, INM, IR, JN, JFLD, IASM0 ! ------------------------------------------------------------------ !* 1. EXTRACT FIELDS FROM SPECTRAL ARRAYS. ! -------------------------------------------------- + ASSOCIATE(D_NUMP=>D%NUMP, D_MYMS=>D%MYMS, D_NASM0=>D%NASM0, R_NSMAX=>R%NSMAX) #ifdef ACCGPU - !$ACC DATA & - !$ACC& PRESENT(D,D_NUMP,R,R_NSMAX,D_MYMS,D_NASM0) & - !$ACC& PRESENT(PIA) & - !$ACC& PRESENT(PSPEC) ASYNC(1) -#endif -#ifdef OMPGPU - !$OMP TARGET DATA MAP(PRESENT,ALLOC:D_NUMP,R_NSMAX,D_MYMS,D_NASM0,PSPEC) + !$ACC DATA PRESENT(D,D_NUMP,R,R_NSMAX,D_MYMS,D_NASM0,PIA,PSPEC) ASYNC(1) #endif - -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC DATA IF(PRESENT(KFLDPTR)) PRESENT(KFLDPTR) ASYNC(1) -#endif - IF(PRESENT(KFLDPTR)) THEN - CALL ABORT_TRANS("PRFI1B not implemented for GPU") - - !loop over wavenumber -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) & - !$OMP& FIRSTPRIVATE(KFIELDS) -#endif -#ifdef ACCGPU - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ILCM,IFLD,IASM0,IR,II,INM) & - !$ACC& FIRSTPRIVATE(KFIELDS) ASYNC(1) -#endif - DO KMLOC=1,D_NUMP - DO JN=1,R_NSMAX+1 - DO JFLD=1,KFIELDS - KM = D_MYMS(KMLOC) - ILCM = R_NSMAX+1-KM - IFLD = KFLDPTR(JFLD) - IF (JN .LE. ILCM) THEN - IASM0 = D_NASM0(KM) - INM = IASM0+(ILCM-JN)*2 - IR = 2*(JFLD-1)+1 - II = IR+1 - PIA(IR,JN+2,KMLOC) = PSPEC(IFLD,INM ) - PIA(II,JN+2,KMLOC) = PSPEC(IFLD,INM+1) - END IF - ENDDO - ENDDO - - ! end loop over wavenumber - ENDDO - -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) PRIVATE(KM,ILCM) FIRSTPRIVATE(KFIELDS) -#endif -#ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,ILCM) FIRSTPRIVATE(KFIELDS) ASYNC(1) -#endif - DO KMLOC=1,D_NUMP - DO JFLD=1,2*KFIELDS - KM = D_MYMS(KMLOC) - ILCM = R_NSMAX+1-KM - PIA(JFLD,1,KMLOC) = 0.0_JPRB - PIA(JFLD,2,KMLOC) = 0.0_JPRB - PIA(JFLD,ILCM+3,KMLOC) = 0.0_JPRB - ENDDO - ! end loop over wavenumber - ENDDO + CALL ABORT_TRANS("KFLDPTR not implemented for GPU") ELSE - !loop over wavenumber + !loop over wavenumber #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,IASM0,INM) & - !$OMP& FIRSTPRIVATE(KFIELDS,KDIM) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & + !$OMP& PRIVATE(KM,IASM0,INM) SHARED(KFIELDS,KDIM,D,R,PIA,PSPEC) MAP(TO:KFIELDS) #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IASM0,INM) FIRSTPRIVATE(KFIELDS,KDIM) & + !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IASM0,INM) & + !$ACC FIRSTPRIVATE(KFIELDS,KDIM) & #ifndef _CRAYFTN - !$ACC& ASYNC(1) + !$ACC& ASYNC(1) #else - !$ACC& + !$ACC& #endif #endif - DO KMLOC=1,D_NUMP - DO JN=0,R_NSMAX+3 - DO JFLD=1,KFIELDS - KM = D_MYMS(KMLOC) + DO KMLOC=1,D_NUMP + DO JN=0,R_NSMAX+3 + DO JFLD=1,KFIELDS + KM = D_MYMS(KMLOC) - IF (JN <= 1) THEN - PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB - PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB - ELSEIF (JN <= R_NSMAX+2-KM) THEN - IASM0 = D_NASM0(KM) - INM = IASM0+((R_NSMAX+2-JN)-KM)*2 - PIA(2*JFLD-1,JN+1,KMLOC) = PSPEC(JFLD,INM ) - PIA(2*JFLD ,JN+1,KMLOC) = PSPEC(JFLD,INM+1) - ELSEIF (JN <= R_NSMAX+3-KM) THEN - PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB - PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB - ENDIF - ENDDO + IF (JN <= 1) THEN + PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB + PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB + ELSEIF (JN <= R_NSMAX+2-KM) THEN + IASM0 = D_NASM0(KM) + INM = IASM0+((R_NSMAX+2-JN)-KM)*2 + PIA(2*JFLD-1,JN+1,KMLOC) = PSPEC(JFLD,INM ) + PIA(2*JFLD ,JN+1,KMLOC) = PSPEC(JFLD,INM+1) + ELSEIF (JN <= R_NSMAX+3-KM) THEN + PIA(2*JFLD-1,JN+1,KMLOC) = 0.0_JPRB + PIA(2*JFLD ,JN+1,KMLOC) = 0.0_JPRB + ENDIF + ENDDO + ENDDO ENDDO - ENDDO -ENDIF + ENDIF #ifdef ACCGPU -!$ACC END DATA -!$ACC END DATA -#endif -#ifdef OMPGPU -!$OMP END TARGET DATA + !$ACC END DATA #endif + END ASSOCIATE + ! ------------------------------------------------------------------ - END ASSOCIATE - + END SUBROUTINE PRFI1B END MODULE PRFI1B_MOD From 21af0598161bff4cda9440870e3e9f99873456e8 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 10 Dec 2024 13:50:15 +0200 Subject: [PATCH 03/28] Refresh OpenMP in SPNSDE --- src/trans/gpu/internal/spnsde_mod.F90 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index d7fdcd282..af320d058 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -73,13 +73,12 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) INTEGER(KIND=JPIM) :: KM, KMLOC INTEGER(KIND=JPIM), INTENT(IN) :: KF_SCALARS -!REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(0:R%NTMAX+2) REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:D%NUMP,0:R%NTMAX+2) REAL(KIND=JPRB), INTENT(IN) :: PF(:,:,:) REAL(KIND=JPRB), INTENT(OUT) :: PNSD(:,:,:) ! LOCAL INTEGER SCALARS -INTEGER(KIND=JPIM) :: IJ, ISKIP, J, JN, JI, IR, II +INTEGER(KIND=JPIM) :: J, JN, JI, IR, II ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS) @@ -98,8 +97,8 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) !* 1.1 COMPUTE #ifdef OMPGPU -!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(KM,IR,II,JI) MAP(TO:KF_SCALARS) & -!$OMP& SHARED(D,R) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & +!$OMP& PRIVATE(KM,IR,II,JI) MAP(TO:KF_SCALARS) SHARED(D,R,PEPSNM,PF,PNSD,KF_SCALARS) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,IR,II,JI) FIRSTPRIVATE(KF_SCALARS) & From efe06361b8537a7cff5df1366f522c8e641c8001 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Tue, 10 Dec 2024 13:50:38 +0200 Subject: [PATCH 04/28] Refresh OpenMP in VDTUV --- src/trans/gpu/internal/vdtuv_mod.F90 | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index e548abe96..bffa7afd0 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -73,7 +73,7 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) IMPLICIT NONE -INTEGER(KIND=JPIM) :: KM, kmloc +INTEGER(KIND=JPIM) :: KM, KMLOC INTEGER(KIND=JPIM), INTENT(IN) :: KFIELD REAL(KIND=JPRBT), INTENT(IN) :: PEPSNM(1:D%NUMP,0:R%NTMAX+2) REAL(KIND=JPRB), INTENT(INOUT) :: PVOR(:,:,:),PDIV(:,:,:) @@ -93,9 +93,6 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC& PRESENT(PEPSNM, PVOR, PDIV) & !$ACC& PRESENT(PU, PV) #endif -#ifdef OMPGPU -!$OMP TARGET DATA MAP(PRESENT,ALLOC:R_NTMAX,D_MYMS,D_NUMP,F_RLAPIN,PEPSNM,PVOR,PDIV,PU,PV) -#endif ! ------------------------------------------------------------------ @@ -103,8 +100,9 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ! ------------------------------------------ #ifdef OMPGPU -!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IR,II,KM,ZKM,JI) & -!$OMP& FIRSTPRIVATE(KFIELD,KMLOC) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & +!$OMP& PRIVATE(IR,II,KM,ZKM,JI) SHARED(D,R,F,PEPSNM,PVOR,PDIV,PU,PV,KFIELD) & +!$OMP& MAP(TO:KFIELD) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IR,II,KM,ZKM,JI) FIRSTPRIVATE(KFIELD,KMLOC) & @@ -153,9 +151,6 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) ENDDO ENDDO -#ifdef OMPGPU -!$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END DATA #endif From e218866cef7e852af086fb7b454216531069e20a Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:26:48 +0200 Subject: [PATCH 05/28] Refresh OpenMP in FSC --- src/trans/gpu/internal/fsc_mod.F90 | 42 ++++++++++++++---------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 6784f87f1..e2381aaa3 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -64,11 +64,9 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! ------------------------------------------------------------------ -USE TPM_TRANS, ONLY: LATLON USE TPM_DISTR, ONLY: MYSETW, MYPROC, NPROC, D USE TPM_GEOMETRY, ONLY: G USE TPM_FIELDS, ONLY: F -USE TPM_GEN, ONLY: NOUT USE TPM_DIM, ONLY: R ! @@ -107,11 +105,6 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$ACC DATA & !$ACC& PRESENT(D,D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F,F_RACTHE,G,G_NMEN,G_NLOEN,R,R_NSMAX) #endif -#ifdef OMPGPU -!$OMP TARGET DATA & -!$OMP& MAP(PRESENT,ALLOC:D_NSTAGTF,PREEL_COMPLEX,F_RACTHE,G_NMEN,G_NLOEN) & -!$OMP& MAP(TO:R_NSMAX) -#endif ! ------------------------------------------------------------------ @@ -122,9 +115,10 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !* 1.1 U AND V. #ifdef OMPGPU -!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & -!$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2,JM,JF,KGL) & -!$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_OFFSET,KF_FS) +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & +!$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2) & +!$OMP& SHARED(IBEG,IEND,IINC,KF_UV,R,OFFSET_VAR,G,D,KF_FS,KUV_OFFSET,F,PREEL_COMPLEX) & +!$OMP& MAP(TO:IBEG,IEND,IINC,KF_UV,OFFSET_VAR,KF_FS,KUV_OFFSET,ZACHTE2) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) & @@ -160,9 +154,11 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE IF (KSCALARS_NSDER_OFFSET >= 0) THEN #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & - !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_KSCALARS_NSDER,ZACHTE2,KGL,JF,JM) & - !$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_SCALARS,KSCALARS_NSDER_OFFSET,KF_FS) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & + !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_KSCALARS_NSDER,ZACHTE2) & + !$OMP& SHARED(IBEG,IEND,IINC,KF_SCALARS,R,OFFSET_VAR,G,D,KF_FS,KSCALARS_NSDER_OFFSET,F,& + !$OMP& PREEL_COMPLEX) & + !$OMP& MAP(TO:IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KF_FS,KSCALARS_NSDER_OFFSET) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_KSCALARS_NSDER,ZACHTE2,KGL,JF,JM) & @@ -204,9 +200,11 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ILOEN_MAX = MAXVAL(G_NLOEN) IF (KUV_EWDER_OFFSET >= 0) THEN #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & - !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & - !$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,OFFSET_VAR,KF_UV,KUV_EWDER_OFFSET,KUV_OFFSET,KF_FS) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & + !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2) & + !$OMP& SHARED(IBEG,IEND,IINC,KF_UV,ILOEN_MAX,OFFSET_VAR,G,D,KF_FS,KUV_OFFSET,KUV_EWDER_OFFSET,F,& + !$OMP& PREEL_COMPLEX) & + !$OMP& MAP(TO:IBEG,IEND,IINC,KF_UV,ILOEN_MAX,OFFSET_VAR,KF_FS,KUV_OFFSET,KUV_EWDER_OFFSET) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_UV,IOFF_UV_EWDER,RET_REAL,RET_COMPLEX,ZACHTE2,JM,JF,KGL) & @@ -253,9 +251,12 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE IF (KSCALARS_EWDER_OFFSET > 0) THEN #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & - !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & - !$OMP& FIRSTPRIVATE(IBEG,IEND,IINC,KF_SCALARS,OFFSET_VAR,KSCALARS_EWDER_OFFSET,KSCALARS_OFFSET,KF_FS) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & + !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,RET_REAL,RET_COMPLEX,ZACHTE2) & + !$OMP& SHARED(IBEG,IEND,IINC,KF_SCALARS,ILOEN_MAX,OFFSET_VAR,G,D,KF_FS,KSCALARS_EWDER_OFFSET,& + !$OMP& KSCALARS_OFFSET,F,PREEL_COMPLEX) & + !$OMP& MAP(TO:IBEG,IEND,IINC,KF_SCALARS,ILOEN_MAX,OFFSET_VAR,KF_FS,KSCALARS_EWDER_OFFSET,& + !$OMP& KSCALARS_OFFSET) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IGLG,IOFF_LAT,IOFF_SCALARS_EWDER,IOFF_SCALARS,ZACHTE2,RET_REAL,RET_COMPLEX) & @@ -304,9 +305,6 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$ACC END DATA #endif -#ifdef OMPGPU -!$OMP END TARGET DATA -#endif ! ------------------------------------------------------------------ END ASSOCIATE From d9851d6e023af0dcbca082c1250bfd13340eb1e6 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:27:08 +0200 Subject: [PATCH 06/28] Refresh OpenMP in FTINV --- src/trans/gpu/internal/ftinv_mod.F90 | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index ddd07deb0..8a7e42d13 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -101,8 +101,6 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) & 1_JPIB, 1_JPIB*KFIELD*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))) #endif -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,D_NPTRLS,D_NDGL_FS,D_NSTAGTF,G_NLOEN) #endif @@ -124,8 +122,6 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) ENDIF CALL GSTATS(423,1) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC END DATA #endif From ef7ab39eaeb1bf66b80b0be734a3729347e044af Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:27:31 +0200 Subject: [PATCH 07/28] Refresh OpenMP in LEINV --- src/trans/gpu/internal/leinv_mod.F90 | 50 +++++++++++++--------------- 1 file changed, 23 insertions(+), 27 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 741bb3d01..a091981ca 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -171,9 +171,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) - -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC DATA PRESENT(D,D_MYMS,D_NUMP) & !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & @@ -193,6 +190,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! PIA_2=2+1+(1..4-1)*2 ...3+(0..3)*2 .... 3,5,7,9 #ifdef OMPGPU + ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error + ! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & + !$OMP& PRIVATE(KM,IA,J,IIN_STRIDES0,IIN0_STRIDES0) & + !$OMP& SHARED(D,R,KF_LEG,ZINP) MAP(TO:KF_LEG) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) & @@ -208,8 +210,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KM = D_MYMS(KMLOC) IA = 1+MOD(R_NSMAX-KM+2,2) IF(KM /= 0)THEN -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC LOOP SEQ #endif @@ -225,8 +225,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) #endif ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ! every other field is sufficient because Im(KM=0) == 0 -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC LOOP SEQ #endif @@ -273,16 +271,14 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 0.0_JPRD, & & ZOUTA0, IOUT0_STRIDES0, 0, & & 1, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END HOST_DATA +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA #endif ENDIF - - DO KMLOC=1,D_NUMP KM = D_MYMS(KMLOC) KS(KMLOC) = (R_NSMAX-KM+2)/2 @@ -311,12 +307,12 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & & D_NUMP, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif IF (LSYNC_TRANS) THEN #ifdef ACCGPU @@ -339,6 +335,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! PIA_2=1+1+(1..5-1)*2 ...2+(0..4)*2 .... 2,4,6,8,10 #ifdef OMPGPU + ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error + ! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & + !$OMP& PRIVATE(KM,IS,J,IIN_STRIDES0,IIN0_STRIDES0) & + !$OMP& SHARED(D,R,KF_LEG,ZINP) MAP(TO:KF_LEG) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) & @@ -354,8 +355,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KM = D_MYMS(KMLOC) IS = 1+MOD(R_NSMAX-KM+1,2) IF(KM /= 0) THEN -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC LOOP SEQ #endif @@ -370,8 +369,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDDO #endif ELSEIF (MOD((JK-1),2) == 0) THEN -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC LOOP SEQ #endif @@ -415,11 +412,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 0.0_JPRD, & & ZOUTS0, IOUT0_STRIDES0, 0, & & 1, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END HOST_DATA +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA #endif ENDIF @@ -451,12 +448,13 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & & D_NUMP, HIP_STREAM, C_LOC(ALLOCATOR%PTR)) -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif + IF (LSYNC_TRANS) THEN #ifdef ACCGPU !$ACC WAIT(1) @@ -467,8 +465,6 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDIF CALL GSTATS(424,1) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC WAIT(1) From 3b18eae3ea1eef5dd33570ea25e6c60a2c960a5c Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:27:53 +0200 Subject: [PATCH 08/28] Refresh OpenMP in TPM_HICFFT --- src/trans/gpu/internal/tpm_hicfft.F90 | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 index 019071635..f1fc8de84 100755 --- a/src/trans/gpu/internal/tpm_hicfft.F90 +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -79,6 +79,9 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOEN END SUBROUTINE END INTERFACE +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(PREEL_REAL,PREEL_COMPLEX) +#endif #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) #endif @@ -86,6 +89,9 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOEN #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif END SUBROUTINE EXECUTE_DIR_FFT_FLOAT SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) @@ -115,6 +121,9 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOE END SUBROUTINE END INTERFACE +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(PREEL_REAL,PREEL_COMPLEX) +#endif #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_REAL,PREEL_COMPLEX) #endif @@ -122,6 +131,9 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,RESOL_ID,KFIELD,LOE #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif END SUBROUTINE EXECUTE_DIR_FFT_DOUBLE @@ -152,6 +164,9 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOEN END SUBROUTINE END INTERFACE +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(PREEL_COMPLEX,PREEL_REAL) +#endif #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) #endif @@ -159,6 +174,10 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOEN #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif + END SUBROUTINE SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOENS,OFFSETS,ALLOC) @@ -188,6 +207,9 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOE END SUBROUTINE END INTERFACE +#ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(PREEL_COMPLEX,PREEL_REAL) +#endif #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PREEL_COMPLEX,PREEL_REAL) #endif @@ -195,6 +217,10 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,RESOL_ID,KFIELD,LOE #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif + END SUBROUTINE From 975806721d7e2a72eb7df9fddf012ce8e94a8726 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:28:51 +0200 Subject: [PATCH 09/28] Refresh OpenMP in TRMTOL --- src/trans/gpu/internal/trmtol_mod.F90 | 34 ++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 73d93f8c2..0a748de7b 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -160,6 +160,11 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) FROM_RECV = IOFFR(IRANK) + 1 TO_RECV = FROM_RECV + ILENR(IRANK) - 1 #ifdef OMPGPU + !$OMP TARGET TEAMS DEFAULT(NONE) & + !$OMP& SHARED(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) & + !$OMP& MAP(TO:FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) + !$OMP PARALLEL + !$OMP WORKSHARE #endif #ifdef ACCGPU #ifdef __HIP_PLATFORM_AMD__ @@ -171,6 +176,9 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) #ifdef OMPGPU + !$OMP END WORKSHARE + !$OMP END PARALLEL + !$OMP END TARGET TEAMS #endif #ifdef ACCGPU !$ACC END KERNELS @@ -187,13 +195,19 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) CALL GSTATS(421,0) #ifdef USE_GPU_AWARE_MPI #ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(PFBUF_IN,PFBUF) #endif #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) #endif #else - !! this is safe-but-slow fallback for running without GPU-aware MPI - !$ACC UPDATE HOST(PFBUF_IN,PFBUF) + !! this is safe-but-slow fallback for running without GPU-aware MPI +#ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PFBUF_IN,PFBUF) +#endif +#ifdef ACCGPU + !$ACC UPDATE HOST(PFBUF_IN,PFBUF) +#endif #endif #if ECTRANS_HAVE_MPI @@ -205,14 +219,20 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif #ifdef USE_GPU_AWARE_MPI -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif #else - !! this is safe-but-slow fallback for running without GPU-aware MPI - !$ACC UPDATE DEVICE(PFBUF) + !! this is safe-but-slow fallback for running without GPU-aware MPI +#ifdef ACCGPU + !$ACC UPDATE DEVICE(PFBUF) +#endif +#ifdef OMPGPU + !$OMP TARGET UPDATE TO(PFBUF) +#endif #endif IF (LSYNC_TRANS) THEN CALL GSTATS(441,0) @@ -234,6 +254,8 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IEND = ISTA+ILEN-1 CALL GSTATS(1608,0) #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) & + !$OMP SHARED(PFBUF,PFBUF_IN,ISTA,IEND,ILEN) MAP(TO:ISTA,ILEN) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) FIRSTPRIVATE(ISTA,IEND) From 7b2868fceb057f8bc54fe6be578b9fde0f7f6617 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:29:27 +0200 Subject: [PATCH 10/28] Refresh OpenMP in TRMTOL_PACK_UNPACK --- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 20 +++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 76080145c..792630781 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -126,14 +126,18 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) & !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) #endif #ifdef OMPGPU + ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error + ! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) & + !$OMP& SHARED(D,R,G,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN) & + !$OMP& PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) & + !$OMP& MAP(TO:KF_LEG,IOUT_STRIDES0,IOUT0_STRIDES0) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,ZAOA,ZSOA) & @@ -174,8 +178,6 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I ENDDO ENDDO -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC WAIT(1) @@ -259,8 +261,6 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& & 1_JPIB, 1_JPIB*KF_TOTAL*D%NLENGTF*C_SIZEOF(PREEL_COMPLEX(1))) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC DATA PRESENT(G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) #endif @@ -268,6 +268,12 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN OFFSET_VAR=D_NPTRLS(MYSETW) ILOEN_MAX=MAXVAL(G_NLOEN) #ifdef OMPGPU +! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error +! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" +!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO & +!$OMP& SHARED(D,G,KF_CURRENT,ILOEN_MAX,OFFSET_VAR,FOUBUF,PREEL_COMPLEX) & +!$OMP& PRIVATE(IGLG,RET_REAL,RET_COMPLEX,ISTA) & +!$OMP& MAP(TO:KF_CURRENT,ILOEN_MAX,OFFSET_VAR,KF_TOTAL) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,RET_REAL,RET_COMPLEX) FIRSTPRIVATE(KF_CURRENT,& @@ -302,8 +308,6 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN ENDDO ENDDO ENDDO -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC END DATA From c3625c0e0977ac47612e3c48bf2196eae12e86e9 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 11:29:49 +0200 Subject: [PATCH 11/28] Refresh OpenMP in TRLTOG --- src/trans/gpu/internal/trltog_mod.F90 | 75 ++++++++++++++++++++------- 1 file changed, 55 insertions(+), 20 deletions(-) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index da885ac0e..d8cdeaf80 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -492,6 +492,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, LLOCAL_CONTRIBUTION = ISENDTOT(MYPROC) > 0 #ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:IGP_OFFSETS) #endif #ifdef ACCGPU !$ACC DATA COPYIN(IGP_OFFSETS) ASYNC(1) @@ -528,6 +529,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #endif #ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:IIN_TO_SEND_BUFR) IF(KF_FS > 0) #endif #ifdef ACCGPU !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) @@ -538,8 +540,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ! Present until self contribution and packing are done !$ACC DATA COPYIN(IIN_TO_SEND_BUFR) PRESENT(PREEL_REAL) IF(KF_FS > 0) ASYNC(1) -#endif -#ifdef OMPGPU #endif CALL GSTATS(1806,1) @@ -601,6 +601,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO #ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:IFLDA) #endif #ifdef ACCGPU !$ACC DATA COPYIN(IFLDA) ASYNC(1) @@ -616,6 +617,11 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) IF (PRESENT(PGP)) THEN #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) DEFAULT(NONE) & + !$OMP& PRIVATE(JK,JBLK,IFLD,IPOS) & + !$OMP& SHARED(KF_FS,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V,IFLDA,IIN_TO_SEND_BUFR_V, & + !$OMP& IIN_TO_SEND_BUFR,PREEL_REAL,PGP) & + !$OMP& MAP(TO:KF_FS,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V,IIN_TO_SEND_BUFR_V) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) & @@ -634,6 +640,11 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO ELSE #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) DEFAULT(NONE) & + !$OMP& PRIVATE(JK,JBLK,IFLD,IPOS) & + !$OMP& SHARED(KF_FS,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V,IFLDA,IIN_TO_SEND_BUFR_V, & + !$OMP& IIN_TO_SEND_BUFR,IGP_OFFSETS,PREEL_REAL,PGPUV,PGP2,PGP3A,PGP3B) & + !$OMP& MAP(TO:KF_FS,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V,IIN_TO_SEND_BUFR_V) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,IPOS) & @@ -683,8 +694,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) ENDIF -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC DATA PRESENT(ZCOMBUFS) IF(ISEND_COUNTS > 0) ASYNC(1) #endif @@ -695,6 +704,11 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(IPROC) ICOMBUFS_OFFSET_V = ICOMBUFS_OFFSET(INS) #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) DEFAULT(NONE) & + !$OMP& PRIVATE(IPOS) & + !$OMP& SHARED(KF_FS,ILEN,IIN_TO_SEND_BUFR_V,IIN_TO_SEND_BUFR,PREEL_REAL,ICOMBUFS_OFFSET_V, & + !$OMP& ZCOMBUFS) & + !$OMP& MAP(TO:KF_FS,ILEN,IIN_TO_SEND_BUFR_V,ICOMBUFS_OFFSET_V) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS) FIRSTPRIVATE(KF_FS,ILEN,IIN_TO_SEND_BUFR_V, & @@ -709,8 +723,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO ENDDO CALL GSTATS(1605,1) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC END DATA ! ZCOMBUFS @@ -730,12 +742,14 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !...Receive loop......................................................... #ifdef USE_GPU_AWARE_MPI #ifdef OMPGPU + !$OMP TARGET DATA USE_DEVICE_PTR(ZCOMBUFS,ZCOMBUFR) #endif #ifdef ACCGPU !$ACC HOST_DATA USE_DEVICE(ZCOMBUFS,ZCOMBUFR) #endif #else #ifdef OMPGPU + !$OMP TARGET UPDATE FROM(ZCOMBUFS) IF(ISEND_COUNTS > 0) #endif #ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI @@ -788,16 +802,20 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF #ifdef USE_GPU_AWARE_MPI -#ifdef OMPGPU -#endif #ifdef ACCGPU - !$ACC END HOST_DATA + !$ACC END HOST_DATA ! ZCOMBUFS, ZCOMBUFR +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA ! ZCOMBUFS, ZCOMBUFR #endif #else #ifdef OMPGPU #endif -#ifdef ACCGPU !! this is safe-but-slow fallback for running without GPU-aware MPI +#ifdef OMPGPU + !$OMP TARGET UPDATE TO(ZCOMBUFR) IF(IRECV_COUNTS > 0) +#endif +#ifdef ACCGPU !$ACC UPDATE DEVICE(ZCOMBUFR) IF(IRECV_COUNTS > 0) #endif #endif @@ -809,8 +827,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF CALL GSTATS(421,1) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC DATA PRESENT(ZCOMBUFR) IF(IRECV_COUNTS > 0) ASYNC(1) #endif @@ -830,6 +846,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) IF (PRESENT(PGP)) THEN #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) DEFAULT(NONE) & + !$OMP& PRIVATE(JK,JBLK,IFLD,JI) & + !$OMP& SHARED(IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V,IFLDA, & + !$OMP& ICOMBUFR_OFFSET_V,ZCOMBUFR,PGP,INR) & + !$OMP& MAP(TO:IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V, & + !$OMP& ICOMBUFR_OFFSET_V) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) & @@ -847,6 +869,12 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO ELSE #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) DEFAULT(NONE) & + !$OMP& PRIVATE(JK,JBLK,IFLD,JI) & + !$OMP& SHARED(IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V,IFLDA, & + !$OMP& ICOMBUFR_OFFSET_V,IGP_OFFSETS,ZCOMBUFR,PGPUV,PGP2,PGP3A,PGP3B,INR) & + !$OMP& MAP(TO:IRECV_FIELD_COUNT_V,IRECV_WSET_SIZE_V,NPROMA,IRECV_WSET_OFFSET_V, & + !$OMP& ICOMBUFR_OFFSET_V) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) DEFAULT(NONE) PRIVATE(JK,JBLK,IFLD,JI) & @@ -878,10 +906,8 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC WAIT(1) #endif -#ifdef OMPGPU -#endif #ifdef ACCGPU - !$ACC END DATA ! ZOMBUFR + !$ACC END DATA ! ZCOMBUFR #endif IF (LSYNC_TRANS) THEN #ifdef ACCGPU @@ -892,8 +918,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(440,1) ENDIF CALL GSTATS(422,0) -#ifdef OMPGPU -#endif #ifdef ACCGPU !$ACC END DATA ! IFLDA !$ACC END DATA ! PREEL_REAL @@ -902,9 +926,14 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA ! PGP2 !$ACC END DATA ! PGPUV !$ACC END DATA ! PGP +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA ! IFLDA + !$OMP END TARGET DATA ! IIN_TO_SEND_BUFR #endif IF (PRESENT(PGP)) THEN #ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PGP) #endif #ifdef ACCGPU !$ACC UPDATE HOST(PGP) ASYNC(1) @@ -912,6 +941,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF IF (PRESENT(PGPUV)) THEN #ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PGPUV) #endif #ifdef ACCGPU !$ACC UPDATE HOST(PGPUV) ASYNC(1) @@ -919,6 +949,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF IF (PRESENT(PGP2)) THEN #ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PGP2) #endif #ifdef ACCGPU !$ACC UPDATE HOST(PGP2) ASYNC(1) @@ -926,6 +957,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF IF (PRESENT(PGP3A)) THEN #ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PGP3A) #endif #ifdef ACCGPU !$ACC UPDATE HOST(PGP3A) ASYNC(1) @@ -933,6 +965,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF IF (PRESENT(PGP3B)) THEN #ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PGP3B) #endif #ifdef ACCGPU !$ACC UPDATE HOST(PGP3B) ASYNC(1) @@ -955,13 +988,15 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(442,1) ENDIF CALL GSTATS(422,1) -#ifdef OMPGPU -#endif + #ifdef ACCGPU - !$ACC END DATA ! IRECVBUFR_TO_OUT,PGPINDICES + !$ACC END DATA ! IGP_OFFSETS !$ACC WAIT(1) #endif +#ifdef OMPGPU + !$OMP END TARGET DATA !IGP_OFFSETS +#endif CALL GSTATS(1606,1) From 1b7e709a0b69d8e04aa4b43cadad50360bdcd04e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Wed, 11 Dec 2024 17:48:48 +0200 Subject: [PATCH 12/28] Add OpenMP to ext_acc --- src/trans/gpu/algor/ext_acc.F90 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index ffc760e61..4239d95bb 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -290,6 +290,7 @@ subroutine ext_acc_create(ptrs, stream) !$acc enter data create(pp) async(stream_act) #endif #ifdef OMPGPU + !$omp target enter data map(alloc:pp) #endif enddo end subroutine @@ -327,6 +328,7 @@ subroutine ext_acc_copyin(ptrs, stream) !$acc enter data copyin(pp) async(stream_act) #endif #ifdef OMPGPU + !$omp target enter data map(to:pp) #endif enddo end subroutine @@ -363,6 +365,7 @@ subroutine ext_acc_copyout(ptrs, stream) !$acc exit data copyout(pp) async(stream_act) #endif #ifdef OMPGPU + !$omp target exit data map(from:pp) #endif enddo end subroutine @@ -399,6 +402,7 @@ subroutine ext_acc_delete(ptrs, stream) !$acc exit data delete(pp) async(stream_act) #endif #ifdef OMPGPU + !$omp target exit data map(delete:pp) #endif enddo end subroutine From e15b1027d63686ac7bb5301b75faa255d62f501e Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 23 Jan 2025 19:14:03 +0200 Subject: [PATCH 13/28] Refresh OpenMP in BUFFERED_ALLOCATOR_MOD --- src/trans/gpu/algor/buffered_allocator_mod.F90 | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/trans/gpu/algor/buffered_allocator_mod.F90 b/src/trans/gpu/algor/buffered_allocator_mod.F90 index 346b85977..a4e187801 100644 --- a/src/trans/gpu/algor/buffered_allocator_mod.F90 +++ b/src/trans/gpu/algor/buffered_allocator_mod.F90 @@ -146,8 +146,6 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE ELSE #ifdef ACCGPU SET_STREAM_EFF = ACC_ASYNC_SYNC -#endif -#ifdef OMPGPU #endif ENDIF IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN @@ -157,14 +155,13 @@ SUBROUTINE ASSIGN_PTR_FLOAT(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALUE !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) #endif #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(LENGTH_IN_BYTES,SRC) #endif DO J=1_C_SIZE_T,LENGTH_IN_BYTES SRC(J) = -1 ENDDO #ifdef ACCGPU !$ACC END PARALLEL -#endif -#ifdef OMPGPU #endif ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & @@ -196,8 +193,6 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU ELSE #ifdef ACCGPU SET_STREAM_EFF = ACC_ASYNC_SYNC -#endif -#ifdef OMPGPU #endif ENDIF IF (SET_VALUE_EFF .AND. LENGTH_IN_BYTES > 0) THEN @@ -208,14 +203,13 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU !$ACC PARALLEL PRESENT(SRC) ASYNC(SET_STREAM_EFF) #endif #ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(LENGTH_IN_BYTES,SRC) #endif DO J=1_C_SIZE_T,LENGTH_IN_BYTES SRC(J) = -1 ENDDO #ifdef ACCGPU !$ACC END PARALLEL -#endif -#ifdef OMPGPU #endif ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & From e0f8993e218bcb0d078de479e8e00dd4f6d5ad75 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Mon, 3 Feb 2025 18:21:24 +0200 Subject: [PATCH 14/28] Add missing copy-in for D_NUMP (OpenMP) --- src/trans/gpu/external/setup_trans.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index d7102579b..cbfa46a43 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -565,7 +565,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !$OMP TARGET ENTER DATA MAP(TO:R) !$OMP TARGET ENTER DATA MAP(TO:F%RLAPIN,F%RACTHE,F%RW) !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA,FG%ZAS,FG%ZEPSNM) - !$OMP TARGET ENTER DATA MAP(TO:D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,& + !$OMP TARGET ENTER DATA MAP(TO:D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,& !$OMP& D%NPROCM,D%NPROCL,D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,& !$OMP& D%OFFSETS_GEMM2) !$OMP TARGET ENTER DATA MAP(TO:G%NDGLU,G%NMEN,G%NLOEN) From 0851260b35f9d36335d13759e3fd73fec0d164c4 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 6 Feb 2025 17:30:26 +0200 Subject: [PATCH 15/28] Make packing index integers shared, not private These variables should be inherited from the parent scope and shared between all threads. This was a bug. --- src/trans/gpu/internal/leinv_mod.F90 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index a091981ca..dd95f89ce 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -193,8 +193,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error ! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & - !$OMP& PRIVATE(KM,IA,J,IIN_STRIDES0,IIN0_STRIDES0) & - !$OMP& SHARED(D,R,KF_LEG,ZINP) MAP(TO:KF_LEG) + !$OMP& PRIVATE(KM,IA,J) & + !$OMP& SHARED(D,R,KF_LEG,ZINP,IIN_STRIDES0,IIN0_STRIDES0) MAP(TO:KF_LEG) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IA,J) & @@ -338,8 +338,8 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! Directive incomplete -> putting more variables in SHARED() triggers internal compiler error ! ftn-7991: INTERNAL COMPILER ERROR: "Too few arguments on the stack" !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) & - !$OMP& PRIVATE(KM,IS,J,IIN_STRIDES0,IIN0_STRIDES0) & - !$OMP& SHARED(D,R,KF_LEG,ZINP) MAP(TO:KF_LEG) + !$OMP& PRIVATE(KM,IS,J) & + !$OMP& SHARED(D,R,KF_LEG,ZINP,IIN_STRIDES0,IIN0_STRIDES0) MAP(TO:KF_LEG) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(2) PRIVATE(KM,IS,J) & From 84a14d0aa778899e3d6b02e1376319e1fea70df7 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 6 Feb 2025 17:46:32 +0200 Subject: [PATCH 16/28] Copy all derived type parameters for OpenMP --- src/trans/gpu/external/setup_trans.F90 | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index cbfa46a43..73de395f1 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -562,13 +562,13 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& !$ACC WAIT(1) #endif #ifdef OMPGPU - !$OMP TARGET ENTER DATA MAP(TO:R) - !$OMP TARGET ENTER DATA MAP(TO:F%RLAPIN,F%RACTHE,F%RW) - !$OMP TARGET ENTER DATA MAP(TO:FG%ZAA,FG%ZAS,FG%ZEPSNM) - !$OMP TARGET ENTER DATA MAP(TO:D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,& + !$OMP TARGET ENTER DATA MAP(TO:R,R%NSMAX,R%NTMAX,R%NDGL,R%NDGNH) + !$OMP TARGET ENTER DATA MAP(TO:F,F%RLAPIN,F%RACTHE,F%RW) + !$OMP TARGET ENTER DATA MAP(TO:FG,FG%ZAA,FG%ZAS,FG%ZEPSNM) + !$OMP TARGET ENTER DATA MAP(TO:D,D%NUMP,D%MYMS,D%NPNTGTB0,D%NPNTGTB1,D%NSTAGT0B,D%NSTAGT1B,D%NSTAGTF,& !$OMP& D%NPROCM,D%NPROCL,D%NPTRLS,D%MSTABF,D%NASM0,D%OFFSETS_GEMM1,& - !$OMP& D%OFFSETS_GEMM2) - !$OMP TARGET ENTER DATA MAP(TO:G%NDGLU,G%NMEN,G%NLOEN) + !$OMP& D%OFFSETS_GEMM2,D%NDGL_FS) + !$OMP TARGET ENTER DATA MAP(TO:G,G%NDGLU,G%NMEN,G%NLOEN) #endif WRITE(NOUT,*) '===GPU arrays successfully allocated' From 7d2deca511d61054e56284a19e0413fe6fdff1fc Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 6 Feb 2025 17:51:11 +0200 Subject: [PATCH 17/28] Fix handle reusing behaviour for OMPGPU --- src/trans/gpu/algor/hicblas_gemm.hip.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index f9caa0383..ffd956beb 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -199,8 +199,15 @@ template struct hipblas_gemm_grouped { void operator()(hipStream_t stream, int m, int n, int k, Real alpha, const Real *A, int lda, const Real *B, int ldb, Real beta, Real *C, int ldc) const { + // TODO: sort out this nonsense +#ifdef OMPGPU + hipblasHandle_t handle; + HICBLAS_CHECK(hipblasCreate(&handle)); +#endif +#ifdef ACCGPU hipblasHandle_t handle = get_hipblas_handle(); HICBLAS_CHECK(hipblasSetStream(handle, stream)); +#endif if constexpr (std::is_same::value) HICBLAS_CHECK(hipblasSgemm(handle, transa_, transb_, m, n, k, &alpha, A, @@ -208,6 +215,9 @@ template struct hipblas_gemm_grouped { if constexpr (std::is_same::value) HICBLAS_CHECK(hipblasDgemm(handle, transa_, transb_, m, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc)); +#ifdef OMPGPU + HICBLAS_CHECK(hipblasDestroy(handle)); +#endif } private: From 56f1e17811b913b53af5dd6b1992e56a8f933340 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 6 Feb 2025 18:31:46 +0200 Subject: [PATCH 18/28] Delete unused handles --- src/trans/gpu/algor/hicblas_gemm.hip.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/trans/gpu/algor/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/hicblas_gemm.hip.cpp index ffd956beb..98b3bed09 100644 --- a/src/trans/gpu/algor/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/hicblas_gemm.hip.cpp @@ -25,11 +25,7 @@ bool hip_alreadyAllocated_sgemm = false; bool hip_alreadyAllocated_sgemm_handle = false; -bool hip_alreadyAllocated_dsgemm = false; -bool hip_alreadyAllocated_dgemm_handle = false; - hipblasHandle_t handle_hip_sgemm; -hipblasHandle_t handle_hip_dgemm; namespace { struct cache_key { @@ -288,10 +284,6 @@ void hipblas_dgemm_wrapper(char transa, char transb, int m, int n, int k, if (transb == 'T' || transb == 't') op_t2 = HIPBLAS_OP_T; - if (!hip_alreadyAllocated_dgemm_handle) { - HICBLAS_CHECK(hipblasCreate(&handle_hip_dgemm)); - hip_alreadyAllocated_dgemm_handle = true; - } hipblasHandle_t handle = get_hipblas_handle(); HICBLAS_CHECK(hipblasSetStream(handle, *(hipStream_t *)stream)); From 212993a3685ce4eb1fcf0f6f8dc3647aa9a9caee Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 13:50:35 +0200 Subject: [PATCH 19/28] Update OMPT statements in prfi1b/vdtuv mods Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/prfi1b_mod.F90 | 8 +++++++- src/trans/gpu/internal/vdtuv_mod.F90 | 9 +++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/prfi1b_mod.F90 b/src/trans/gpu/internal/prfi1b_mod.F90 index b2220804c..daa23b820 100755 --- a/src/trans/gpu/internal/prfi1b_mod.F90 +++ b/src/trans/gpu/internal/prfi1b_mod.F90 @@ -83,7 +83,10 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) #ifdef ACCGPU !$ACC DATA PRESENT(D,D_NUMP,R,R_NSMAX,D_MYMS,D_NASM0,PIA,PSPEC) ASYNC(1) #endif - +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:D,D_NUMP,R,R_NSMAX,D_MYMS,D_NASM0,PIA,PSPEC) +#endif + IF(PRESENT(KFLDPTR)) THEN CALL ABORT_TRANS("KFLDPTR not implemented for GPU") @@ -131,6 +134,9 @@ SUBROUTINE PRFI1B(PIA,PSPEC,KFIELDS,KDIM,KFLDPTR) #ifdef ACCGPU !$ACC END DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif END ASSOCIATE diff --git a/src/trans/gpu/internal/vdtuv_mod.F90 b/src/trans/gpu/internal/vdtuv_mod.F90 index bffa7afd0..da8833376 100755 --- a/src/trans/gpu/internal/vdtuv_mod.F90 +++ b/src/trans/gpu/internal/vdtuv_mod.F90 @@ -93,6 +93,12 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) !$ACC& PRESENT(PEPSNM, PVOR, PDIV) & !$ACC& PRESENT(PU, PV) #endif +#ifdef OMPGPU +!$OMP TARGET DATA & +!$OMP& MAP(PRESENT,ALLOC:R,R_NTMAX,D,D_MYMS,D_NUMP,F,F_RLAPIN) & +!$OMP& MAP(PRESENT,ALLOC:PEPSNM, PVOR, PDIV) & +!$OMP& MAP(PRESENT,ALLOC:PU, PV) +#endif ! ------------------------------------------------------------------ @@ -154,6 +160,9 @@ SUBROUTINE VDTUV(KFIELD,PEPSNM,PVOR,PDIV,PU,PV) #ifdef ACCGPU !$ACC END DATA #endif +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif ! ------------------------------------------------------------------ END ASSOCIATE From a44e9780bd3c4e9695e3cae21e960949bdd41165 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 13:53:53 +0200 Subject: [PATCH 20/28] update ompgpu: spnsde mod Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/spnsde_mod.F90 | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/trans/gpu/internal/spnsde_mod.F90 b/src/trans/gpu/internal/spnsde_mod.F90 index af320d058..13c28663f 100755 --- a/src/trans/gpu/internal/spnsde_mod.F90 +++ b/src/trans/gpu/internal/spnsde_mod.F90 @@ -82,6 +82,11 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) ASSOCIATE(D_NUMP=>D%NUMP, R_NTMAX=>R%NTMAX, D_MYMS=>D%MYMS) +#ifdef OMPGPU +!$OMP TARGET DATA & +!$OMP& MAP(PRESENT,ALLOC:R,R_NTMAX,D,D_MYMS) & +!$OMP& MAP(PRESENT,ALLOC:D_NUMP,PEPSNM,PF,PNSD) +#endif #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT (R,R_NTMAX, D,D_MYMS) & @@ -133,6 +138,9 @@ SUBROUTINE SPNSDE(KF_SCALARS,PEPSNM,PF,PNSD) ENDDO END DO +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif #ifdef ACCGPU !$ACC END DATA #endif From 202a36725a9779f2aea57375a2ed299f198ec620 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 13:54:49 +0200 Subject: [PATCH 21/28] update ompgpu: leinv_mod (wip) Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/leinv_mod.F90 | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index dd95f89ce..7e5551203 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -171,6 +171,14 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) + +#ifdef OMPGPU + !$OMP TARGET DATA & + !$OMP& MAP(PRESENT,ALLOC:D,D_MYMS,D_NUMP) & + !$OMP& MAP(PRESENT,ALLOC:ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & + !$OMP& MAP(PRESENT,ALLOC:ZAA,ZAS,PIA) & + !$OMP& MAP(PRESENT,ALLOC:R,R_NSMAX,D_OFFSETS_GEMM2) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(D,D_MYMS,D_NUMP) & !$ACC& PRESENT(ZINP,ZOUTS,ZOUTA,ZINP0,ZOUTS0,ZOUTA0) & @@ -465,6 +473,9 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ENDIF CALL GSTATS(424,1) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif #ifdef ACCGPU !$ACC WAIT(1) From 48d066af917968450fb329fd8b9aeb60d8c4d824 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 13:57:25 +0200 Subject: [PATCH 22/28] update trmtol modules Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/trmtol_mod.F90 | 8 +------- src/trans/gpu/internal/trmtol_pack_unpack.F90 | 13 +++++++++++++ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 0a748de7b..24bd8c755 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -160,11 +160,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) FROM_RECV = IOFFR(IRANK) + 1 TO_RECV = FROM_RECV + ILENR(IRANK) - 1 #ifdef OMPGPU - !$OMP TARGET TEAMS DEFAULT(NONE) & - !$OMP& SHARED(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) & - !$OMP& MAP(TO:FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) - !$OMP PARALLEL - !$OMP WORKSHARE + !$OMP TARGET TEAMS MAP(PRESENT,ALLOC:PFBUF,PFBUF_IN) MAP(TO:FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) #endif #ifdef ACCGPU #ifdef __HIP_PLATFORM_AMD__ @@ -176,8 +172,6 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) #ifdef OMPGPU - !$OMP END WORKSHARE - !$OMP END PARALLEL !$OMP END TARGET TEAMS #endif #ifdef ACCGPU diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index 792630781..92d095a57 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -126,6 +126,10 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) & + !$OMP& MAP(PRESENT,ALLOC:ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(D,D_MYMS,D_NPNTGTB1,D_NUMP,G,G_NDGLU,R,R_NDGNH,R_NDGL) & !$ACC& PRESENT(ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,D_OFFSETS_GEMM1) @@ -178,6 +182,9 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I ENDDO ENDDO +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif #ifdef ACCGPU !$ACC WAIT(1) @@ -261,6 +268,9 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& & 1_JPIB, 1_JPIB*KF_TOTAL*D%NLENGTF*C_SIZEOF(PREEL_COMPLEX(1))) +#ifdef OMPGPU +!$OMP TARGET DATA MAP(PRESENT,ALLOC:G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(G,G_NLOEN,G_NMEN,D,D_NPNTGTB0,FOUBUF,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS) ASYNC(1) #endif @@ -308,6 +318,9 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN ENDDO ENDDO ENDDO +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif #ifdef ACCGPU !$ACC END DATA From 74ed73e856b5f5294f9b82173f4c39040fb61b8e Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 15:38:01 +0200 Subject: [PATCH 23/28] update ftinv mod Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/ftinv_mod.F90 | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 8a7e42d13..ca91c1986 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -101,6 +101,9 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) & 1_JPIB, 1_JPIB*KFIELD*D%NLENGTF*C_SIZEOF(PREEL_REAL(1))) #endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PREEL_REAL,PREEL_COMPLEX,D_NPTRLS,D_NDGL_FS,D_NSTAGTF,G_NLOEN) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(PREEL_REAL,PREEL_COMPLEX,D_NPTRLS,D_NDGL_FS,D_NSTAGTF,G_NLOEN) #endif @@ -122,6 +125,9 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) ENDIF CALL GSTATS(423,1) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif #ifdef ACCGPU !$ACC END DATA #endif From b8398c3f108ec90615ecc35c5d5240f20e2d5d27 Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 15:48:25 +0200 Subject: [PATCH 24/28] initial update to trltog mod Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/trltog_mod.F90 | 33 +++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 5 deletions(-) diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index d8cdeaf80..fd5e2db44 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -529,7 +529,13 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #endif #ifdef OMPGPU - !$OMP TARGET DATA MAP(TO:IIN_TO_SEND_BUFR) IF(KF_FS > 0) + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PGP) IF(PRESENT(PGP)) + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PGPUV) IF(PRESENT(PGPUV)) + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PGP2) IF(PRESENT(PGP2)) + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PGP3A) IF(PRESENT(PGP3A)) + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PGP3B) IF(PRESENT(PGP3B)) + + !$OMP TARGET DATA MAP(TO:IIN_TO_SEND_BUFR) MAP(PRESENT,ALLOC:PREEL_REAL) IF(KF_FS > 0) #endif #ifdef ACCGPU !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) @@ -694,6 +700,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*C_SIZEOF(ZCOMBUFS(1))) ENDIF +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZCOMBUFS) IF(ISEND_COUNTS > 0) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(ZCOMBUFS) IF(ISEND_COUNTS > 0) ASYNC(1) #endif @@ -723,6 +732,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDDO ENDDO CALL GSTATS(1605,1) +#ifdef OMPGPU + !$OMP END TARGET DATA ! ZCOMBUFS +#endif #ifdef ACCGPU !$ACC END DATA ! ZCOMBUFS @@ -827,6 +839,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ENDIF CALL GSTATS(421,1) +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZCOMBUFR) IF(IRECV_COUNTS > 0) +#endif #ifdef ACCGPU !$ACC DATA PRESENT(ZCOMBUFR) IF(IRECV_COUNTS > 0) ASYNC(1) #endif @@ -906,6 +921,9 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC WAIT(1) #endif +#ifdef OMPGPU + !$OMP END TARGET DATA ! ZCOMBUFR +#endif #ifdef ACCGPU !$ACC END DATA ! ZCOMBUFR #endif @@ -918,6 +936,15 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(440,1) ENDIF CALL GSTATS(422,0) +#ifdef OMPGPU + !$OMP END TARGET DATA ! IFLDA + !$OMP END TARGET DATA ! PREEL_REAL + !$OMP END TARGET DATA ! PGP3B + !$OMP END TARGET DATA ! PGP3A + !$OMP END TARGET DATA ! PGP2 + !$OMP END TARGET DATA ! PGPUV + !$OMP END TARGET DATA ! PGP +#endif #ifdef ACCGPU !$ACC END DATA ! IFLDA !$ACC END DATA ! PREEL_REAL @@ -926,10 +953,6 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA ! PGP2 !$ACC END DATA ! PGPUV !$ACC END DATA ! PGP -#endif -#ifdef OMPGPU - !$OMP END TARGET DATA ! IFLDA - !$OMP END TARGET DATA ! IIN_TO_SEND_BUFR #endif IF (PRESENT(PGP)) THEN #ifdef OMPGPU From e438cd6b244718d4923e55e0c34c6fdb5b46eecb Mon Sep 17 00:00:00 2001 From: Thomas Gibson Date: Fri, 7 Feb 2025 15:48:52 +0200 Subject: [PATCH 25/28] Introduce data explicit scoping to OMP directives Co-authored-by: Sam Hatfield --- src/trans/gpu/internal/fsc_mod.F90 | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index e2381aaa3..bc8608215 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -101,6 +101,10 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE IINC=-1 ENDIF +#ifdef OMPGPU +!$OMP TARGET DATA & +!$OMP& MAP(PRESENT,ALLOC:D,D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F,F_RACTHE,G,G_NMEN,G_NLOEN,R,R_NSMAX) +#endif #ifdef ACCGPU !$ACC DATA & !$ACC& PRESENT(D,D_NPTRLS,D_NSTAGTF,PREEL_COMPLEX,F,F_RACTHE,G,G_NMEN,G_NLOEN,R,R_NSMAX) @@ -305,6 +309,9 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$ACC END DATA #endif +#ifdef OMPGPU +!$OMP END TARGET DATA +#endif ! ------------------------------------------------------------------ END ASSOCIATE From d20af51b65fa30a903a07a0edae430596ba4ba92 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Fri, 7 Feb 2025 15:54:53 +0200 Subject: [PATCH 26/28] Fixup OMP directive in TRMTOL --- src/trans/gpu/internal/trmtol_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 24bd8c755..cd8b45735 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -249,7 +249,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) CALL GSTATS(1608,0) #ifdef OMPGPU !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) & - !$OMP SHARED(PFBUF,PFBUF_IN,ISTA,IEND,ILEN) MAP(TO:ISTA,ILEN) + !$OMP SHARED(PFBUF,PFBUF_IN,ISTA,IEND) MAP(TO:ISTA,IEND) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) FIRSTPRIVATE(ISTA,IEND) From b213ea098947dd962b558502fc325bd8a9d65604 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Fri, 7 Feb 2025 15:55:40 +0200 Subject: [PATCH 27/28] Remove accidental mapping of ZACHTE2 Co-authored-by: Thomas Gibson --- src/trans/gpu/internal/fsc_mod.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index bc8608215..e1fb3d488 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -122,7 +122,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) & !$OMP& PRIVATE(IGLG,IOFF_LAT,IOFF_UV,ZACHTE2) & !$OMP& SHARED(IBEG,IEND,IINC,KF_UV,R,OFFSET_VAR,G,D,KF_FS,KUV_OFFSET,F,PREEL_COMPLEX) & -!$OMP& MAP(TO:IBEG,IEND,IINC,KF_UV,OFFSET_VAR,KF_FS,KUV_OFFSET,ZACHTE2) +!$OMP& MAP(TO:IBEG,IEND,IINC,KF_UV,OFFSET_VAR,KF_FS,KUV_OFFSET) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) & From ab272a20f0eb1aa1fde1ed0018d2fe652c03db00 Mon Sep 17 00:00:00 2001 From: Sam Hatfield Date: Thu, 20 Feb 2025 18:28:07 +0000 Subject: [PATCH 28/28] Change HIP stream from 1 to 0 --- src/trans/gpu/internal/ledir_mod.F90 | 2 +- src/trans/gpu/internal/leinv_mod.F90 | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 4b7f78a86..b4b235338 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -169,7 +169,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) HIP_STREAM = INT(ACC_GET_HIP_STREAM(1_C_INT), C_LONG) #endif #ifdef OMPGPU - HIP_STREAM = 1_C_LONG + HIP_STREAM = 0_C_LONG #endif CALL LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_STRIDES1,IIN_STRIDES0,IIN_STRIDES1,& diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 7e5551203..8b0557092 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -158,7 +158,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) HIP_STREAM = INT(ACC_GET_HIP_STREAM(1_C_INT), C_LONG) #endif #ifdef OMPGPU - HIP_STREAM = 1_C_LONG + HIP_STREAM = 0_C_LONG #endif ! ------------------------------------------------------------------