Skip to content

Commit

Permalink
Merge pull request #13 from lukasm91/improve-noncontiguous
Browse files Browse the repository at this point in the history
Improve noncontiguous data transfers
  • Loading branch information
pmarguinaud authored Jan 22, 2024
2 parents 27c44be + ca16c19 commit 521069a
Show file tree
Hide file tree
Showing 6 changed files with 441 additions and 22 deletions.
1 change: 1 addition & 0 deletions LICENSE
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@
END OF TERMS AND CONDITIONS

Copyright 2018- ECMWF
Copyright 2023- NVIDIA

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
Expand Down
174 changes: 163 additions & 11 deletions field_RANKSUFF_data_module.fypp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#! (C) Copyright 2022- ECMWF.
#! (C) Copyright 2022- Meteo-France.
#! (C) Copyright 2023- NVIDIA
#!
#! This software is licensed under the terms of the Apache Licence Version 2.0
#! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
Expand All @@ -16,11 +17,61 @@ ${fieldType.useParkind1 ()}$

IMPLICIT NONE

PRIVATE

#:for ft in fieldTypeList
#:set ftn = ft.name

PUBLIC :: ${ftn}$_COPY

#:endfor

CONTAINS

#:for ft in fieldTypeList
#:set ftn = ft.name

SUBROUTINE ${ftn}$_COPY (HST, DEV, MAP_DEVPTR, KDIR, QUEUE)

USE FIELD_ABORT_MODULE

${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$)
LOGICAL, INTENT (IN) :: MAP_DEVPTR
INTEGER (KIND=JPIM), INTENT (IN) :: KDIR
INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE
INTEGER :: LAST_CONTIG_DIM
INTEGER :: NEXT_CONTIG_DIM

LAST_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, 0)
NEXT_CONTIG_DIM = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (HST, LAST_CONTIG_DIM+1)

SELECT CASE (LAST_CONTIG_DIM)
#:if defined('CUDA')
#:for d1 in range (ft.rank)
CASE (${d1}$)
SELECT CASE (NEXT_CONTIG_DIM)
#:for d2 in range (d1+1, ft.rank+1)
CASE (${d2}$)
CALL ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE)
#:endfor
CASE DEFAULT
CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED NEXT_CONTIG_DIM')
END SELECT
#:endfor
CASE (${ft.rank}$)
CALL ${ftn}$_COPY_DIM${ft.rank}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE)
#:else
#:for d in range (ft.rank + 1)
CASE (${d}$)
CALL ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE)
#:endfor
#:endif
CASE DEFAULT
CALL FIELD_ABORT ('INTERNAL ERROR: UNEXPECTED LAST_CONTIG_DIM')
END SELECT

END SUBROUTINE

#:for d in range (0, ft.rank+1)
SUBROUTINE ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE)
#ifdef _OPENACC
Expand Down Expand Up @@ -85,22 +136,111 @@ CONTAINS

#:endfor

#:if defined('CUDA')
#:for d1 in range (0, ft.rank)
#:for d2 in range (d1+1, ft.rank+1)
SUBROUTINE ${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS (HST, DEV, MAP_DEVPTR, KDIR, QUEUE)
USE OPENACC
USE CUDAFOR
USE FIELD_ABORT_MODULE

${ft.type}$, POINTER :: HST (${ft.shape}$), DEV (${ft.shape}$)
INTEGER (KIND=JPIM), INTENT (IN) :: KDIR
LOGICAL, INTENT (IN) :: MAP_DEVPTR
INTEGER (KIND=JPIM), OPTIONAL, INTENT (IN) :: QUEUE
INTEGER (KIND=JPIM) :: IHST_PITCH, IDEV_PITCH, IRET
INTEGER (KIND=JPIM) :: IWIDTH, IHEIGHT, ISHP(${ft.rank+1}$)
#:if d2 < ft.rank
INTEGER :: ${', '.join (list (map (lambda i: 'J' + str (i+1), range (d2, ft.rank))))}$
#:endif
INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM
TYPE(C_PTR) :: HSTPTR
TYPE(C_DEVPTR) :: DEVPTR

ISHP(1) = 1
ISHP(2:) = SHAPE(HST)
IWIDTH = PRODUCT(ISHP(1:${d1+1}$)) * KIND(HST)
IHEIGHT = PRODUCT(ISHP(${d1+2}$:${d2+1}$))

#:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})'
#:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)]
#:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, ft.rank))
#:set next_slice = lambda arr: ', '.join(lbnds(arr, 0, d1) + [lb(arr, d1)+'+1'] + lbnds(arr, d1+1, ft.rank))
IHST_PITCH = LOC (HST(${next_slice('HST')}$)) - LOC (HST (${this_slice('HST')}$))
IDEV_PITCH = LOC (DEV(${next_slice('DEV')}$)) - LOC (DEV (${this_slice('DEV')}$))

#:for e in range (ft.rank, d2, -1)
${' ' * (ft.rank - e)}$DO J${e}$ = LBOUND (HST, ${e}$), UBOUND (HST, ${e}$)
#:endfor
#:set indent = ' ' * (ft.rank - d2 - 1)
#:set ar = lambda arr: ', '.join(lbnds(arr, 0, d2) + [f'J{i+1}' for i in range(d2, ft.rank)])
${indent}$ HSTPTR = C_LOC(HST (${ar('HST')}$))
${indent}$ IF (MAP_DEVPTR) THEN
${indent}$ !$acc host_data use_device(DEV)
${indent}$ DEVPTR = C_DEVLOC(DEV (${ar('DEV')}$))
${indent}$ !$acc end host_data
${indent}$ ELSE
${indent}$ !$acc data deviceptr(DEVPTR,DEV)
${indent}$ DEVPTR = C_DEVLOC(DEV (${ar('DEV')}$))
${indent}$ !$acc end data
${indent}$ ENDIF
${indent}$ IF (KDIR == NH2D) THEN
${indent}$ IF(PRESENT(QUEUE)) THEN
${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM)
${indent}$ IRET = CUDAMEMCPY2DASYNC (DEVPTR, IDEV_PITCH, &
${indent}$ & HSTPTR, IHST_PITCH, &
${indent}$ & IWIDTH, IHEIGHT, &
${indent}$ & STREAM=STREAM)
${indent}$ ELSE
${indent}$ IRET = CUDAMEMCPY2D (DEVPTR, IDEV_PITCH, &
${indent}$ & HSTPTR, IHST_PITCH, &
${indent}$ & IWIDTH, IHEIGHT)
${indent}$ ENDIF
${indent}$ IF (IRET /= CUDASUCCESS) THEN
${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: HOST-TO-DEVICE TRANSFER FAILED")
${indent}$ ENDIF
${indent}$ ELSEIF (KDIR == ND2H) THEN
${indent}$ IF(PRESENT(QUEUE)) THEN
${indent}$ CALL ACC_SET_CUDA_STREAM(QUEUE, STREAM)
${indent}$ IRET = CUDAMEMCPY2DASYNC (HSTPTR, IHST_PITCH, &
${indent}$ & DEVPTR, IDEV_PITCH, &
${indent}$ & IWIDTH, IHEIGHT, &
${indent}$ & STREAM=STREAM)
${indent}$ ELSE
${indent}$ IRET = CUDAMEMCPY2D (HSTPTR, IHST_PITCH, &
${indent}$ & DEVPTR, IDEV_PITCH, &
${indent}$ & IWIDTH, IHEIGHT)
${indent}$ ENDIF
${indent}$ IF (IRET /= CUDASUCCESS) THEN
${indent}$ CALL FIELD_ABORT ("${ftn}$_COPY_2D_DIM${d1}$_${d2}$_CONTIGUOUS: DEVICE-TO-HOST TRANSFER FAILED")
${indent}$ ENDIF
${indent}$ ENDIF
#:for e in range (d2, ft.rank)
${' ' * (ft.rank - e - 1)}$ENDDO
#:endfor
END SUBROUTINE
#:endfor
#:endfor
#:endif

#:endfor

#:for ft in fieldTypeList
#:set ftn = ft.name
INTEGER (KIND=JPIM) FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (PTR) RESULT (JDIM)

INTEGER (KIND=JPIM) FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (PTR, AFTER) RESULT (JDIM)
${ft.type}$, POINTER :: PTR (${ft.shape}$)
INTEGER*8 :: ISTRIDE (${ft.rank}$)
INTEGER (KIND=JPIM) :: AFTER
INTEGER*8 :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE
INTEGER (KIND=JPIM) :: J, LB(${ft.rank}$)

! assume that dimension all dimensions before AFTER are contiguous...

LB = LBOUND(PTR)
ISTRIDE (1) = KIND (PTR)
DO J = 2, ${ft.rank}$
ISTRIDE (J) = ISTRIDE (J-1) * SIZE (PTR, J-1)
ENDDO
IF (AFTER == 0) THEN
IPREVIOUS_STRIDE = KIND (PTR)
ENDIF

JDIM = 0
#:for d in range (1, ft.rank+1)
#:set ind0 = ""
#:set ind1 = ""
Expand All @@ -110,14 +250,26 @@ CONTAINS
#:endfor
#:set ind0 = ind0[:-2]
#:set ind1 = ind1[:-2]
IF (LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) /= ISTRIDE (${d}$)) THEN
RETURN
ITHIS_STRIDE = LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$))
IF (AFTER < ${d}$) THEN
#:if d == 1
ISIZE = 1
#:else
ISIZE = SIZE(PTR, ${d-1}$)
#:endif
IF (SIZE(PTR, ${d}$) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN
JDIM = ${d-1}$
RETURN
ENDIF
IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE
ELSE IF (AFTER == ${d}$) THEN
IPREVIOUS_STRIDE = ITHIS_STRIDE
ENDIF

JDIM = ${d}$

#:endfor
JDIM = ${ft.rank}$
END FUNCTION ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION

#:endfor

END MODULE FIELD_${RANK}$${SUFF}$_DATA_MODULE
12 changes: 2 additions & 10 deletions field_RANKSUFF_module.fypp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#! (C) Copyright 2022- ECMWF.
#! (C) Copyright 2022- Meteo-France.
#! (C) Copyright 2023- NVIDIA
#!
#! This software is licensed under the terms of the Apache Licence Version 2.0
#! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
Expand Down Expand Up @@ -221,8 +222,6 @@ CONTAINS
! By default we allocate thread-local temporaries
SELF%THREAD_BUFFER = .TRUE.

SELF%LAST_CONTIGUOUS_DIMENSION = ${ft.rank}$

IF (PRESENT(PERSISTENT)) THEN
IF (PERSISTENT) THEN
SELF%THREAD_BUFFER = .FALSE.
Expand Down Expand Up @@ -378,15 +377,8 @@ CONTAINS
INTEGER (KIND=JPIM), OPTIONAL, INTENT(IN) :: QUEUE
REAL :: START, FINISH

SELF%LAST_CONTIGUOUS_DIMENSION = ${ftn}$_GET_LAST_CONTIGUOUS_DIMENSION (SELF%PTR)

CALL CPU_TIME(START)
SELECT CASE (SELF%LAST_CONTIGUOUS_DIMENSION)
#:for d in range (ft.rank + 1)
CASE (${d}$)
CALL ${ftn}$_COPY_DIM${d}$_CONTIGUOUS (SELF%PTR, SELF%DEVPTR, SELF%MAP_DEVPTR, KDIR, QUEUE)
#:endfor
END SELECT
CALL ${ftn}$_COPY (SELF%PTR, SELF%DEVPTR, SELF%MAP_DEVPTR, KDIR, QUEUE)
CALL CPU_TIME(FINISH)
IF (KDIR == NH2D) THEN
CALL SELF%STATS%INC_CPU_TO_GPU_TRANSFER(START, FINISH)
Expand Down
1 change: 0 additions & 1 deletion field_basic_module.F90
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ MODULE FIELD_BASIC_MODULE
LOGICAL :: THREAD_BUFFER = .FALSE.

INTEGER(KIND=JPIM) :: ISTATUS = 0
INTEGER(KIND=JPIM) :: LAST_CONTIGUOUS_DIMENSION = 0

TYPE(GPU_STATS) :: STATS

Expand Down
1 change: 1 addition & 0 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ list(APPEND TEST_FILES
init_wrapper_gpu.F90
init_wrapper_lbounds.F90
init_wrapper_non_contiguous.F90
init_wrapper_non_contiguous_multi.F90
no_transfer_get_device.F90
no_transfer_get_host.F90
pointer_to_owner_wrapper.F90
Expand Down
Loading

0 comments on commit 521069a

Please sign in to comment.