[ROCM] update fluid platform for rocm35 (part1), test=develop (#30639)

* [ROCM] update fluid platform for rocm35 (part1), test=develop * address review comments, test=develop
PaddlePaddle · Jan 28, 2021 · f89da4a · f89da4a
1 parent fc00240
commit f89da4a
Show file tree

Hide file tree

Showing 19 changed files with 626 additions and 152 deletions.
diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h
@@ -47,7 +47,17 @@ struct PADDLE_ALIGN(2) bfloat16 {
   ~bfloat16() = default;
 
   HOSTDEVICE inline explicit bfloat16(float val) {
+#ifdef PADDLE_WITH_HIP
+    uint32_t res = 0;
+    uint32_t* tempRes;
+    // We should be using memcpy in order to respect the strict aliasing rule
+    // but it fails in the HIP environment.
+    tempRes = reinterpret_cast<uint32_t*>(&val);
+    res = *tempRes;
+    x = res >> 16;
+#else
     std::memcpy(&x, reinterpret_cast<char*>(&val) + 2, 2);
+#endif
   }
 
   template <class T>

diff --git a/paddle/fluid/platform/complex128.h b/paddle/fluid/platform/complex128.h
@@ -28,6 +28,11 @@
 #include <thrust/complex.h>
 #endif  // PADDLE_WITH_CUDA
 
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_complex.h>
+#include <thrust/complex.h>  // NOLINT
+#endif
+
 #include <cstring>
 
 #include "paddle/fluid/platform/hostdevice.h"
@@ -54,7 +59,7 @@ struct PADDLE_ALIGN(16) complex128 {
   ~complex128() = default;
 
   HOSTDEVICE complex128(double real, double imag) : real(real), imag(imag) {}
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
   HOSTDEVICE inline explicit complex128(const thrust::complex<double>& c) {
     real = c.real();
@@ -65,9 +70,15 @@ struct PADDLE_ALIGN(16) complex128 {
     return thrust::complex<double>(real, imag);
   }
 
+#ifdef PADDLE_WITH_HIP
+  HOSTDEVICE inline explicit operator hipDoubleComplex() const {
+    return make_hipDoubleComplex(real, imag);
+  }
+#else
   HOSTDEVICE inline explicit operator cuDoubleComplex() const {
     return make_cuDoubleComplex(real, imag);
   }
+#endif
 #endif
 
   HOSTDEVICE complex128(const float& val)
@@ -202,7 +213,7 @@ struct PADDLE_ALIGN(16) complex128 {
 
 HOSTDEVICE inline complex128 operator+(const complex128& a,
                                        const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::complex<double>(a.real, a.imag) +
                     thrust::complex<double>(b.real, b.imag));
 #else
@@ -212,7 +223,7 @@ HOSTDEVICE inline complex128 operator+(const complex128& a,
 
 HOSTDEVICE inline complex128 operator-(const complex128& a,
                                        const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::complex<double>(a.real, a.imag) -
                     thrust::complex<double>(b.real, b.imag));
 #else
@@ -222,7 +233,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a,
 
 HOSTDEVICE inline complex128 operator*(const complex128& a,
                                        const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::complex<double>(a.real, a.imag) *
                     thrust::complex<double>(b.real, b.imag));
 #else
@@ -233,7 +244,7 @@ HOSTDEVICE inline complex128 operator*(const complex128& a,
 
 HOSTDEVICE inline complex128 operator/(const complex128& a,
                                        const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::complex<double>(a.real, a.imag) /
                     thrust::complex<double>(b.real, b.imag));
 #else
@@ -244,7 +255,7 @@ HOSTDEVICE inline complex128 operator/(const complex128& a,
 }
 
 HOSTDEVICE inline complex128 operator-(const complex128& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(-thrust::complex<double>(a.real, a.imag));
 #else
   complex128 res;
@@ -256,7 +267,7 @@ HOSTDEVICE inline complex128 operator-(const complex128& a) {
 
 HOSTDEVICE inline complex128& operator+=(complex128& a,  // NOLINT
                                          const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex128(thrust::complex<double>(a.real, a.imag) +=
                  thrust::complex<double>(b.real, b.imag));
   return a;
@@ -269,7 +280,7 @@ HOSTDEVICE inline complex128& operator+=(complex128& a,  // NOLINT
 
 HOSTDEVICE inline complex128& operator-=(complex128& a,  // NOLINT
                                          const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex128(thrust::complex<double>(a.real, a.imag) -=
                  thrust::complex<double>(b.real, b.imag));
   return a;
@@ -282,7 +293,7 @@ HOSTDEVICE inline complex128& operator-=(complex128& a,  // NOLINT
 
 HOSTDEVICE inline complex128& operator*=(complex128& a,  // NOLINT
                                          const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex128(thrust::complex<double>(a.real, a.imag) *=
                  thrust::complex<double>(b.real, b.imag));
   return a;
@@ -295,7 +306,7 @@ HOSTDEVICE inline complex128& operator*=(complex128& a,  // NOLINT
 
 HOSTDEVICE inline complex128& operator/=(complex128& a,  // NOLINT
                                          const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex128(thrust::complex<double>(a.real, a.imag) /=
                  thrust::complex<double>(b.real, b.imag));
   return a;
@@ -339,6 +350,7 @@ HOSTDEVICE inline bool operator>=(const complex128& a, const complex128& b) {
 
 HOSTDEVICE inline bool(isnan)(const complex128& a) {
 #if defined(__CUDA_ARCH__)
+  // __isnanf not supported on HIP platform
   return __isnan(a.real) || __isnan(a.imag);
 #else
   return std::isnan(a.real) || std::isnan(a.imag);
@@ -347,6 +359,7 @@ HOSTDEVICE inline bool(isnan)(const complex128& a) {
 
 HOSTDEVICE inline bool(isinf)(const complex128& a) {
 #if defined(__CUDA_ARCH__)
+  // __isinf not supported on HIP platform
   return __isinf(a.real) || __isinf(a.imag);
 #else
   return std::isinf(a.real) || std::isinf(a.imag);
@@ -358,15 +371,15 @@ HOSTDEVICE inline bool(isfinite)(const complex128& a) {
 }
 
 HOSTDEVICE inline double(abs)(const complex128& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return thrust::abs(thrust::complex<double>(a.real, a.imag));
 #else
   return std::abs(std::complex<double>(a.real, a.imag));
 #endif
 }
 
 HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::pow(thrust::complex<double>(a.real, a.imag),
                                 thrust::complex<double>(b.real, b.imag)));
 #else
@@ -375,23 +388,23 @@ HOSTDEVICE inline complex128(pow)(const complex128& a, const complex128& b) {
 }
 
 HOSTDEVICE inline complex128(sqrt)(const complex128& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::sqrt(thrust::complex<double>(a.real, a.imag)));
 #else
   return std::sqrt(std::complex<double>(a));
 #endif
 }
 
 HOSTDEVICE inline complex128(tanh)(const complex128& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::tanh(thrust::complex<double>(a.real, a.imag)));
 #else
   return std::tanh(std::complex<double>(a));
 #endif
 }
 
 HOSTDEVICE inline complex128(log)(const complex128& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex128(thrust::log(thrust::complex<double>(a.real, a.imag)));
 #else
   return complex128(std::log(std::complex<double>(a)));

diff --git a/paddle/fluid/platform/complex64.h b/paddle/fluid/platform/complex64.h
@@ -27,6 +27,11 @@
 #include <thrust/complex.h>
 #endif  // PADDLE_WITH_CUDA
 
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_complex.h>
+#include <thrust/complex.h>  // NOLINT
+#endif
+
 #include <cstring>
 
 #include "paddle/fluid/platform/complex128.h"
@@ -54,7 +59,7 @@ struct PADDLE_ALIGN(8) complex64 {
   ~complex64() = default;
 
   HOSTDEVICE complex64(float real, float imag) : real(real), imag(imag) {}
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 
   HOSTDEVICE inline explicit complex64(const thrust::complex<float>& c) {
     real = c.real();
@@ -65,9 +70,15 @@ struct PADDLE_ALIGN(8) complex64 {
     return thrust::complex<float>(real, imag);
   }
 
+#ifdef PADDLE_WITH_HIP
+  HOSTDEVICE inline explicit operator hipFloatComplex() const {
+    return make_hipFloatComplex(real, imag);
+  }
+#else
   HOSTDEVICE inline explicit operator cuFloatComplex() const {
     return make_cuFloatComplex(real, imag);
   }
+#endif
 #endif
 
   HOSTDEVICE complex64(const float& val) : real(val), imag(0) {}
@@ -207,7 +218,7 @@ struct PADDLE_ALIGN(8) complex64 {
 };
 
 HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::complex<float>(a.real, a.imag) +
                    thrust::complex<float>(b.real, b.imag));
 #else
@@ -216,7 +227,7 @@ HOSTDEVICE inline complex64 operator+(const complex64& a, const complex64& b) {
 }
 
 HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::complex<float>(a.real, a.imag) -
                    thrust::complex<float>(b.real, b.imag));
 #else
@@ -225,7 +236,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a, const complex64& b) {
 }
 
 HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::complex<float>(a.real, a.imag) *
                    thrust::complex<float>(b.real, b.imag));
 #else
@@ -235,7 +246,7 @@ HOSTDEVICE inline complex64 operator*(const complex64& a, const complex64& b) {
 }
 
 HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::complex<float>(a.real, a.imag) /
                    thrust::complex<float>(b.real, b.imag));
 #else
@@ -246,7 +257,7 @@ HOSTDEVICE inline complex64 operator/(const complex64& a, const complex64& b) {
 }
 
 HOSTDEVICE inline complex64 operator-(const complex64& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(-thrust::complex<float>(a.real, a.imag));
 #else
   complex64 res;
@@ -258,7 +269,7 @@ HOSTDEVICE inline complex64 operator-(const complex64& a) {
 
 HOSTDEVICE inline complex64& operator+=(complex64& a,  // NOLINT
                                         const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex64(thrust::complex<float>(a.real, a.imag) +=
                 thrust::complex<float>(b.real, b.imag));
   return a;
@@ -271,7 +282,7 @@ HOSTDEVICE inline complex64& operator+=(complex64& a,  // NOLINT
 
 HOSTDEVICE inline complex64& operator-=(complex64& a,  // NOLINT
                                         const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex64(thrust::complex<float>(a.real, a.imag) -=
                 thrust::complex<float>(b.real, b.imag));
   return a;
@@ -284,7 +295,7 @@ HOSTDEVICE inline complex64& operator-=(complex64& a,  // NOLINT
 
 HOSTDEVICE inline complex64& operator*=(complex64& a,  // NOLINT
                                         const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex64(thrust::complex<float>(a.real, a.imag) *=
                 thrust::complex<float>(b.real, b.imag));
   return a;
@@ -297,7 +308,7 @@ HOSTDEVICE inline complex64& operator*=(complex64& a,  // NOLINT
 
 HOSTDEVICE inline complex64& operator/=(complex64& a,  // NOLINT
                                         const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   a = complex64(thrust::complex<float>(a.real, a.imag) /=
                 thrust::complex<float>(b.real, b.imag));
   return a;
@@ -341,6 +352,7 @@ HOSTDEVICE inline bool operator>=(const complex64& a, const complex64& b) {
 
 HOSTDEVICE inline bool(isnan)(const complex64& a) {
 #if defined(__CUDA_ARCH__)
+  // __isnanf not supported on HIP platform
   return __isnanf(a.real) || __isnanf(a.imag);
 #else
   return std::isnan(a.real) || std::isnan(a.imag);
@@ -349,6 +361,7 @@ HOSTDEVICE inline bool(isnan)(const complex64& a) {
 
 HOSTDEVICE inline bool(isinf)(const complex64& a) {
 #if defined(__CUDA_ARCH__)
+  // __isinff not supported on HIP platform
   return __isinff(a.real) || __isinff(a.imag);
 #else
   return std::isinf(a.real) || std::isinf(a.imag);
@@ -360,15 +373,15 @@ HOSTDEVICE inline bool(isfinite)(const complex64& a) {
 }
 
 HOSTDEVICE inline float(abs)(const complex64& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::abs(thrust::complex<float>(a.real, a.imag)));
 #else
   return std::abs(std::complex<float>(a.real, a.imag));
 #endif
 }
 
 HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::pow(thrust::complex<float>(a.real, a.imag),
                                thrust::complex<float>(b.real, b.imag)));
 #else
@@ -377,23 +390,23 @@ HOSTDEVICE inline complex64(pow)(const complex64& a, const complex64& b) {
 }
 
 HOSTDEVICE inline complex64(sqrt)(const complex64& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::sqrt(thrust::complex<float>(a.real, a.imag)));
 #else
   return std::sqrt(std::complex<float>(a));
 #endif
 }
 
 HOSTDEVICE inline complex64(tanh)(const complex64& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::tanh(thrust::complex<float>(a.real, a.imag)));
 #else
   return std::tanh(std::complex<float>(a));
 #endif
 }
 
 HOSTDEVICE inline complex64(log)(const complex64& a) {
-#if defined(__CUDA_ARCH__)
+#if defined(__CUDA_ARCH__) || defined(__HIPCC__)
   return complex64(thrust::log(thrust::complex<float>(a.real, a.imag)));
 #else
   return std::log(std::complex<float>(a));

diff --git a/paddle/fluid/platform/details/cuda_transform_iterator_cast.h b/paddle/fluid/platform/details/cuda_transform_iterator_cast.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#ifndef __NVCC__
+#if !defined(__NVCC__) && !defined(__HIPCC__)
 #error device_ptr_cast must be include by .cu file
 #endif