Merged multicoreware/hcc/master into shfl

ROCm · Apr 6, 2016 · 862e135 · 862e135
2 parents b76c262 + d220f47
commit 862e135
Show file tree

Hide file tree

Showing 19 changed files with 1,018 additions and 230 deletions.
diff --git a/Bolt/test/amp/CMakeLists.txt b/Bolt/test/amp/CMakeLists.txt
@@ -117,13 +117,13 @@ add_subdirectory( StableSortTest )
 add_subdirectory( StableSortByKeyTest )
 
 # passed on SPIR path. crashed on HSA after passing some tests
-add_subdirectory( TransformTest )   
+#add_subdirectory( TransformTest )   
 
 # passed on SPIR path. failed some tests on HSA
-add_subdirectory( TransformReduceTest )
+#add_subdirectory( TransformReduceTest )
 
 # compile OK, failed some tests on SPIR and HSA
-add_subdirectory( TransformScanTest )
+#add_subdirectory( TransformScanTest )
 
 # compile OK, crashed on HSA after passing some tests
 add_subdirectory( ScatterTest )

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,9 +1,17 @@
 cmake_minimum_required( VERSION 2.8 )
 project (HCC)
 
+option(HSA_USE_AMDGPU_BACKEND "Use AMDGPU LLVM backend as compiler for HSA" OFF)
+
+
+
 # set default installation path
 if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND CMAKE_INSTALL_PREFIX MATCHES "/usr/local")
-  set(CMAKE_INSTALL_PREFIX "/opt/hcc" CACHE PATH "Default installation path of hcc" FORCE)
+  # The default prefix depends on which HSA code generator was selected:
+  # the AMDGPU backend installs to hcc, any other configuration to hcc-hlc.
+  if (NOT HSA_USE_AMDGPU_BACKEND)
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm/hcc-hlc" CACHE PATH "Default installation path of hcc" FORCE)
+  else()
+    set(CMAKE_INSTALL_PREFIX "/opt/rocm/hcc" CACHE PATH "Default installation path of hcc" FORCE)
+  endif()
 endif ()
 MESSAGE("Package installation path: ${CMAKE_INSTALL_PREFIX}")
 
@@ -17,7 +25,7 @@ ENDIF(NOT CMAKE_BUILD_TYPE)
 
 # set default cppamp-ng URL
 IF (NOT CLANG_URL)
-  SET(CLANG_URL "https://bitbucket.org/multicoreware/hcc-clang.git" CACHE STRING "CLANG URL" FORCE)
+  SET(CLANG_URL "https://github.com/RadeonOpenCompute/hcc-clang.git" CACHE STRING "CLANG URL" FORCE)
 ENDIF (NOT CLANG_URL)
 set(CMAKE_MACOSX_RPATH 1)
 
@@ -261,11 +269,16 @@ endif ((HAS_OPENCL EQUAL 0) AND (HAS_HSA EQUAL 0))
 #################
 # Detect AMDGPU backend for native codegen
 #################
-option(HSA_USE_AMDGPU_BACKEND "Use AMDGPU LLVM backend as compiler for HSA" OFF)
 
 if (HSA_USE_AMDGPU_BACKEND)
   add_definitions(-DHSA_USE_AMDGPU_BACKEND)
   set(HAS_HSA_HOF 0)
+
+  # Locate the amdphdrs tool.  The first lookup is restricted to /opt/rocm/bin
+  # and the optional AMDPHDRS_DIR; the second one only runs when the first
+  # found nothing and falls back to the default search locations.
+  find_program(AMDPHDRS amdphdrs PATHS /opt/rocm/bin ${AMDPHDRS_DIR} NO_DEFAULT_PATH)
+  find_program(AMDPHDRS amdphdrs)
+  message("AMDPHDRS_DIR = ${AMDPHDRS_DIR}, actually found at: ${AMDPHDRS}")
+
+  # A missing tool is only reported here; configuration continues.
+  if (NOT AMDPHDRS)
+    message("amdphdrs not found.  Use -DAMDPHDRS_DIR=<amdphdrs path>.")
+  endif()
 endif ()
 
 set(HSA_AMDGPU_GPU_TARGET "kaveri" CACHE STRING "Target GPU device (kaveri,carrizo,fiji)")
@@ -493,16 +506,30 @@ if(CXXAMP_ENABLE_BOLT)
 endif(CXXAMP_ENABLE_BOLT)
 
 set(CPACK_SET_DESTDIR TRUE)
-set(CPACK_INSTALL_PREFIX "/opt/hcc")
-set(CPACK_PACKAGE_NAME "hcc")
-set(CPACK_PACKAGE_VENDOR "MulticoreWare, Inc")
+set(CPACK_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})
+
+# The package is named after the HSA code generator it was configured for:
+# "hcc" with the AMDGPU backend, "hcc_hlc" otherwise.
+set(CPACK_PACKAGE_NAME "hcc_hlc")
+if (HSA_USE_AMDGPU_BACKEND)
+  set(CPACK_PACKAGE_NAME "hcc")
+endif()
+
+set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc")
 set(CPACK_PACKAGE_VERSION ${KALMAR_VERSION_STRING})
 set(CPACK_PACKAGE_VERSION_MAJOR ${KALMAR_VERSION_MAJOR})
 set(CPACK_PACKAGE_VERSION_MINOR ${KALMAR_VERSION_MINOR})
 set(CPACK_PACKAGE_VERSION_PATCH ${KALMAR_VERSION_PATCH})
 set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CMAKE_SYSTEM_NAME})
 set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "HCC: a Heterogeneous C++ to OpenCL/HSA compiler")
-set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Jack Chung <jack@multicorewareinc.com>")
+set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Siu Chi Chan <siuchi.chan@amd.com>")
+
+# Dependencies shared by every hcc .deb; the AMDGPU backend appends two more.
+set(HCC_DEBIAN_DEP "hsa-runtime-dev (>=1.0.0), libstdc++-4.8-dev, llvm, llvm-dev, llvm-runtime, libc++1, libc++-dev, libc++abi1, libc++abi-dev")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "${HCC_DEBIAN_DEP}")
+if (HSA_USE_AMDGPU_BACKEND)
+  set(CPACK_DEBIAN_PACKAGE_DEPENDS "${CPACK_DEBIAN_PACKAGE_DEPENDS}, amdphdrs (>=1.0), llvm-amdgpu (>=3.9)")
+endif()
+
 set(CPACK_GENERATOR "DEB;TGZ")
 set(CPACK_SOURCE_GENERATOR "TGZ")
 set(CPACK_BINARY_DEB "ON")

diff --git a/HC/WrapperGen/WrapperGen.cpp b/HC/WrapperGen/WrapperGen.cpp
@@ -475,12 +475,12 @@ struct StringFinder
           out << "}\n";
 
           out << "void operator()(tiled_index<3>& i) __attribute((hc))\n{\n";
-          out << "_lp.groupId.x = i.tile[0];\n";
+          out << "_lp.groupId.x = i.tile[2];\n";
           out << "_lp.groupId.y = i.tile[1];\n";
-          out << "_lp.groupId.z = i.tile[2];\n";
-          out << "_lp.threadId.x = i.local[0];\n";
+          out << "_lp.groupId.z = i.tile[0];\n";
+          out << "_lp.threadId.x = i.local[2];\n";
           out << "_lp.threadId.y = i.local[1];\n";
-          out << "_lp.threadId.z = i.local[2];\n";
+          out << "_lp.threadId.z = i.local[0];\n";
           out << func->getFunctionName() << "(";
           func->printArgsAsArguments(out);
           out << ");\n}\n";
@@ -493,7 +493,7 @@ struct StringFinder
           out << "void " << func->getWrapperName() << "(";
           func->printArgsAsParameters(out);
           out << ")\n{\n";
-          out << "completion_future cf = parallel_for_each(*(_lp.av),extent<3>(_lp.gridDim.x*_lp.groupDim.x,_lp.gridDim.y*_lp.groupDim.y,_lp.gridDim.z*_lp.groupDim.z).tile(_lp.groupDim.x, _lp.groupDim.y, _lp.groupDim.z), \n"
+          out << "completion_future cf = parallel_for_each(*(_lp.av),extent<3>(_lp.gridDim.z*_lp.groupDim.z,_lp.gridDim.y*_lp.groupDim.y,_lp.gridDim.x*_lp.groupDim.x).tile(_lp.groupDim.z, _lp.groupDim.y, _lp.groupDim.x), \n"
               << func->getFunctorName()
               << "(";
           func->printArgsAsArguments(out);

diff --git a/include/grid_launch.h b/include/grid_launch.h
@@ -20,6 +20,48 @@ typedef struct grid_launch_parm
   // use acc_view for PFE in WrapperGen
   hc::accelerator_view  *av;
   hc::completion_future *cf;
+
+  grid_launch_parm() = default;
+
+  // Customized serialization: appends only the scalar launch fields below to the serializer; av and cf are not needed in the kernel.
+  __attribute__((annotate("serialize")))
+  void __cxxamp_serialize(Kalmar::Serialize& s) const {
+    s.Append(sizeof(int), &gridDim.x);  // NOTE(review): assumes every dim/id component is int-sized — confirm against the struct's field types
+    s.Append(sizeof(int), &gridDim.y);
+    s.Append(sizeof(int), &gridDim.z);
+    s.Append(sizeof(int), &groupDim.x);
+    s.Append(sizeof(int), &groupDim.y);
+    s.Append(sizeof(int), &groupDim.z);
+    s.Append(sizeof(int), &groupId.x);
+    s.Append(sizeof(int), &groupId.y);
+    s.Append(sizeof(int), &groupId.z);
+    s.Append(sizeof(int), &threadId.x);
+    s.Append(sizeof(int), &threadId.y);
+    s.Append(sizeof(int), &threadId.z);
+    s.Append(sizeof(unsigned), &groupMemBytes);  // same field order as the user_deserialize constructor's parameters — presumably the two must stay in sync; verify
+  }
+
+  __attribute__((annotate("user_deserialize")))  // Rebuilds the launch parameters from thirteen scalar values.
+  grid_launch_parm(int gridDim_x,  int gridDim_y,  int gridDim_z,
+                   int groupDim_x, int groupDim_y, int groupDim_z,
+                   int groupId_x,  int groupId_y,  int groupId_z,
+                   int threadId_x, int threadId_y, int threadId_z,
+                   unsigned groupMemBytes_) {  // parameter order mirrors the Append() calls in __cxxamp_serialize
+    gridDim.x  = gridDim_x;
+    gridDim.y  = gridDim_y;
+    gridDim.z  = gridDim_z;
+    groupDim.x = groupDim_x;
+    groupDim.y = groupDim_y;
+    groupDim.z = groupDim_z;
+    groupId.x  = groupId_x;
+    groupId.y  = groupId_y;
+    groupId.z  = groupId_z;
+    threadId.x = threadId_x;
+    threadId.y = threadId_y;
+    threadId.z = threadId_z;
+    groupMemBytes = groupMemBytes_;
+  }  // av and cf are not assigned by this constructor
+
 } grid_launch_parm;
 
 // TODO: Will move to separate source file in the future

diff --git a/include/hc_printf.hpp b/include/hc_printf.hpp
@@ -0,0 +1,216 @@
+#pragma once
+
+#include <cstdlib>
+#include <cstdio>
+#include <cassert>
+#include <atomic>
+#include <string>
+#include <regex>
+#include <iostream>
+#include <algorithm>
+
+#include <hc_am.hpp>
+#include <hsa_atomic.h>
+
+#define HC_PRINTF_DEBUG  (0)
+
+namespace hc {
+
+union PrintfPacketData {  // Payload of one printf packet; all members share the same storage.
+  unsigned int    ui;   // unsigned value; also read as the size/cursor in the two header packets and as the argument count of a request
+  int             i;
+  float           f;
+  void*           ptr;
+  const void*     cptr; // read back as the format string pointer by processPrintfPackets()
+  std::atomic_int ai;   // only referenced from the disabled (#if 0) fetch_add path in printf()
+};
+
+enum PrintfPacketDataType {  // Tag stored in PrintfPacket::type, naming the payload a packet holds.
+  PRINTF_UNUSED          // written by PrintfPacket::clear()
+  ,PRINTF_UNSIGNED_INT    // written by set(unsigned int)
+  ,PRINTF_SIGNED_INT      // written by set(int)
+  ,PRINTF_FLOAT           // written by set(float)
+  ,PRINTF_VOID_PTR        // written by set(void*)
+  ,PRINTF_CONST_VOID_PTR  // written by set(const void*)
+  ,PRINTF_BUFFER_CURSOR   // tag of header packet [1] written by createPrintfBuffer()
+  ,PRINTF_BUFFER_SIZE     // tag of header packet [0] written by createPrintfBuffer()
+};
+
+// One slot of the printf buffer: a type tag plus the payload it describes.
+class PrintfPacket {
+public:
+  // Tags the slot as unused; the payload is left as it was.
+  void clear() [[hc,cpu]] {
+    type = PRINTF_UNUSED;
+  }
+
+  // Every set() overload stores the value in the matching union member and
+  // records the corresponding type tag.
+  void set(unsigned int d) [[hc,cpu]] {
+    data.ui = d;
+    type = PRINTF_UNSIGNED_INT;
+  }
+  void set(int d) [[hc,cpu]] {
+    data.i = d;
+    type = PRINTF_SIGNED_INT;
+  }
+  void set(float d) [[hc,cpu]] {
+    data.f = d;
+    type = PRINTF_FLOAT;
+  }
+  void set(void* d) [[hc,cpu]] {
+    data.ptr = d;
+    type = PRINTF_VOID_PTR;
+  }
+  void set(const void* d) [[hc,cpu]] {
+    data.cptr = d;
+    type = PRINTF_CONST_VOID_PTR;
+  }
+
+  // Tag first, payload second (member order kept as in the original layout).
+  PrintfPacketDataType type;
+  PrintfPacketData data;
+};
+
+enum PrintfError {  // Status returned by hc::printf().
+   PRINTF_SUCCESS = 0          // the request's packets were written to the buffer
+  ,PRINTF_BUFFER_OVERFLOW = 1  // not enough free packets remained; nothing was written
+};
+
+// Allocates a printf buffer of numElements packets through hc::am_alloc on
+// accelerator `a` and writes its two-packet header:
+//   packet [0]: PRINTF_BUFFER_SIZE   - total number of packets
+//   packet [1]: PRINTF_BUFFER_CURSOR - index of the next free packet (starts at 2)
+//
+// Returns the buffer, or NULL when numElements <= 3 or when the allocation
+// did not yield a buffer.  Release it with deletePrintfBuffer().
+static inline PrintfPacket* createPrintfBuffer(hc::accelerator& a, const unsigned int numElements) {
+  PrintfPacket* printfBuffer = NULL;
+  if (numElements > 3) {
+    printfBuffer = hc::am_alloc(sizeof(PrintfPacket) * numElements, a, 0);
+
+    // NOTE(review): assumes am_alloc reports failure by returning NULL —
+    // confirm against hc_am.hpp.  Never copy the header into a null buffer.
+    if (printfBuffer != NULL) {
+      // initialize the printf buffer header
+      PrintfPacket header[2];
+      header[0].type = PRINTF_BUFFER_SIZE;
+      header[0].data.ui = numElements;
+      header[1].type = PRINTF_BUFFER_CURSOR;
+      header[1].data.ui = 2;
+      hc::am_copy(printfBuffer,header,sizeof(PrintfPacket) * 2);
+    }
+  }
+  return printfBuffer;
+}
+
+// Releases a buffer obtained from createPrintfBuffer().
+// Declared static inline like the other helpers in this header so that
+// including the header from several translation units does not produce
+// multiple-definition link errors.  A NULL buffer (which createPrintfBuffer
+// can return) is ignored.
+static inline void deletePrintfBuffer(PrintfPacket* buffer) {
+  if (buffer == NULL) return;
+  hc::am_free(buffer);
+}
+
+// Argument counting: `count` is incremented once for every argument that
+// follows it in the call.
+static inline void countArg(unsigned int&) [[hc,cpu]] {}
+
+template <typename T>
+static inline void countArg(unsigned int& count, const T&) [[hc,cpu]] {
+  count += 1;
+}
+
+template <typename T, typename... Rest>
+static inline void countArg(unsigned int& count, const T&, const Rest&... rest) [[hc,cpu]] {
+  count += 1;
+  countArg(count, rest...);
+}
+
+// Packet filling: stores the given values into consecutive packets of
+// `queue`, the first one at index `offset`.
+template <typename T>
+static inline void set_batch(PrintfPacket* queue, int offset, const T t) [[hc,cpu]] {
+  queue[offset].set(t);
+}
+
+template <typename T, typename... Rest>
+static inline void set_batch(PrintfPacket* queue, int offset, const T t, Rest... rest) [[hc,cpu]] {
+  const int next = offset + 1;
+  queue[offset].set(t);
+  set_batch(queue, next, rest...);
+}
+
+// Records one printf request in the packet buffer `queue`.
+//
+// A request occupies count + 1 consecutive packets, where count is the
+// number of arguments passed after `queue`: one packet holding count itself,
+// followed by one packet per argument.  Space is reserved by atomically
+// advancing the cursor kept in queue[1]; queue[0] holds the total number of
+// packets in the buffer.
+//
+// Returns PRINTF_SUCCESS when the packets were written, or
+// PRINTF_BUFFER_OVERFLOW when the request does not fit (nothing is written).
+template <typename... All>
+static inline PrintfError printf(PrintfPacket* queue, All... all) [[hc,cpu]] {
+  // number of arguments to record
+  const unsigned int count = sizeof...(All);
+
+  PrintfError error = PRINTF_SUCCESS;
+
+  // cheap early-out before touching the cursor
+  if (count + 1 + queue[1].data.ui > queue[0].data.ui) {
+    error = PRINTF_BUFFER_OVERFLOW;
+  } else {
+
+#if 0
+    /*** FIXME: hcc didn't promote the address of the atomic type into global address space ***/
+    unsigned int offset = queue[1].data.ai.fetch_add(count + 1);
+#endif
+    unsigned int offset = __hsail_atomic_fetch_add_unsigned(&(queue[1].data.ui),count + 1);
+
+    // The reservation covers packets [offset, offset + count]; it fits when
+    // its end does not exceed the buffer size.  This must be "<=" to agree
+    // with the pre-check above, otherwise a request that exactly fills the
+    // buffer would advance the cursor and then be dropped.
+    if (offset + count + 1 <= queue[0].data.ui) {
+      set_batch(queue, offset, count, all...);
+    }
+    else {
+      // Another request claimed the space between the pre-check and the
+      // fetch-add.  The cursor stays advanced; processPrintfBuffer() clamps
+      // it to the buffer size on the host side.
+      error = PRINTF_BUFFER_OVERFLOW;
+    }
+  }
+
+  return error;
+}
+
+// regex for finding format string specifiers
+//
+// Shape of a specifier: '%', optional flags [-+#0], optional width,
+// optional ".precision", then the conversion character.  The precision dot
+// is escaped so that it only matches a literal '.'; unescaped it matched any
+// character, which made e.g. "%d1d" parse as a single specifier.
+static std::regex specifierPattern("(%){1}[-+#0]*[0-9]*((\\.)[0-9]+){0,1}([diuoxXfFeEgGaAcsp]){1}");
+static std::regex signedIntegerPattern("(%){1}[-+#0]*[0-9]*((\\.)[0-9]+){0,1}([cdi]){1}");
+static std::regex unsignedIntegerPattern("(%){1}[-+#0]*[0-9]*((\\.)[0-9]+){0,1}([uoxX]){1}");
+static std::regex floatPattern("(%){1}[-+#0]*[0-9]*((\\.)[0-9]+){0,1}([fFeEgGaA]){1}");
+static std::regex pointerPattern("(%){1}[ps]");
+// matches an escaped percent sign ("%%"); despite its name, no ampersand is involved
+static std::regex doubleAmpersandPattern("(%){2}");
+
+// Decodes and prints every printf request found in `packets`.
+//
+// Layout of one request, as written by hc::printf():
+//   packet 0      : number of arguments N (the format string counts as one)
+//   packet 1      : pointer to the format string
+//   packets 2..N  : the remaining arguments, one per packet
+// A packet whose count is 0 is skipped on its own.
+//
+// The conversion applied to each argument is chosen from the format
+// specifier, not from the packet's type tag.  Output goes to stdout through
+// std::printf.
+//
+// packets    : host-readable array of packets (the two header packets of
+//              the buffer are not part of it)
+// numPackets : number of packets in the array
+static inline void processPrintfPackets(PrintfPacket* packets, const unsigned int numPackets) {
+
+  for (unsigned int i = 0; i < numPackets; ) {
+
+    unsigned int numPrintfArgs = packets[i++].data.ui;
+    if (numPrintfArgs == 0)
+      continue;
+
+    // A request claiming more packets than remain is truncated or was never
+    // fully written; stop instead of reading past the end of the array.
+    if (numPrintfArgs > numPackets - i)
+      break;
+
+    // index of the first packet after this request
+    const unsigned int nextRequest = i + numPrintfArgs;
+
+    // get the format
+    unsigned int formatStringIndex = i++;
+    assert(packets[formatStringIndex].type == PRINTF_VOID_PTR
+           || packets[formatStringIndex].type == PRINTF_CONST_VOID_PTR);
+    std::string formatString((const char*)packets[formatStringIndex].data.cptr);
+
+    unsigned int formatStringCursor = 0;
+    std::smatch specifierMatches;
+
+#if HC_PRINTF_DEBUG
+    std::printf("%s:%d \t number of matches = %d\n", __FUNCTION__, __LINE__, (int)specifierMatches.size());
+#endif
+
+    for (unsigned int j = 1; j < numPrintfArgs; ++j, ++i) {
+
+      if (!std::regex_search(formatString, specifierMatches, specifierPattern)) {
+        // More printf argument than format specifier??
+        // Stop consuming arguments; the leftover packets are skipped below
+        break;
+      }
+
+      std::string specifier = specifierMatches.str();
+#if HC_PRINTF_DEBUG
+      std::cout << " (specifier found: " << specifier << ") ";
+#endif
+
+      // print the substring before the specifier
+      // collapse every escaped percent sign ("%%") into a single '%'
+      std::string prefix = specifierMatches.prefix();
+      prefix = std::regex_replace(prefix,doubleAmpersandPattern,"%");
+      std::printf("%s",prefix.c_str());
+
+      std::smatch specifierTypeMatch;
+      if (std::regex_search(specifier, specifierTypeMatch, unsignedIntegerPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.ui);
+      } else if (std::regex_search(specifier, specifierTypeMatch, signedIntegerPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.i);
+      } else if (std::regex_search(specifier, specifierTypeMatch, floatPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.f);
+      } else if (std::regex_search(specifier, specifierTypeMatch, pointerPattern)) {
+        std::printf(specifier.c_str(), packets[i].data.cptr);
+      }
+      else {
+        assert(false);
+      }
+      formatString = specifierMatches.suffix();
+    }
+    // print the substring after the last specifier
+    // collapse every escaped percent sign ("%%") before printing
+    formatString = std::regex_replace(formatString,doubleAmpersandPattern,"%");
+    std::printf("%s",formatString.c_str());
+
+    // Always resume at the next request.  Without this, leaving the argument
+    // loop early would cause the unread argument packets to be interpreted
+    // as the next request's argument count.
+    i = nextRequest;
+  }
+}
+
+// Prints everything recorded in a printf buffer and resets it for reuse.
+//
+// Reads the two header packets from gpuBuffer with hc::am_copy, copies the
+// used packets into a temporary host array, hands them to
+// processPrintfPackets(), and finally writes the header back with the cursor
+// set to 2 (the first packet after the header).  A NULL buffer is ignored.
+// If the temporary host array cannot be allocated, nothing is printed but
+// the buffer is still reset.
+static inline void processPrintfBuffer(PrintfPacket* gpuBuffer) {
+
+  if (gpuBuffer == NULL) return;
+
+  PrintfPacket header[2];
+  hc::am_copy(header, gpuBuffer, sizeof(PrintfPacket)*2);
+  unsigned int bufferSize = header[0].data.ui;
+  unsigned int cursor = header[1].data.ui;
+
+  // The cursor can run past the buffer size after an overflow, so clamp it.
+  unsigned int usedPackets = (bufferSize<cursor)?bufferSize:cursor;
+  // The first two packets are the header.  Guard the subtraction: with a
+  // header value below 2 the unsigned result would wrap around to a huge
+  // packet count.
+  unsigned int numPackets = (usedPackets > 2) ? (usedPackets - 2) : 0;
+  if (numPackets > 0) {
+    PrintfPacket* hostBuffer = (PrintfPacket*)malloc(sizeof(PrintfPacket) * numPackets);
+    if (hostBuffer) {
+      hc::am_copy(hostBuffer, gpuBuffer+2, sizeof(PrintfPacket) * numPackets);
+      processPrintfPackets(hostBuffer, numPackets);
+      free(hostBuffer);
+    }
+  }
+  // reset the printf buffer
+  header[1].data.ui = 2;
+  hc::am_copy(gpuBuffer,header,sizeof(PrintfPacket) * 2);
+}
+
+
+} // namespace hc