Skip to content

Commit

Permalink
Merged multicoreware/hcc/master into shfl
Browse files Browse the repository at this point in the history
  • Loading branch information
scchan committed Apr 6, 2016
2 parents b76c262 + d220f47 commit 862e135
Show file tree
Hide file tree
Showing 19 changed files with 1,018 additions and 230 deletions.
6 changes: 3 additions & 3 deletions Bolt/test/amp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,13 @@ add_subdirectory( StableSortTest )
add_subdirectory( StableSortByKeyTest )

# passed on SPIR path. crashed on HSA after passing some tests
add_subdirectory( TransformTest )
#add_subdirectory( TransformTest )

# passed on SPIR path. failed some tests on HSA
add_subdirectory( TransformReduceTest )
#add_subdirectory( TransformReduceTest )

# compile OK, failed some tests on SPIR and HSA
add_subdirectory( TransformScanTest )
#add_subdirectory( TransformScanTest )

# compile OK, crashed on HSA after passing some tests
add_subdirectory( ScatterTest )
Expand Down
41 changes: 34 additions & 7 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
cmake_minimum_required( VERSION 2.8 )
project (HCC)

option(HSA_USE_AMDGPU_BACKEND "Use AMDGPU LLVM backend as compiler for HSA" OFF)



# set default installation path
if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT AND CMAKE_INSTALL_PREFIX MATCHES "/usr/local")
set(CMAKE_INSTALL_PREFIX "/opt/hcc" CACHE PATH "Default installation path of hcc" FORCE)
if (HSA_USE_AMDGPU_BACKEND)
set(CMAKE_INSTALL_PREFIX "/opt/rocm/hcc" CACHE PATH "Default installation path of hcc" FORCE)
else (HSA_USE_AMDGPU_BACKEND)
set(CMAKE_INSTALL_PREFIX "/opt/rocm/hcc-hlc" CACHE PATH "Default installation path of hcc" FORCE)
endif()
endif ()
MESSAGE("Package installation path: ${CMAKE_INSTALL_PREFIX}")

Expand All @@ -17,7 +25,7 @@ ENDIF(NOT CMAKE_BUILD_TYPE)

# set default cppamp-ng URL
IF (NOT CLANG_URL)
SET(CLANG_URL "https://bitbucket.org/multicoreware/hcc-clang.git" CACHE STRING "CLANG URL" FORCE)
SET(CLANG_URL "https://github.com/RadeonOpenCompute/hcc-clang.git" CACHE STRING "CLANG URL" FORCE)
ENDIF (NOT CLANG_URL)
set(CMAKE_MACOSX_RPATH 1)

Expand Down Expand Up @@ -261,11 +269,16 @@ endif ((HAS_OPENCL EQUAL 0) AND (HAS_HSA EQUAL 0))
#################
# Detect AMDGPU backend for native codegen
#################
option(HSA_USE_AMDGPU_BACKEND "Use AMDGPU LLVM backend as compiler for HSA" OFF)

if (HSA_USE_AMDGPU_BACKEND)
add_definitions(-DHSA_USE_AMDGPU_BACKEND)
set(HAS_HSA_HOF 0)
find_program(AMDPHDRS amdphdrs PATHS /opt/rocm/bin ${AMDPHDRS_DIR} NO_DEFAULT_PATH)
find_program(AMDPHDRS amdphdrs)
MESSAGE("AMDPHDRS_DIR = ${AMDPHDRS_DIR}, actually found at: ${AMDPHDRS}")
if (NOT AMDPHDRS)
MESSAGE("amdphdrs not found. Use -DAMDPHDRS_DIR=<amdphdrs path>.")
endif (NOT AMDPHDRS)
endif ()

set(HSA_AMDGPU_GPU_TARGET "kaveri" CACHE STRING "Target GPU device (kaveri,carrizo,fiji)")
Expand Down Expand Up @@ -493,16 +506,30 @@ if(CXXAMP_ENABLE_BOLT)
endif(CXXAMP_ENABLE_BOLT)

set(CPACK_SET_DESTDIR TRUE)
set(CPACK_INSTALL_PREFIX "/opt/hcc")
set(CPACK_PACKAGE_NAME "hcc")
set(CPACK_PACKAGE_VENDOR "MulticoreWare, Inc")
set(CPACK_INSTALL_PREFIX ${CMAKE_INSTALL_PREFIX})

if (HSA_USE_AMDGPU_BACKEND)
set(CPACK_PACKAGE_NAME "hcc")
else (HSA_USE_AMDGPU_BACKEND)
set(CPACK_PACKAGE_NAME "hcc_hlc")
endif(HSA_USE_AMDGPU_BACKEND)

set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc")
set(CPACK_PACKAGE_VERSION ${KALMAR_VERSION_STRING})
set(CPACK_PACKAGE_VERSION_MAJOR ${KALMAR_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${KALMAR_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${KALMAR_VERSION_PATCH})
set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${CPACK_PACKAGE_VERSION}-${CMAKE_SYSTEM_NAME})
set(CPACK_DEBIAN_PACKAGE_DESCRIPTION "HCC: a Heterogeneous C++ to OpenCL/HSA compiler")
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Jack Chung <jack@multicorewareinc.com>")
set(CPACK_DEBIAN_PACKAGE_MAINTAINER "Siu Chi Chan <siuchi.chan@amd.com>")

set(HCC_DEBIAN_DEP "hsa-runtime-dev (>=1.0.0), libstdc++-4.8-dev, llvm, llvm-dev, llvm-runtime, libc++1, libc++-dev, libc++abi1, libc++abi-dev")
if (HSA_USE_AMDGPU_BACKEND)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "${HCC_DEBIAN_DEP}, amdphdrs (>=1.0), llvm-amdgpu (>=3.9)" )
else (HSA_USE_AMDGPU_BACKEND)
set(CPACK_DEBIAN_PACKAGE_DEPENDS "${HCC_DEBIAN_DEP}" )
endif (HSA_USE_AMDGPU_BACKEND)

set(CPACK_GENERATOR "DEB;TGZ")
set(CPACK_SOURCE_GENERATOR "TGZ")
set(CPACK_BINARY_DEB "ON")
Expand Down
10 changes: 5 additions & 5 deletions HC/WrapperGen/WrapperGen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -475,12 +475,12 @@ struct StringFinder
out << "}\n";

out << "void operator()(tiled_index<3>& i) __attribute((hc))\n{\n";
out << "_lp.groupId.x = i.tile[0];\n";
out << "_lp.groupId.x = i.tile[2];\n";
out << "_lp.groupId.y = i.tile[1];\n";
out << "_lp.groupId.z = i.tile[2];\n";
out << "_lp.threadId.x = i.local[0];\n";
out << "_lp.groupId.z = i.tile[0];\n";
out << "_lp.threadId.x = i.local[2];\n";
out << "_lp.threadId.y = i.local[1];\n";
out << "_lp.threadId.z = i.local[2];\n";
out << "_lp.threadId.z = i.local[0];\n";
out << func->getFunctionName() << "(";
func->printArgsAsArguments(out);
out << ");\n}\n";
Expand All @@ -493,7 +493,7 @@ struct StringFinder
out << "void " << func->getWrapperName() << "(";
func->printArgsAsParameters(out);
out << ")\n{\n";
out << "completion_future cf = parallel_for_each(*(_lp.av),extent<3>(_lp.gridDim.x*_lp.groupDim.x,_lp.gridDim.y*_lp.groupDim.y,_lp.gridDim.z*_lp.groupDim.z).tile(_lp.groupDim.x, _lp.groupDim.y, _lp.groupDim.z), \n"
out << "completion_future cf = parallel_for_each(*(_lp.av),extent<3>(_lp.gridDim.z*_lp.groupDim.z,_lp.gridDim.y*_lp.groupDim.y,_lp.gridDim.x*_lp.groupDim.x).tile(_lp.groupDim.z, _lp.groupDim.y, _lp.groupDim.x), \n"
<< func->getFunctorName()
<< "(";
func->printArgsAsArguments(out);
Expand Down
42 changes: 42 additions & 0 deletions include/grid_launch.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,48 @@ typedef struct grid_launch_parm
// use acc_view for PFE in WrapperGen
hc::accelerator_view *av;
hc::completion_future *cf;

grid_launch_parm() = default;

// Customized serialization: the kernel does not need av and cf, so only the
// launch geometry and the group memory size are appended.
//
// Fields are appended in this fixed order: gridDim.{x,y,z}, groupDim.{x,y,z},
// groupId.{x,y,z}, threadId.{x,y,z} (each as sizeof(int) bytes) followed by
// groupMemBytes (sizeof(unsigned) bytes). The "user_deserialize" constructor
// of this struct takes its parameters in the same order.
// NOTE(review): the "serialize" annotation is presumably consumed by the hcc
// compiler -- confirm before changing the order or the number of Append calls.
__attribute__((annotate("serialize")))
void __cxxamp_serialize(Kalmar::Serialize& s) const {
// grid dimensions
s.Append(sizeof(int), &gridDim.x);
s.Append(sizeof(int), &gridDim.y);
s.Append(sizeof(int), &gridDim.z);
// group dimensions
s.Append(sizeof(int), &groupDim.x);
s.Append(sizeof(int), &groupDim.y);
s.Append(sizeof(int), &groupDim.z);
// group id
s.Append(sizeof(int), &groupId.x);
s.Append(sizeof(int), &groupId.y);
s.Append(sizeof(int), &groupId.z);
// thread id
s.Append(sizeof(int), &threadId.x);
s.Append(sizeof(int), &threadId.y);
s.Append(sizeof(int), &threadId.z);
// group memory size in bytes
s.Append(sizeof(unsigned), &groupMemBytes);
}

// Constructor annotated "user_deserialize": rebuilds the launch parameters
// from thirteen scalar values. The parameter order mirrors the order in which
// __cxxamp_serialize appends the fields (gridDim, groupDim, groupId, threadId,
// then groupMemBytes). av and cf are not assigned here.
// NOTE(review): presumably the hcc compiler matches this signature against the
// serialized argument list -- confirm before reordering parameters.
__attribute__((annotate("user_deserialize")))
grid_launch_parm(int gridDim_x, int gridDim_y, int gridDim_z,
int groupDim_x, int groupDim_y, int groupDim_z,
int groupId_x, int groupId_y, int groupId_z,
int threadId_x, int threadId_y, int threadId_z,
unsigned groupMemBytes_) {
// grid dimensions
gridDim.x = gridDim_x;
gridDim.y = gridDim_y;
gridDim.z = gridDim_z;
// group dimensions
groupDim.x = groupDim_x;
groupDim.y = groupDim_y;
groupDim.z = groupDim_z;
// group id
groupId.x = groupId_x;
groupId.y = groupId_y;
groupId.z = groupId_z;
// thread id
threadId.x = threadId_x;
threadId.y = threadId_y;
threadId.z = threadId_z;
// group memory size in bytes
groupMemBytes = groupMemBytes_;
}

} grid_launch_parm;

// TODO: Will move to separate source file in the future
Expand Down
216 changes: 216 additions & 0 deletions include/hc_printf.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#pragma once

#include <cstdlib>
#include <cstdio>
#include <cassert>
#include <atomic>
#include <string>
#include <regex>
#include <iostream>
#include <algorithm>

#include <hc_am.hpp>
#include <hsa_atomic.h>

#define HC_PRINTF_DEBUG (0)

namespace hc {

// Payload of one printf packet. The union itself does not record which member
// is active; PrintfPacket stores that separately in its `type` field.
union PrintfPacketData {
unsigned int ui; // unsigned value; also used for the buffer size, the cursor and per-request argument counts
int i; // signed integer value
float f; // floating point value
void* ptr; // pointer value
const void* cptr; // pointer-to-const value; the host reads the format string through this member
std::atomic_int ai; // only referenced from the disabled (#if 0) fetch_add path in printf()
};

// Tag stored in PrintfPacket::type naming the PrintfPacketData member that
// was written. Enumerator values are implicit and start at 0.
enum PrintfPacketDataType {
  PRINTF_UNUSED,          // packet has been cleared
  PRINTF_UNSIGNED_INT,    // data.ui
  PRINTF_SIGNED_INT,      // data.i
  PRINTF_FLOAT,           // data.f
  PRINTF_VOID_PTR,        // data.ptr
  PRINTF_CONST_VOID_PTR,  // data.cptr
  PRINTF_BUFFER_CURSOR,   // header packet holding the next free index
  PRINTF_BUFFER_SIZE      // header packet holding the total packet count
};

// One slot of the printf buffer: a type tag plus the tagged value.
// Every set() overload records the tag matching its argument type and stores
// the value in the corresponding union member; clear() only resets the tag.
class PrintfPacket {
public:
  // Mark the packet as unused. The payload is left untouched.
  void clear() [[hc,cpu]] {
    type = PRINTF_UNUSED;
  }

  // Store an unsigned integer.
  void set(unsigned int d) [[hc,cpu]] {
    type = PRINTF_UNSIGNED_INT;
    data.ui = d;
  }

  // Store a signed integer.
  void set(int d) [[hc,cpu]] {
    type = PRINTF_SIGNED_INT;
    data.i = d;
  }

  // Store a float.
  void set(float d) [[hc,cpu]] {
    type = PRINTF_FLOAT;
    data.f = d;
  }

  // Store a mutable pointer.
  void set(void* d) [[hc,cpu]] {
    type = PRINTF_VOID_PTR;
    data.ptr = d;
  }

  // Store a pointer-to-const (e.g. a format string).
  void set(const void* d) [[hc,cpu]] {
    type = PRINTF_CONST_VOID_PTR;
    data.cptr = d;
  }

  PrintfPacketDataType type;  // which member of `data` is valid
  PrintfPacketData data;      // the stored value
};

// Status code returned by hc::printf().
enum PrintfError {
  PRINTF_SUCCESS = 0,          // the request was written to the buffer
  PRINTF_BUFFER_OVERFLOW = 1   // not enough room left in the buffer
};

// Allocate a printf buffer of `numElements` packets on accelerator `a` and
// initialize its two-packet header:
//   packet 0: PRINTF_BUFFER_SIZE   -> numElements
//   packet 1: PRINTF_BUFFER_CURSOR -> 2 (index of the first free packet)
//
// Returns NULL when numElements is 3 or less, or when the allocation yields a
// null pointer. Release the buffer with deletePrintfBuffer().
static inline PrintfPacket* createPrintfBuffer(hc::accelerator& a, const unsigned int numElements) {
  PrintfPacket* printfBuffer = NULL;
  if (numElements > 3) {
    printfBuffer = hc::am_alloc(sizeof(PrintfPacket) * numElements, a, 0);

    // Do not copy the header into a buffer that was never allocated.
    if (printfBuffer != NULL) {
      // initialize the printf buffer header
      PrintfPacket header[2];
      header[0].type = PRINTF_BUFFER_SIZE;
      header[0].data.ui = numElements;
      header[1].type = PRINTF_BUFFER_CURSOR;
      header[1].data.ui = 2;
      hc::am_copy(printfBuffer,header,sizeof(PrintfPacket) * 2);
    }
  }
  return printfBuffer;
}

// Release a buffer obtained from createPrintfBuffer().
// Declared `static inline` like the other helpers in this header: a plain
// (non-inline) definition in a header produces a multiple-definition link
// error as soon as two translation units include it.
static inline void deletePrintfBuffer(PrintfPacket* buffer) {
  hc::am_free(buffer);
}

// Argument counting: adds the number of arguments that follow `count` to
// `count`. The caller is expected to initialize `count` beforehand.

// No arguments left: nothing to add.
static inline void countArg(unsigned int&) [[hc,cpu]] {}

// Exactly one argument left.
template <typename T>
static inline void countArg(unsigned int& count, const T&) [[hc,cpu]] {
  count += 1;
}

// Two or more arguments: count the first, then recurse on the remainder.
template <typename T, typename... Rest>
static inline void countArg(unsigned int& count, const T&, const Rest&... rest) [[hc,cpu]] {
  count += 1;
  countArg(count, rest...);
}

// Write the given values into consecutive packets of `queue`, starting at
// index `offset`. Each value goes through the PrintfPacket::set() overload
// selected by its type.

// Last value: store it at queue[offset].
template <typename T>
static inline void set_batch(PrintfPacket* queue, int offset, const T t) [[hc,cpu]] {
  PrintfPacket& slot = queue[offset];
  slot.set(t);
}

// Store the first value, then continue with the rest at the next index.
template <typename T, typename... Rest>
static inline void set_batch(PrintfPacket* queue, int offset, const T t, Rest... rest) [[hc,cpu]] {
  PrintfPacket& slot = queue[offset];
  slot.set(t);
  set_batch(queue, offset + 1, rest...);
}

// Queue a printf request in `queue`, a buffer whose first two packets are the
// header (queue[0].data.ui = total packet count, queue[1].data.ui = cursor,
// i.e. the index of the next free packet).
//
// A request occupies count + 1 packets: one packet holding the argument count
// followed by the arguments themselves (the format string is the first one).
//
// Returns PRINTF_SUCCESS when the request was written and
// PRINTF_BUFFER_OVERFLOW when it did not fit.
//
// NOTE: when the overflow is only detected after the atomic add, the cursor
// has already been advanced and the reserved packets are left unwritten.
template <typename... All>
static inline PrintfError printf(PrintfPacket* queue, All... all) [[hc,cpu]] {
  unsigned int count = 0;
  countArg(count, all...);

  PrintfError error = PRINTF_SUCCESS;

  // Cheap early check against the current cursor value.
  if (count + 1 + queue[1].data.ui > queue[0].data.ui) {
    error = PRINTF_BUFFER_OVERFLOW;
  } else {

#if 0
    /*** FIXME: hcc didn't promote the address of the atomic type into global address space ***/
    unsigned int offset = queue[1].data.ai.fetch_add(count + 1);
#endif
    // Reserve count + 1 packets; `offset` is the first reserved index.
    unsigned int offset = __hsail_atomic_fetch_add_unsigned(&(queue[1].data.ui),count + 1);

    // The reserved range is [offset, offset + count], so it fits when
    // offset + count + 1 <= size. Using `<=` keeps this re-check consistent
    // with the early check above, which accepts an exact fit.
    if (offset + count + 1 <= queue[0].data.ui) {
      set_batch(queue, offset, count, all...);
    }
    else {
      error = PRINTF_BUFFER_OVERFLOW;
    }
  }

  return error;
}

// Regular expressions used by the host to locate and classify printf
// conversion specifiers of the form
//   % [flags] [width] [.precision] conversion
//
// The precision separator is an escaped literal dot. An unescaped `.` matches
// any character, which let e.g. "%s1d" be taken as one integer specifier.
// The space flag is part of the flag set so that "% d" / "% 5d" are accepted.
static std::regex specifierPattern("(%){1}[-+ #0]*[0-9]*((\\.)[0-9]+){0,1}([diuoxXfFeEgGaAcsp]){1}");
// %c, %d, %i: argument is read as a signed int
static std::regex signedIntegerPattern("(%){1}[-+ #0]*[0-9]*((\\.)[0-9]+){0,1}([cdi]){1}");
// %u, %o, %x, %X: argument is read as an unsigned int
static std::regex unsignedIntegerPattern("(%){1}[-+ #0]*[0-9]*((\\.)[0-9]+){0,1}([uoxX]){1}");
// %f, %F, %e, %E, %g, %G, %a, %A: argument is read as a float
static std::regex floatPattern("(%){1}[-+ #0]*[0-9]*((\\.)[0-9]+){0,1}([fFeEgGaA]){1}");
// %p, %s: argument is read as a pointer
static std::regex pointerPattern("(%){1}[ps]");
// "%%" (an escaped percent sign; the variable name says "ampersand" for
// historical reasons)
static std::regex doubleAmpersandPattern("(%){2}");

// Replay the printf requests stored in `packets` (host copy of the buffer
// payload, header excluded) on the host's stdout.
//
// Layout of one request:
//   packet 0        : argument count N (data.ui), format string included
//   packet 1        : format string pointer (data.ptr / data.cptr)
//   packets 2 .. N  : the N - 1 values for the conversion specifiers
//
// A count of 0 is skipped as an empty slot. A count larger than the number of
// remaining packets means the data is truncated or was never written, and
// processing stops there instead of reading past the end of `packets`.
//
// Known limitation: "%%" immediately followed by a conversion character
// (e.g. "%%d") is still seen as a specifier starting at the second '%'.
static inline void processPrintfPackets(PrintfPacket* packets, const unsigned int numPackets) {

  for (unsigned int i = 0; i < numPackets; ) {

    const unsigned int numPrintfArgs = packets[i++].data.ui;
    if (numPrintfArgs == 0)
      continue;

    // i now indexes the format string; the request needs numPrintfArgs
    // packets from here on.
    if (numPrintfArgs > numPackets - i)
      break;
    const unsigned int nextRequest = i + numPrintfArgs;

    // get the format
    unsigned int formatStringIndex = i++;
    assert(packets[formatStringIndex].type == PRINTF_VOID_PTR
           || packets[formatStringIndex].type == PRINTF_CONST_VOID_PTR);
    std::string formatString((const char*)packets[formatStringIndex].data.cptr);

    std::smatch specifierMatches;

#if HC_PRINTF_DEBUG
    std::printf("%s:%d \t number of matches = %d\n", __FUNCTION__, __LINE__, (int)specifierMatches.size());
#endif

    for (unsigned int j = 1; j < numPrintfArgs; ++j, ++i) {

      if (!std::regex_search(formatString, specifierMatches, specifierPattern)) {
        // More printf arguments than format specifiers:
        // ignore the surplus arguments of this request.
        break;
      }

      std::string specifier = specifierMatches.str();
#if HC_PRINTF_DEBUG
      std::cout << " (specifier found: " << specifier << ") ";
#endif

      // print the substring before the specifier,
      // collapsing every "%%" into a single '%'
      std::string prefix = specifierMatches.prefix();
      prefix = std::regex_replace(prefix,doubleAmpersandPattern,"%");
      std::printf("%s",prefix.c_str());

      // pick the union member that matches the conversion
      std::smatch specifierTypeMatch;
      if (std::regex_search(specifier, specifierTypeMatch, unsignedIntegerPattern)) {
        std::printf(specifier.c_str(), packets[i].data.ui);
      } else if (std::regex_search(specifier, specifierTypeMatch, signedIntegerPattern)) {
        std::printf(specifier.c_str(), packets[i].data.i);
      } else if (std::regex_search(specifier, specifierTypeMatch, floatPattern)) {
        std::printf(specifier.c_str(), packets[i].data.f);
      } else if (std::regex_search(specifier, specifierTypeMatch, pointerPattern)) {
        std::printf(specifier.c_str(), packets[i].data.cptr);
      }
      else {
        assert(false);
      }
      formatString = specifierMatches.suffix();
    }

    // Always resume at the next request. Without this, leaving the loop
    // early would make the unread argument packets be interpreted as the
    // argument count of a new request.
    i = nextRequest;

    // print the substring after the last specifier,
    // collapsing every "%%" into a single '%' before printing
    formatString = std::regex_replace(formatString,doubleAmpersandPattern,"%");
    std::printf("%s",formatString.c_str());
  }
}

// Print everything queued in `gpuBuffer` on the host, then rewind the buffer.
//
// Steps: read the two header packets, copy the used payload (everything
// between the header and min(size, cursor)) into a temporary host buffer,
// hand it to processPrintfPackets(), and finally write the header back with
// the cursor set to 2. A NULL buffer is ignored. If the temporary host
// allocation fails nothing is printed, but the cursor is still reset.
static inline void processPrintfBuffer(PrintfPacket* gpuBuffer) {

  if (gpuBuffer == NULL) return;

  const size_t headerBytes = sizeof(PrintfPacket) * 2;

  PrintfPacket header[2];
  hc::am_copy(header, gpuBuffer, headerBytes);

  // The cursor may have been pushed past the end by an overflowing request,
  // so never read more than the buffer holds.
  const unsigned int usedEnd = std::min(header[0].data.ui, header[1].data.ui);
  const unsigned int numPackets = usedEnd - 2;

  if (numPackets > 0) {
    const size_t payloadBytes = sizeof(PrintfPacket) * numPackets;
    PrintfPacket* hostBuffer = (PrintfPacket*)malloc(payloadBytes);
    if (hostBuffer) {
      hc::am_copy(hostBuffer, gpuBuffer+2, payloadBytes);
      processPrintfPackets(hostBuffer, numPackets);
      free(hostBuffer);
    }
  }

  // reset the printf buffer
  header[1].data.ui = 2;
  hc::am_copy(gpuBuffer, header, headerBytes);
}


} // namespace hc
Loading

0 comments on commit 862e135

Please sign in to comment.