Skip to content

Commit

Permalink
Fastforward Public Develop (#45)
Browse files Browse the repository at this point in the history
* 3D group forward convolution solver (#2286)

* [HotFix] Fix build issue after #2286 (#2328)

* Fix ConvCkIgemmFwdV6r1DlopsNchw solver to reflect that it's not dynamic (#2325)

* Remove target IDs from kdb entries (#2309)

* Dropout: make seed and states_num kernel arguments (#2277)

* [MI250] Adding missing kernel objects (#2329)

* Post-merge fixups: Replace environment variable check with problem config check and reduce lambda capture for Invoker obj (#2305)

* [HotFix][CI] fix HIP tidy issue from #2277 (#2335)

* [HotFix] Update requirements.txt MLIR ignore PATH for Python conda LLD (#2324)

* [NFC] Replace long integers with int64_t and size_t for better compatibility with Windows (#2323)

* Windows compatibility: replace long integers with int64_t and size_t, replace uint with unsigned int, replace long with long long for numbers, proper casting

* Fix formatting

* Fix 3d group forward convolution

* Resolve review comments

* Fix formatting

---------

Co-authored-by: Daming Feng <dmfeng8898@gmail.com>
Co-authored-by: Evgenii Averin <86725875+averinevg@users.noreply.github.com>
Co-authored-by: JD <jahandad@gmail.com>
Co-authored-by: Tal Ben-Nun <tbennun@users.noreply.github.com>
Co-authored-by: amberhassaan <mah@katanagraph.com>
Co-authored-by: Jun Liu <Liu.Jun@amd.com>
  • Loading branch information
7 people authored Aug 25, 2023
1 parent fd0cf2d commit feab6fd
Show file tree
Hide file tree
Showing 63 changed files with 1,733 additions and 855 deletions.
2 changes: 1 addition & 1 deletion docs/DebugAndLogging.md
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,7 @@ Different ROCm versions use Code Object files of different versions (or, in othe
`MIOPEN_DEBUG_AMD_MP_BD_WINOGRAD_WORKSPACE_MAX` - `ConvMPBidirectWinograd*`, FWD BWD
Syntax of value:
* decimal or hex (with `0x` prefix) value that should fit into `unsigned long` (64 bits).
* decimal or hex (with `0x` prefix) value that should fit into 64-bit unsigned integer.
* If syntax is violated, then the behavior is unspecified.
Semantics:
Expand Down
21 changes: 12 additions & 9 deletions driver/conv_driver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -1564,7 +1564,8 @@ void ConvDriver<Tgpu, Tref>::PrintForwardTime(const float kernel_total_time,
std::tie(out_n, out_c, out_h, out_w) =
miopen::tien<4>(miopen::deref(outputTensor).GetLengths());

size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count;
size_t flopCnt = static_cast<size_t>(2) * in_n * in_c * wei_h * wei_w * out_c * out_h *
out_w / group_count;
size_t inputBytes =
in_n * in_c * in_h * in_w * miopen::GetTypeSize(miopen::deref(inputTensor).GetType());
size_t weightBytes = wei_n * wei_c * wei_h * wei_w *
Expand Down Expand Up @@ -1607,8 +1608,8 @@ void ConvDriver<Tgpu, Tref>::PrintForwardTime(const float kernel_total_time,
std::tie(out_n, out_c, out_d, out_h, out_w) =
miopen::tien<5>(miopen::deref(outputTensor).GetLengths());

size_t flopCnt = 2L * in_n * in_c * in_d * wei_h * wei_w * wei_d * out_c * out_d * out_h *
out_w / group_count;
size_t flopCnt = static_cast<size_t>(2) * in_n * in_c * in_d * wei_h * wei_w * wei_d *
out_c * out_d * out_h * out_w / group_count;
size_t inputBytes = in_n * in_c * in_d * in_h * in_w *
miopen::GetTypeSize(miopen::deref(inputTensor).GetType());
size_t weightBytes = wei_n * wei_c * wei_d * wei_h * wei_w *
Expand Down Expand Up @@ -2464,7 +2465,8 @@ void ConvDriver<Tgpu, Tref>::PrintBackwardDataTime(float kernel_total_time, floa
std::tie(out_n, out_c, out_h, out_w) =
miopen::tien<4>(miopen::deref(outputTensor).GetLengths());

size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count;
size_t flopCnt = static_cast<size_t>(2) * in_n * in_c * wei_h * wei_w * out_c * out_h *
out_w / group_count;
size_t weightBytes = wei_n * wei_c * wei_h * wei_w *
miopen::GetTypeSize(miopen::deref(weightTensor).GetType());
size_t inputBytes =
Expand Down Expand Up @@ -2507,8 +2509,8 @@ void ConvDriver<Tgpu, Tref>::PrintBackwardDataTime(float kernel_total_time, floa
std::tie(out_n, out_c, out_d, out_h, out_w) =
miopen::tien<5>(miopen::deref(outputTensor).GetLengths());

size_t flopCnt =
2L * in_n * in_c * wei_d * wei_h * wei_w * out_c * out_d * out_h * out_w / group_count;
size_t flopCnt = static_cast<size_t>(2) * in_n * in_c * wei_d * wei_h * wei_w * out_c *
out_d * out_h * out_w / group_count;
size_t weightBytes = wei_n * wei_c * wei_d * wei_h * wei_w *
miopen::GetTypeSize(miopen::deref(weightTensor).GetType());
size_t inputBytes =
Expand Down Expand Up @@ -2674,7 +2676,8 @@ void ConvDriver<Tgpu, Tref>::PrintBackwardWrwTime(float kernel_total_time, float
std::tie(out_n, out_c, out_h, out_w) =
miopen::tien<4>(miopen::deref(outputTensor).GetLengths());

size_t flopCnt = 2L * in_n * in_c * wei_h * wei_w * out_c * out_h * out_w / group_count;
size_t flopCnt = static_cast<size_t>(2) * in_n * in_c * wei_h * wei_w * out_c * out_h *
out_w / group_count;
size_t readBytes = 0;
size_t outputBytes = 0;

Expand Down Expand Up @@ -2711,8 +2714,8 @@ void ConvDriver<Tgpu, Tref>::PrintBackwardWrwTime(float kernel_total_time, float
std::tie(out_n, out_c, out_d, out_h, out_w) =
miopen::tien<5>(miopen::deref(outputTensor).GetLengths());

size_t flopCnt =
2L * in_n * in_c * wei_d * wei_h * wei_w * out_c * out_d * out_h * out_w / group_count;
size_t flopCnt = static_cast<size_t>(2) * in_n * in_c * wei_d * wei_h * wei_w * out_c *
out_d * out_h * out_w / group_count;
size_t readBytes = 0;
size_t outputBytes = 0;

Expand Down
8 changes: 4 additions & 4 deletions include/miopen/miopen.h
Original file line number Diff line number Diff line change
Expand Up @@ -554,10 +554,10 @@ typedef enum
*/
typedef enum
{
MIOPEN_32BIT_INDICES = 0, /*!< unsigned integer indices */
MIOPEN_64BIT_INDICES = 1, /*!< unsigned long indices */
MIOPEN_16BIT_INDICES = 2, /*!< unsigned short indices */
MIOPEN_8BIT_INDICES = 3, /*!< unsigned char indices */
MIOPEN_32BIT_INDICES = 0, /*!< 32-bit unsigned integer indices */
MIOPEN_64BIT_INDICES = 1, /*!< 64-bit unsigned integer indices */
MIOPEN_16BIT_INDICES = 2, /*!< 16-bit unsigned integer indices */
MIOPEN_8BIT_INDICES = 3, /*!< 8-bit unsigned integer indices */
} miopenIndicesType_t;

/*! @ingroup convolutions
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
boost@1.79 -DCMAKE_POSITION_INDEPENDENT_CODE=On --build -DCMAKE_CXX_FLAGS=" -std=c++14 -Wno-enum-constexpr-conversion "
ROCmSoftwarePlatform/half@4f19ce3e56f3d3a17cf69f9db4ff3722f7445b0d --build
ROCmSoftwarePlatform/rocMLIR@rocm-5.5.0 -H sha256:a5f62769d28a73e60bc8d61022820f050e97c977c8f6f6275488db31512e1f42 -DBUILD_FAT_LIBROCKCOMPILER=1 -DCMAKE_IGNORE_PATH=/opt/conda/envs/py_3.8
ROCmSoftwarePlatform/rocMLIR@rocm-5.5.0 -H sha256:a5f62769d28a73e60bc8d61022820f050e97c977c8f6f6275488db31512e1f42 -DBUILD_FAT_LIBROCKCOMPILER=1 -DCMAKE_IGNORE_PATH=/opt/conda/envs/py_3.9 -DCMAKE_IGNORE_PREFIX_PATH=/opt/conda
nlohmann/json@v3.9.1 -DJSON_MultipleHeaders=ON -DJSON_BuildTests=Off
ROCmSoftwarePlatform/FunctionalPlus@v0.2.18-p0
ROCmSoftwarePlatform/eigen@3.4.0
Expand Down
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ set( MIOpen_Source
solver/conv_hip_implicit_gemm_fwd_v4r5_xdlops.cpp
solver/conv_hip_implicit_gemm_fwd_xdlops.cpp
solver/conv_hip_implicit_gemm_grouped_fwd_xdlops.cpp
solver/conv_hip_implicit_gemm_3d_grouped_fwd_xdlops.cpp
solver/conv_hip_implicit_gemm_nonxdlops_common.cpp
solver/conv_hip_implicit_gemm_wrw_v4r4.cpp
solver/conv_hip_implicit_gemm_wrw_v4r4_xdlops.cpp
Expand Down
6 changes: 3 additions & 3 deletions src/include/miopen/env.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ inline bool IsEnvvarValueEnabled(const char* name)

// Return 0 if env is enabled else convert environment var to an int.
// Supports hexadecimal with leading 0x or decimal
inline unsigned long int EnvvarValue(const char* name, unsigned long int fallback = 0)
inline uint64_t EnvvarValue(const char* name, uint64_t fallback = 0)
{
// NOLINTNEXTLINE (concurrency-mt-unsafe)
const auto value_env_p = std::getenv(name);
Expand All @@ -108,7 +108,7 @@ inline unsigned long int EnvvarValue(const char* name, unsigned long int fallbac
}
else
{
return strtoul(value_env_p, nullptr, 0);
return strtoull(value_env_p, nullptr, 0);
}
}

Expand Down Expand Up @@ -147,7 +147,7 @@ inline bool IsDisabled(T)
}

template <class T>
inline unsigned long int Value(T, unsigned long int fallback = 0)
inline uint64_t Value(T, uint64_t fallback = 0)
{
static const auto result = miopen::EnvvarValue(T::value(), fallback);
return result;
Expand Down
10 changes: 5 additions & 5 deletions src/include/miopen/hipoc_kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,12 @@ struct HIPOCKernelInvoke
memcpy(hip_args, &(any_args[0].buffer[0]), any_args[0].size());
// copy_arg(any_args[0], hip_args, 0);

for(unsigned long idx = 1; idx < any_args.size(); idx++)
for(std::size_t idx = 1; idx < any_args.size(); idx++)
{
auto& any_arg = any_args[idx];
unsigned long alignment = any_arg.size();
unsigned long padding = (alignment - (sz_left % alignment)) % alignment;
unsigned long second_index = sz_left + padding;
auto& any_arg = any_args[idx];
std::size_t alignment = any_arg.size();
std::size_t padding = (alignment - (sz_left % alignment)) % alignment;
std::size_t second_index = sz_left + padding;
memcpy(hip_args + second_index, &(any_arg.buffer[0]), any_arg.size());
// copy_arg(any_arg, hip_args, second_index);
sz_left = second_index + alignment;
Expand Down
2 changes: 1 addition & 1 deletion src/include/miopen/magic_div.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ static inline magic_div_u32_t magic_div_u32_gen(uint32_t d)

constexpr uint64_t one = 1;
uint64_t magic = ((one << 32) * ((one << shift) - d)) / d + 1;
assert(magic <= 0xffffffffUL);
assert(magic <= 0xffffffffU);

return {static_cast<uint32_t>(magic), shift};
}
Expand Down
4 changes: 2 additions & 2 deletions src/include/miopen/solution.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ struct Solution : miopenSolution

struct SerializationMetadata final
{
unsigned long validation_number;
unsigned long version;
uint64_t validation_number;
uint64_t version;

static constexpr SerializationMetadata Current() { return {0x123456789ABCDEF0, 1}; }

Expand Down
103 changes: 81 additions & 22 deletions src/include/miopen/solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2954,7 +2954,7 @@ struct ConvCkIgemmFwdV6r1DlopsNchw final : ConvTunableSolver<PerformanceConvCkIg
bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override;
size_t GetWorkspaceSize(const ConvolutionContext&, const ProblemDescription&) const override;
bool MayNeedWorkspace() const override { return true; }
bool IsDynamic() const override { return true; }
bool IsDynamic() const override { return false; }
PerformanceConvCkIgemmFwdV6r1DlopsNchw
GetDefaultPerformanceConfig(const ConvolutionContext&,
const ProblemDescription&) const override;
Expand Down Expand Up @@ -4561,7 +4561,7 @@ struct PerformanceConfigHipImplicitGemmFwdXdlops
PerformanceConfigHipImplicitGemmFwdXdlops() : PerformanceConfigHipImplicitGemmFwdXdlops(0, "")
{
}
PerformanceConfigHipImplicitGemmFwdXdlops(bool)
explicit PerformanceConfigHipImplicitGemmFwdXdlops(bool)
: PerformanceConfigHipImplicitGemmFwdXdlops(0, "")
{
}
Expand Down Expand Up @@ -4610,6 +4610,7 @@ struct ConvHipImplicitGemmFwdXdlops final
ConvSolution GetSolution(const ConvolutionContext&,
const ProblemDescription&,
const PerformanceConfigHipImplicitGemmFwdXdlops&) const override;
/// \anchor igemm_get_wti_magic_number
// Magic Number Alert:
// Naive convolutions have GetWti() that return very small value (0.01f).
// This allows MIOpen to use Naive Solvers if no other applicable Solvers
Expand Down Expand Up @@ -4692,16 +4693,7 @@ struct ConvHipImplicitGemmBwdXdlops final
ConvSolution GetSolution(const ConvolutionContext&,
const ProblemDescription&,
const PerformanceConfigHipImplicitGemmBwdXdlops&) const override;
// Magic Number Alert:
// Naive convolutions have GetWti() that return very small value (0.01f).
// This allows MIOpen to use Naive Solvers if no other applicable Solvers
// have known WTIs. Right now this means that in case of find-db miss,
// the library will try to use Winograd or GEMM (whatever is faster according
// to their GetWti's), but if both are not applicable, the library will
// use Naive Solver
// Since we would like to use CK before naive, and use it instead (because
// we do expect that CK is faster than Naive), therefore we use a
// value bigger than 0.01f, e.g. 0.02f.
/// \ref igemm_get_wti_magic_number
float GetWti(const ConvolutionContext&, const ProblemDescription&) const override
{
return 0.02f;
Expand Down Expand Up @@ -4776,16 +4768,83 @@ struct ConvHipImplicitGemmGroupFwdXdlops final
ConvSolution GetSolution(const ConvolutionContext&,
const ProblemDescription&,
const PerformanceConfigHipImplicitGemmGroupFwdXdlops&) const override;
// Magic Number Alert:
// Naive convolutions have GetWti() that return very small value (0.01f).
// This allows MIOpen to use Naive Solvers if no other applicable Solvers
// have known WTIs. Right now this means that in case of find-db miss,
// the library will try to use Winograd or GEMM (whatever is faster according
// to their GetWti's), but if both are not applicable, the library will
// use Naive Solver
// Since we would like to use CK before naive, and use it instead (because
// we do expect that CK is faster than Naive), therefore we use a
// value bigger than 0.01f, e.g. 0.02f.
/// \ref igemm_get_wti_magic_number
float GetWti(const ConvolutionContext&, const ProblemDescription&) const override
{
return 0.02f;
};

private:
template <typename DataType>
bool CheckCKApplicability(const ProblemDescription&) const;
};

/// Tunable performance configuration for the 3D grouped forward convolution
/// implicit-GEMM solver (Composable Kernel backend). Holds the index of the
/// currently selected CK kernel instance, its string identifier, and the list
/// of instances applicable to the problem.
struct PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
    : PerfConfigBase<PerformanceConfigHipImplicitGemm3DGroupFwdXdlops>
{
    int index;                              // Index into valid_kernels of the selected instance.
    std::string kernel_id;                  // Identifier of the selected CK kernel instance.
    std::vector<std::string> valid_kernels; // CK instances applicable to the current problem.
    PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(int idx, std::string kernl_id)
        : index(idx), kernel_id(kernl_id)
    {
    }
    PerformanceConfigHipImplicitGemm3DGroupFwdXdlops()
        : PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(0, "")
    {
    }
    // explicit prevents accidental bool -> config conversions, consistent with
    // the same change applied to PerformanceConfigHipImplicitGemmFwdXdlops.
    explicit PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(bool)
        : PerformanceConfigHipImplicitGemm3DGroupFwdXdlops(0, "")
    {
    }
    void HeuristicInit(const ProblemDescription&);
    bool SetNextValue(const ProblemDescription&);
    bool IsValidValue() const;
    // Context-taking overload required by the tuning framework; validity does
    // not depend on the context here, so it forwards to the problem-only form.
    bool IsValid(const ConvolutionContext&, const ProblemDescription& problem) const
    {
        return IsValid(problem);
    }
    bool IsValid(const ProblemDescription&) const;
    // Serialization hook: only kernel_id is persisted; index and valid_kernels
    // are presumably reconstructed from it — confirm against PerfConfigBase usage.
    template <typename Self, typename F>
    static void Visit(Self&& s, F f)
    {
        f(s.kernel_id, "kernel_id");
    }
    bool operator==(const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops& other) const;

private:
    template <typename DataType>
    void Init(const ProblemDescription&);
    template <typename DataType>
    bool CheckIsSupportCKArgs(const ProblemDescription&) const;
};

struct ConvHipImplicitGemm3DGroupFwdXdlops final
: ConvTunableSolver<PerformanceConfigHipImplicitGemm3DGroupFwdXdlops>
{
const std::string& SolverDbId() const override
{
return GetSolverDbId<ConvHipImplicitGemm3DGroupFwdXdlops>();
}

PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
GetDefaultPerformanceConfig(const ConvolutionContext&,
const ProblemDescription&) const override;
bool IsValidPerformanceConfig(
const ConvolutionContext&,
const ProblemDescription&,
const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override;
PerformanceConfigHipImplicitGemm3DGroupFwdXdlops
Search(const ConvolutionContext&,
const ProblemDescription&,
const AnyInvokeParams& invoke_ctx) const override;
bool IsApplicable(const ConvolutionContext&, const ProblemDescription&) const override;
bool IsDynamic() const override { return true; }
ConvSolution
GetSolution(const ConvolutionContext&,
const ProblemDescription&,
const PerformanceConfigHipImplicitGemm3DGroupFwdXdlops&) const override;
/// \ref igemm_get_wti_magic_number
float GetWti(const ConvolutionContext&, const ProblemDescription&) const override
{
return 0.02f;
Expand Down
6 changes: 3 additions & 3 deletions src/include/miopen/utility/transposing_solver.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,9 @@ struct UniversalTransposeSolver : TransposePseudoSolver
const auto kernel = kernels.front();
return [kernel](const Handle& handle, const AnyInvokeParams& any_params) {
const auto& params = any_params.CastTo<TransposeInvokeParams>();
const auto& lens = GetNCDHW<unsigned long>(params.in_desc.GetLengths());
const auto& in_strides = GetNCDHW<unsigned long>(params.in_desc.GetStrides());
const auto& out_strides = GetNCDHW<unsigned long>(params.out_desc.GetStrides());
const auto& lens = GetNCDHW<uint64_t>(params.in_desc.GetLengths());
const auto& in_strides = GetNCDHW<uint64_t>(params.in_desc.GetStrides());
const auto& out_strides = GetNCDHW<uint64_t>(params.out_desc.GetStrides());

// clang-format off
handle.Run(kernel)(
Expand Down
6 changes: 3 additions & 3 deletions src/kernels/MIOpenDropout.cl
Original file line number Diff line number Diff line change
Expand Up @@ -273,13 +273,13 @@ void xorwow_lite_init(prngStates* cur_state,
cur_state->d += (uint)(offset)*362437;
}

__kernel void InitKernelState(__global prngStates* state)
__kernel void InitKernelState(__global prngStates* state, ulong prng_seed, ulong states_num)
{
for(uint gid = get_global_id(0); gid < STATES_NUM; gid += get_global_size(0))
for(uint gid = get_global_id(0); gid < states_num; gid += get_global_size(0))
{
prngStates state_gid;
xorwow_lite_init(&state_gid,
(unsigned long long)PRNG_SEED,
(unsigned long long)prng_seed,
(unsigned long long)gid,
(unsigned long long)0);

Expand Down
4 changes: 2 additions & 2 deletions src/kernels/gfx908.kdb.bz2
Git LFS file not shown
4 changes: 2 additions & 2 deletions src/kernels/gfx90a.kdb.bz2
Git LFS file not shown
1 change: 1 addition & 0 deletions src/mlo_dir_conv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ static auto GetImplicitGemmSolvers()
miopen::solver::ConvHipImplicitGemmFwdXdlops,
miopen::solver::ConvHipImplicitGemmBwdXdlops,
miopen::solver::ConvHipImplicitGemmGroupFwdXdlops,
miopen::solver::ConvHipImplicitGemm3DGroupFwdXdlops,
#endif // MIOPEN_BACKEND_HIP && MIOPEN_USE_COMPOSABLEKERNEL
miopen::solver::ConvAsmImplicitGemmGTCDynamicFwdDlopsNCHWC>{};
}
Expand Down
24 changes: 12 additions & 12 deletions src/ocl/convolutionocl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1237,24 +1237,24 @@ void ConvolutionBackwardBias(const Handle& handle,
{
kernels.front()(dy,
db,
uint(out_k),
uint(stride_k),
uint(stride_n),
uint(map_size_aligned),
uint(off_pix),
uint(total_work));
static_cast<unsigned>(out_k),
static_cast<unsigned>(stride_k),
static_cast<unsigned>(stride_n),
static_cast<unsigned>(map_size_aligned),
static_cast<unsigned>(off_pix),
static_cast<unsigned>(total_work));
}
else
{
handle.AddKernel(algo_name, network_config, program_name, kernel_name, vld, vgd, params)(
dy,
db,
uint(out_k),
uint(stride_k),
uint(stride_n),
uint(map_size_aligned),
uint(off_pix),
uint(total_work));
static_cast<unsigned>(out_k),
static_cast<unsigned>(stride_k),
static_cast<unsigned>(stride_n),
static_cast<unsigned>(map_size_aligned),
static_cast<unsigned>(off_pix),
static_cast<unsigned>(total_work));
}

if(miopen::CheckNumericsEnabled())
Expand Down
Loading

0 comments on commit feab6fd

Please sign in to comment.