[OpenCLML] Refactor and introduce on chip memory and memory planner (#14922)

* Refactor and introduce on chip memory and memory planner

Introduced a thread context with CLMLWorkspace.
Organized the code into runtime, utils, and memory planners.
Introduced recording-queue support and on-chip memory support.
An on-chip memory allocation planner accommodates multiple tensors at a time.
A DDR memory planner reuses the underlying memory across multiple tensor
descriptors; a sketch of the planning idea follows.
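
Both planners boil down to liveness-based reuse: a tensor needs backing storage
only between its first and last use, so tensors with disjoint live ranges can
share one allocation, whether the pool is on-chip memory or DDR. The following
is a minimal illustrative sketch of that idea, not the code added by this
commit; all types and names are hypothetical.

    // Greedy liveness-based pool planner (illustrative only, hypothetical names).
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct TensorReq {
      int id;             // tensor identifier
      std::size_t bytes;  // requested size
      int first_use;      // index of the first op touching this tensor
      int last_use;       // index of the last op touching this tensor
    };

    struct PoolSlot {
      std::size_t bytes = 0;  // slot grows to the largest tensor it hosts
      int free_after = -1;    // op index after which the slot is free again
    };

    // Assigns each tensor a slot, reusing a slot whose occupant is already dead.
    std::vector<int> PlanMemory(const std::vector<TensorReq>& reqs,
                                std::vector<PoolSlot>* pool) {
      std::vector<int> assignment(reqs.size(), -1);
      for (const TensorReq& t : reqs) {  // reqs assumed sorted by first_use
        int chosen = -1;
        for (std::size_t s = 0; s < pool->size(); ++s) {
          if ((*pool)[s].free_after < t.first_use) {
            chosen = static_cast<int>(s);
            break;
          }
        }
        if (chosen < 0) {  // no reusable slot: extend the pool
          pool->push_back(PoolSlot{});
          chosen = static_cast<int>(pool->size() - 1);
        }
        (*pool)[chosen].bytes = std::max((*pool)[chosen].bytes, t.bytes);
        (*pool)[chosen].free_after = t.last_use;
        assignment[t.id] = chosen;
      }
      return assignment;
    }

    int main() {
      // Tensors 0 and 2 have disjoint lifetimes, so they share slot 0.
      std::vector<TensorReq> reqs = {{0, 4096, 0, 1}, {1, 8192, 1, 2}, {2, 2048, 2, 3}};
      std::vector<PoolSlot> pool;
      std::vector<int> assign = PlanMemory(reqs, &pool);
      for (std::size_t i = 0; i < assign.size(); ++i)
        std::cout << "tensor " << i << " -> slot " << assign[i] << "\n";
      std::cout << "pool slots: " << pool.size() << "\n";  // 2 slots for 3 tensors
    }

The same assignment logic serves either pool; only the backing allocation
differs (a fixed-capacity on-chip buffer versus DDR cl_mem objects).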

Dense layer support is refactored to use GEMM; the dimension mapping is
sketched below. CLML binary operators do not support broadcasting, so an
explicit broadcast op is introduced as a workaround.
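
The diff in clml_runner.cc below maps a Dense layer onto a GEMM: for an input
of shape [m, k] and weights stored as [n, k], the output [m, n] is
input x transpose(weight), which is why B is transposed whenever
in_shape[1] == wt_shape[1]. A small self-contained sketch of that bookkeeping
(illustrative names only, no CLML API calls):

    // Dense-as-GEMM shape bookkeeping, mirroring the MakeDense diff below.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    int main() {
      // Dense: output[m, n] = input[m, k] x transpose(weight[n, k]).
      std::vector<unsigned> in_shape = {1, 512};     // [batch, features] = [m, k]
      std::vector<unsigned> wt_shape = {1000, 512};  // [units, features] = [n, k]

      unsigned m = in_shape[0], n = wt_shape[0], k = wt_shape[1];
      // Weights are [n, k]; transposing B yields [k, n], which is exactly
      // the in_shape[1] == wt_shape[1] check in the diff.
      bool transpose_b = (in_shape[1] == wt_shape[1]);
      assert(transpose_b);
      std::printf("GEMM m=%u n=%u k=%u, B transform: %s\n", m, n, k,
                  transpose_b ? "TRANSPOSE" : "NONE");
    }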

The CLML SDK codegen is enhanced accordingly.

* Addressed review comments

* Added memory planner cpp_runtime tests

* Added gtest build rules for Android environments

* Addressed review comments

---------

Co-authored-by: Siva Rama Krishna Reddy B <sivb@blr-ubuntu-ripper.qualcomm.com>
srkreddy1238 and Siva Rama Krishna Reddy B authored Jun 5, 2023
1 parent 80079b6 commit 1366f2e
Showing 16 changed files with 2,088 additions and 597 deletions.
CMakeLists.txt: 2 changes (1 addition, 1 deletion)
@@ -498,6 +498,7 @@ include(cmake/modules/VTA.cmake)
include(cmake/modules/StandaloneCrt.cmake)
include(cmake/modules/CUDA.cmake)
include(cmake/modules/Hexagon.cmake) # This must come before logging.cmake
+ include(cmake/modules/contrib/CLML.cmake) # Must be before OpenCL.cmake
include(cmake/modules/OpenCL.cmake)
include(cmake/modules/OpenMP.cmake)
include(cmake/modules/Vulkan.cmake)
@@ -531,7 +532,6 @@ include(cmake/modules/contrib/ArmComputeLib.cmake)
include(cmake/modules/contrib/TensorRT.cmake)
include(cmake/modules/contrib/VitisAI.cmake)
include(cmake/modules/contrib/Verilator.cmake)
- include(cmake/modules/contrib/CLML.cmake)
include(cmake/modules/contrib/UMA.cmake)
include(cmake/modules/Git.cmake)
include(cmake/modules/LibInfo.cmake)
apps/cpp_clml/clml_runner.cc: 36 changes (20 additions, 16 deletions)
@@ -50,8 +50,8 @@ CLMLRunner::CLMLRunner(std::string name, ToolArgs& args, cl_platform_id arg_plat
context(arg_context),
device_id(arg_device_id),
queue(arg_queue) {
LOG(INFO) << "CLMLRunner Constructor: Input:" << r_args.input << " Output:" << r_args.output
<< " Params:" << r_args.params;
LOG(INFO) << "CLMLRunner Constructor:" << name << " Input:" << r_args.input
<< " Output:" << r_args.output << " Params:" << r_args.params;
cl_int result;

// Query and Get CLML Interface
@@ -648,25 +648,29 @@ void CLMLRunner::MakeConcatenate(
void CLMLRunner::MakeDense(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
- std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias_desc,
+ std::vector<cl_uint> in_shape, std::vector<cl_uint> wt_shape,
std::string dtype) {
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(MakeCLDataType(dtype));
cl_ml_op_qcom op = nullptr;
cl_int result;
+ cl_gemm_transform_qcom b_transform = CL_GEMM_TRANSFORM_NONE_QCOM;

- cl_ml_op_convolution_desc_qcom conv_desc = {CL_CONVOLUTION_MODE_CONVOLUTION_QCOM,
-                                             1,
-                                             4,
-                                             {0, 0},
-                                             {0, 0},
-                                             {1, 1},
-                                             {1, 1},
-                                             0,
-                                             cl_arithmetic_mode};
-
- result = h_ClmlIntf->clCreateMLOpConvolutionForwardQCOM(
-     this->context, 0, &conv_desc, input_desc->tensor, weight_desc->tensor, bias_desc->tensor,
-     output_desc->tensor, &op, tuning_cache);
+ if (in_shape[1] == wt_shape[1]) {
+   b_transform = CL_GEMM_TRANSFORM_TRANSPOSE_QCOM;
+ }
+
+ cl_ml_op_gemm_desc_qcom gemmDesc = {in_shape[0],                  // m
+                                     wt_shape[0],                  // n
+                                     wt_shape[1],                  // k
+                                     CL_GEMM_TRANSFORM_NONE_QCOM,  // A transform
+                                     b_transform,                  // B transform
+                                     {{1.0}, CL_FLOAT},            // alpha
+                                     {{0.0}, CL_FLOAT},            // beta
+                                     cl_arithmetic_mode};
+
+ result =
+     h_ClmlIntf->clCreateMLOpGemmQCOM(this->context, 0, &gemmDesc, input_desc->tensor,
+                                      weight_desc->tensor, output_desc->tensor, &op, tuning_cache);

CLML_SDK_TEST_AND_EXIT(op && result == CL_SUCCESS);
this->function.push_back(op);
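With the refactored signature (see the clml_runner.h diff below), callers pass
the input and weight shapes instead of a bias descriptor. A hypothetical call
for a 1x512 input against 1000x512 weights could look like this fragment;
runner and the three tensor descriptors are assumed to be set up elsewhere by
the harness:

    // Hypothetical usage of the refactored MakeDense (fragment, not a full program).
    std::vector<cl_uint> in_shape = {1, 512};     // m = 1, k = 512
    std::vector<cl_uint> wt_shape = {1000, 512};  // n = 1000, k = 512
    runner.MakeDense(input_desc, weight_desc, output_desc, in_shape, wt_shape, "float32");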
apps/cpp_clml/clml_runner.h: 2 changes (1 addition, 1 deletion)
@@ -178,7 +178,7 @@ class CLMLRunner {
void MakeDense(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> weight_desc,
std::shared_ptr<cl_ml_tensor_memory_desc_qcom> output_desc,
- std::shared_ptr<cl_ml_tensor_memory_desc_qcom> bias_desc, std::string dtype);
+ std::vector<cl_uint> in_shape, std::vector<cl_uint> wt_shape, std::string dtype);

/*! \brief SoftMax layer implementation */
void MakeSoftMax(std::shared_ptr<cl_ml_tensor_memory_desc_qcom> input_desc,
apps/cpp_clml/scripts/clml_codegen.py: 2 changes (1 addition, 1 deletion)
@@ -45,7 +45,7 @@ def main():
clml_mod = clml.partition_for_clml(mod, params)
libm = relay.build(
clml_mod,
target="opencl -device=adreno",
target="opencl",
target_host="llvm -mtriple=aarch64-linux-gnu",
params=params,
)
cmake/modules/OpenCL.cmake: 39 changes (27 additions, 12 deletions)
@@ -59,20 +59,35 @@ if(USE_OPENCL)
list(APPEND TVM_RUNTIME_LINKER_LIBS ${OpenCL_LIBRARIES})
endif()

- if(DEFINED USE_OPENCL_GTEST AND EXISTS ${USE_OPENCL_GTEST})
-   include(FetchContent)
-   FetchContent_Declare(googletest SOURCE_DIR "${USE_OPENCL_GTEST}")
-   set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-   FetchContent_MakeAvailable(googletest)
-   install(TARGETS gtest EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX})
+ if(DEFINED USE_OPENCL_GTEST)
+   if(EXISTS ${USE_OPENCL_GTEST})
+     include(FetchContent)
+     FetchContent_Declare(googletest SOURCE_DIR "${USE_OPENCL_GTEST}")
+     set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
+     FetchContent_MakeAvailable(googletest)
+     install(TARGETS gtest EXPORT ${PROJECT_NAME}Targets DESTINATION lib${LIB_SUFFIX})

message(STATUS "Found OpenCL gtest at ${USE_OPENCL_GTEST}")
message(STATUS "Found OpenCL gtest at ${USE_OPENCL_GTEST}")
set(Build_OpenCL_GTests ON)
+   elseif (ANDROID_ABI AND DEFINED ENV{ANDROID_NDK_HOME})
+     set(GOOGLETEST_ROOT $ENV{ANDROID_NDK_HOME}/sources/third_party/googletest)
+     add_library(gtest_main STATIC ${GOOGLETEST_ROOT}/src/gtest_main.cc ${GOOGLETEST_ROOT}/src/gtest-all.cc)
+     target_include_directories(gtest_main PRIVATE ${GOOGLETEST_ROOT})
+     target_include_directories(gtest_main PUBLIC ${GOOGLETEST_ROOT}/include)
+     message(STATUS "Using gtest from Android NDK")
+     set(Build_OpenCL_GTests ON)
+   endif()

-   tvm_file_glob(GLOB_RECURSE OPENCL_TEST_SRCS
-     "${CMAKE_SOURCE_DIR}/tests/cpp-runtime/opencl/*.cc"
-   )
-   add_executable(opencl-cpptest ${OPENCL_TEST_SRCS})
-   target_link_libraries(opencl-cpptest PRIVATE gtest_main tvm_runtime)
+   if(Build_OpenCL_GTests)
+     message(STATUS "Building OpenCL-Gtests")
+     tvm_file_glob(GLOB_RECURSE OPENCL_TEST_SRCS
+       "${CMAKE_SOURCE_DIR}/tests/cpp-runtime/opencl/*.cc"
+     )
+     add_executable(opencl-cpptest ${OPENCL_TEST_SRCS})
+     target_link_libraries(opencl-cpptest PRIVATE gtest_main tvm_runtime)
+   else()
+     message(STATUS "Couldn't build OpenCL-Gtests")
+   endif()
endif()
list(APPEND RUNTIME_SRCS ${RUNTIME_OPENCL_SRCS})
if(USE_OPENCL_ENABLE_HOST_PTR)
[Diffs for the remaining 12 changed files are not shown.]
