From d03483988817f6aa0c6812b34349359d4c721e7c Mon Sep 17 00:00:00 2001
From: Siva
Date: Mon, 5 Sep 2022 22:49:09 -0700
Subject: [PATCH] [OpenCLML] CLML Profiling fixes corresponding to OpenCL Timer
 recent changes.

---
 src/runtime/contrib/clml/clml_runtime.cc      | 60 ++++++++++++-------
 .../contrib/test_clml/infrastructure.py       |  6 +-
 .../python/contrib/test_clml/test_network.py  |  5 +-
 tests/python/contrib/test_clml/test_ops.py    |  4 +-
 4 files changed, 45 insertions(+), 30 deletions(-)

diff --git a/src/runtime/contrib/clml/clml_runtime.cc b/src/runtime/contrib/clml/clml_runtime.cc
index 7966c0e78b2d7..352eb3516fd2f 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -100,6 +100,29 @@ class CLMLRuntime : public JSONRuntimeBase {
    */
   const char* type_key() const override { return "clml"; }
 
+  /*!
+   * \brief get command queue instance from OpenCL workspace
+   *
+   * \return reference for OpenCL command queue
+   */
+  cl_command_queue GetCommadQueue(void) {
+    // Reuse the OpenCl work space from TVM Device API.
+    auto func = tvm::runtime::Registry::Get("device_api.opencl");
+    ICHECK(func != nullptr) << "Cannot find OpenCL device_api in registry";
+    auto device_api = static_cast<cl::OpenCLWorkspace*>(((*func)()).operator void*());
+    this->context = device_api->context;
+    bool queue_found = false;
+    for (size_t i = 0; i < device_api->devices.size(); ++i) {
+      if (device_api->devices[i] == device_id) {
+        this->queue = device_api->queues[i];
+        this->evts = &(device_api->events[i]);
+        queue_found = true;
+      }
+    }
+    ICHECK(queue_found != false) << "Device queue not found in OpenCL Workspace";
+    return this->queue;
+  }
+
   /*!
    * \brief Initialize runtime. Create CLML layer from JSON
    * representation.
@@ -146,22 +169,7 @@ class CLMLRuntime : public JSONRuntimeBase {
       LOG(WARNING) << "CLML Runtime Init: Qualcomm extn not present.\n";
       return;
     }
-
-    // Reuse the OpenCl work space from TVM Device API.
-    auto func = tvm::runtime::Registry::Get("device_api.opencl");
-    ICHECK(func != nullptr) << "Cannot find OpenCL device_api in registry";
-    auto device_api = static_cast<cl::OpenCLWorkspace*>(((*func)()).operator void*());
-    this->context = device_api->context;
-    bool queue_found = false;
-    for (size_t i = 0; i < device_api->devices.size(); ++i) {
-      if (device_api->devices[i] == device_id) {
-        this->queue = device_api->queues[i];
-        this->evts = &(device_api->events[i]);
-        queue_found = true;
-      }
-    }
-    ICHECK(queue_found != false) << "Device queue not found in OpenCL Workspace";
-
+    this->queue = GetCommadQueue();
     // Query and Get CLML Interface
     static const cl_uint MAX_VERSIONS = 256;
     cl_int majorVersions[MAX_VERSIONS];
@@ -220,7 +228,8 @@
                             cl_ml_tensor_layout_qcom layout = CL_TENSOR_LAYOUT_NCHW_QCOM) {
     cl_int result = 0;
     cl_event evt = NULL;
-    result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(queue, data, layout, tensor->tensor,
+    result = h_ClmlIntf->clEnqueueWriteMLTensorDataQCOM(GetCommadQueue(),
+                                                        data, layout, tensor->tensor,
                                                         tensor->memory,
                                                         0,     // n waitlist
                                                         NULL,  // waitlist
@@ -233,7 +242,8 @@
     cl_int result = 0;
     cl_event readEvent = NULL;
     // Read the output tensor
-    result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(queue, tensor->tensor, tensor->memory, data,
+    result = h_ClmlIntf->clEnqueueReadMLTensorDataQCOM(GetCommadQueue(),
+                                                       tensor->tensor, tensor->memory, data,
                                                        layout,
                                                        0,     // n waitlist
                                                        NULL,  // waitlist
@@ -253,6 +263,7 @@
    */
   void Run() override {
     cl_int result = 0;
+    this->queue = GetCommadQueue();
     for (size_t i = 0; i < input_nodes_.size(); ++i) {
       auto nid = input_nodes_[i];
       uint32_t eid = EntryID(nid, 0);
@@ -286,10 +297,15 @@
     }
 
     for (size_t i = 0; i < this->layer_.function.size(); ++i) {
-      this->evts->resize(this->evts->size() + 1);
-      cl_event* evt = &(this->evts->back());
-      result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
+      if (getenv("CLML_PROFILING")) {
+        this->evts->resize(this->evts->size() + 1);
+        cl_event* evt = &(this->evts->back());
+        result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
                                              this->layer_.descriptorSet, 0, NULL, evt);
+      } else {
+        result = h_ClmlIntf->clEnqueueMLOpQCOM(queue, this->layer_.function[i],
+                                               this->layer_.descriptorSet, 0, NULL, NULL);
+      }
       ICHECK(result == CL_SUCCESS) << "clEnqueueMLOpQCOM:" << result;
     }
 
@@ -449,7 +465,7 @@
     LOG(WARNING) << "CLML Tunning In Progress:";
     for (size_t i = 0; i < this->layer_.function.size(); ++i) {
       LOG(WARNING) << "CLML Tunning:" << i;
-      result = h_ClmlIntf->clTuneMLOpQCOM(queue, this->layer_.function[i],
+      result = h_ClmlIntf->clTuneMLOpQCOM(GetCommadQueue(), this->layer_.function[i],
                                           this->layer_.descriptorSet, this->tuning_cache, NULL);
       ICHECK(result == CL_SUCCESS) << "clTuneMLOpQCOM:" << result;
     }
diff --git a/tests/python/contrib/test_clml/infrastructure.py b/tests/python/contrib/test_clml/infrastructure.py
index 19901d733e4c5..0cf76079e8fba 100644
--- a/tests/python/contrib/test_clml/infrastructure.py
+++ b/tests/python/contrib/test_clml/infrastructure.py
@@ -73,11 +73,11 @@ class Device:
 
     connection_type = "tracker"
     host = "localhost"
-    port = 9090
+    port = 9150
     target = "opencl"
     target_host = "llvm -mtriple=aarch64-linux-gnu"
-    device_key = ""
-    cross_compile = ""
+    device_key = "android"
+    cross_compile = "aarch64-linux-android-g++"
 
     def __init__(self):
         """Keep remote device for lifetime of object."""
diff --git a/tests/python/contrib/test_clml/test_network.py b/tests/python/contrib/test_clml/test_network.py
index d89676f10e3a8..dd0268ed07db5 100644
--- a/tests/python/contrib/test_clml/test_network.py
+++ b/tests/python/contrib/test_clml/test_network.py
@@ -22,8 +22,7 @@
 from tvm import relay
 import tvm
 
-from test_clml.infrastructure import skip_runtime_test, build_and_run
-from test_clml.infrastructure import Device
+from infrastructure import skip_runtime_test, build_and_run, Device
 
 
 def _build_and_run_network(mod, params, inputs, data, device, atol, rtol):
@@ -86,7 +85,7 @@ def get_model():
         mobilenet = MobileNet(
             include_top=True, weights=None, input_shape=(224, 224, 3), classes=1000
         )
-        mobilenet.load_weights("mobilenet_1_0_224_tf.h5")
+        #mobilenet.load_weights("mobilenet_1_0_224_tf.h5")
 
         inputs = {mobilenet.input_names[0]: ((1, 3, 224, 224), "float32")}
         data = {}
diff --git a/tests/python/contrib/test_clml/test_ops.py b/tests/python/contrib/test_clml/test_ops.py
index 63f5bc168fd0f..cf11c44f3f0df 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -25,7 +25,7 @@
 from tvm import relay
 from tvm.ir import IRModule
 
-from test_clml.infrastructure import (
+from infrastructure import (
     skip_runtime_test,
     skip_codegen_test,
     build_and_run,
@@ -212,5 +212,5 @@ def test_batchnorm():
 
 
 if __name__ == "__main__":
-    # test_conv2d()
+    test_conv2d()
     test_batchnorm()
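
Usage sketch 1: with the Device defaults this patch sets in tests/python/contrib/test_clml/infrastructure.py, the CLML tests expect an RPC tracker on localhost:9150 and an Android device registered under the key "android". A minimal sketch of acquiring such a session is below; host, port, and device key come from the patch, while the priority and session_timeout values are illustrative assumptions.

# Minimal sketch: request a remote OpenCL device through the TVM RPC tracker.
# "localhost", 9150 and "android" mirror Device.host, Device.port and
# Device.device_key from this patch; priority/session_timeout are assumed values.
from tvm import rpc

tracker = rpc.connect_tracker("localhost", 9150)
remote = tracker.request("android", priority=0, session_timeout=600)
dev = remote.cl(0)  # OpenCL device whose command queue the CLML runtime reuses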
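Usage sketch 2: the Run() change above only enqueues per-layer cl_event objects when CLML_PROFILING is present in the environment of the process that executes the module, i.e. the RPC server on the device, so it has to be exported there (for example before launching the device-side tvm_rpc server) rather than on the host. The following is a hedged end-to-end sketch, not the test suite's exact flow; it assumes partition_for_clml is available from tvm.relay.op.contrib.clml, that mod and params hold a Relay model, that remote and dev come from the previous snippet, and that aarch64-linux-android-g++ (the cross_compile default above) is on the PATH.

# Hedged sketch: build a Relay module with CLML offload, cross-compile it with the
# cross_compile default from this patch, and run it on the remote device from above.
import tvm
from tvm import relay
from tvm.contrib import graph_executor
from tvm.relay.op.contrib import clml

target = tvm.target.Target("opencl", host="llvm -mtriple=aarch64-linux-gnu")
mod = clml.partition_for_clml(mod, params)  # offload supported ops to CLML
with tvm.transform.PassContext(opt_level=3):
    lib = relay.build(mod, target=target, params=params)

lib.export_library("clml_net.so", cc="aarch64-linux-android-g++")
remote.upload("clml_net.so")
rlib = remote.load_module("clml_net.so")

gmod = graph_executor.GraphModule(rlib["default"](dev))
gmod.run()
# Host-side timing; per-layer CLML events additionally require CLML_PROFILING on the device.
print(gmod.benchmark(dev, repeat=3))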