fix(device_conf): Devices never actually got switched in multi device

narendasan · narendasan · commit f1d0a4325c26 · 2021-08-19T21:40:07.000-07:00
cases

Signed-off-by: Naren Dasan <naren@narendasan.com>
Signed-off-by: Naren Dasan <narens@nvidia.com>
diff --git a/core/runtime/CudaDevice.cpp b/core/runtime/CudaDevice.cpp
@@ -66,6 +66,15 @@ CudaDevice::CudaDevice(std::string device_info) {
   LOG_DEBUG("Deserialized Device Info: " << *this);
 }
 
+CudaDevice& CudaDevice::operator=(const CudaDevice& other) {
+  id = other.id;
+  major = other.major;
+  minor = other.minor;
+  device_type = other.device_type;
+  device_name = other.device_name;
+  return (*this);
+}
+
 std::string CudaDevice::serialize() {
   std::vector<std::string> content;
   content.resize(DEVICE_NAME_IDX + 1);
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
@@ -38,7 +38,9 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info) {
 }
 
 TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device) {
-  device_info = cuda_device;
+  auto most_compatible_device = get_most_compatible_device(cuda_device);
+  TRTORCH_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");
+  device_info = most_compatible_device.value();
   set_cuda_device(device_info);
 
   rt = std::shared_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(util::logging::get_logger()));
diff --git a/core/runtime/register_trt_op.cpp b/core/runtime/register_trt_op.cpp
@@ -11,75 +11,50 @@ namespace core {
 namespace runtime {
 
 // Checks if the context switch requred for device ID
-bool is_switch_required(const CudaDevice& curr_device, const CudaDevice& conf_device) {
+bool is_switch_required(const CudaDevice& curr_device, const CudaDevice& engine_device) {
   // If SM capability is not the same as configured then switch
-  if ((curr_device.major != conf_device.major) || (curr_device.minor != conf_device.minor)) {
+  if ((curr_device.major != engine_device.major) || (curr_device.minor != engine_device.minor)) {
     LOG_WARNING(
-        "Configured SM capability " << conf_device.getSMCapability()
+        "Configured SM capability " << engine_device.getSMCapability()
                                     << " does not match with current device SM capability "
                                     << curr_device.getSMCapability() << " (" << curr_device
                                     << "). Switching device context");
     return true;
   }
 
   // GPU case
-  if (conf_device.device_type == nvinfer1::DeviceType::kGPU) {
-    if (curr_device.device_name != conf_device.device_name) {
+  if (engine_device.device_type == nvinfer1::DeviceType::kGPU) {
+    if (curr_device.device_name != engine_device.device_name) {
       LOG_WARNING(
-          "Program compiled for " << conf_device.device_name << " but current CUDA device is " << curr_device
+          "Program compiled for " << engine_device.device_name << " but current CUDA device is " << curr_device
                                   << ". Attempting to switch device context for better compatibility");
       return true;
     }
   }
 
-  if (curr_device.id != conf_device.id) {
+  if (curr_device.id != engine_device.id) {
     LOG_WARNING(
-        "Configured Device ID: " << conf_device.id << " is different that current device ID: " << curr_device.id
-                                 << ". Moving input tensors to device: " << conf_device.id);
+        "Configured Device ID: " << engine_device.id << " is different that current device ID: " << curr_device.id
+                                 << ". Moving input tensors to device: " << engine_device.id);
     return true;
   }
 
   return false;
 }
 
-CudaDevice select_cuda_device(const CudaDevice& conf_device) {
-  int64_t device_id = -1;
-  auto dla_supported = get_dla_supported_SMs();
-
-  auto device_list = get_available_device_list().get_devices();
-
-  CudaDevice new_target_device;
-
-  for (auto device : device_list) {
-    auto compute_cap = device.second.getSMCapability();
-    // In case of DLA select the DLA supported device ID
-    if (conf_device.device_type == nvinfer1::DeviceType::kDLA) {
-      if (dla_supported.find(compute_cap) != dla_supported.end() &&
-          dla_supported[compute_cap] == device.second.device_name) {
-        device_id = device.second.id;
-        new_target_device = CudaDevice(device_id, nvinfer1::DeviceType::kDLA);
-        break;
-      }
-    } else if (conf_device.device_type == nvinfer1::DeviceType::kGPU) {
-      auto conf_sm = conf_device.getSMCapability();
-      if (compute_cap == conf_sm && device.second.device_name == conf_device.device_name) {
-        device_id = device.second.id;
-        new_target_device = CudaDevice(device_id, nvinfer1::DeviceType::kGPU);
-        break;
-      }
-    } else {
-      TRTORCH_THROW_ERROR("Unknown target device type detected from the compiled program (runtime.select_cuda_device)");
-      break;
-    }
-  }
+CudaDevice select_cuda_device(const CudaDevice& engine_device) {
+  auto new_target_device_opt = get_most_compatible_device(engine_device);
 
   // REVIEW: THIS DOES NOT LIST DLA PROBABLY, WHICH WE SHOULD
+  // TODO: I think this logic could be way simpler at execution time since if the tensors arent on the right
+  // device, its not going to run. We should just set device to engine device and maybe reset and memcpy tensors
+  // back to orginal device if needed.
   TRTORCH_CHECK(
-      device_id >= 0,
+      new_target_device_opt,
       "No compatible device found on system to run program.\n Program targets "
-          << conf_device << "\n Available targets: \n"
+          << engine_device << "\n Available targets: \n"
           << get_available_device_list().dump_list() << "\n(runtime.select_cuda_device)");
-  return new_target_device;
+  return new_target_device_opt.value();
 }
 
 std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
@@ -96,7 +71,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
     std::string target_device = "cuda:" + std::to_string(device.id);
 
     for (auto& in : inputs) {
-      in = in.to(at::kCUDA);
+      in = in.to(torch::Device(target_device));
     }
   }
 
diff --git a/core/runtime/runtime.cpp b/core/runtime/runtime.cpp
@@ -7,9 +7,72 @@ namespace trtorch {
 namespace core {
 namespace runtime {
 
+c10::optional<CudaDevice> get_most_compatible_device(const CudaDevice& target_device) {
+  LOG_DEBUG("Target Device: " << target_device);
+  auto device_options = find_compatible_devices(target_device);
+  if (device_options.size() == 0) {
+    return {};
+  } else if (device_options.size() == 1) {
+    return {device_options[0]};
+  }
+
+  CudaDevice best_match;
+  std::stringstream dev_list;
+  dev_list << "[" << std::endl;
+  for (auto device : device_options) {
+    dev_list << "    " << device << ',' << std::endl;
+    if (device.device_name == target_device.device_name && best_match.device_name != target_device.device_name) {
+      best_match = device;
+    } else if (device.device_name == target_device.device_name && best_match.device_name == target_device.device_name) {
+      if (device.id == target_device.id && best_match.id != target_device.id) {
+        best_match = device;
+      }
+    }
+  }
+  dev_list << ']';
+  LOG_DEBUG("Compatible device options: " << dev_list.str());
+
+  if (best_match.id == -1) {
+    LOG_DEBUG("No valid device options");
+    return {};
+  } else {
+    LOG_DEBUG("Selected: " << best_match);
+    return {best_match};
+  }
+}
+
+std::vector<CudaDevice> find_compatible_devices(const CudaDevice& target_device) {
+  auto dla_supported = get_dla_supported_SMs();
+  auto device_list = get_available_device_list().get_devices();
+
+  std::vector<CudaDevice> compatible_devices;
+
+  for (auto device : device_list) {
+    auto poss_dev_cc = device.second.getSMCapability();
+    if (target_device.device_type == nvinfer1::DeviceType::kDLA) {
+      if (dla_supported.find(poss_dev_cc) != dla_supported.end() &&
+          dla_supported[poss_dev_cc] == target_device.device_name) {
+        compatible_devices.push_back(device.second);
+      }
+    } else if (target_device.device_type == nvinfer1::DeviceType::kGPU) {
+      auto target_dev_cc = target_device.getSMCapability();
+      // If the SM Capabilities match, should be good enough to run
+      if (poss_dev_cc == target_dev_cc) {
+        compatible_devices.push_back(device.second);
+      }
+    } else {
+      TRTORCH_THROW_ERROR(
+          "Unknown target device type detected from the compiled program (runtime.find_compatible_devices)");
+      break;
+    }
+  }
+  return compatible_devices;
+}
+
 void set_cuda_device(CudaDevice& cuda_device) {
   TRTORCH_CHECK(
       (cudaSetDevice(cuda_device.id) == cudaSuccess), "Unable to set device: " << cuda_device << "as active device");
+  LOG_DEBUG("Setting " << cuda_device << " as active device");
 }
 
 CudaDevice get_current_device() {
diff --git a/core/runtime/runtime.h b/core/runtime/runtime.h
@@ -24,6 +24,7 @@ struct CudaDevice {
   CudaDevice();
   CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type);
   CudaDevice(std::string serialized_device_info);
+  CudaDevice& operator=(const CudaDevice& other);
   std::string serialize();
   std::string getSMCapability() const;
   friend std::ostream& operator<<(std::ostream& os, const CudaDevice& device);
@@ -33,6 +34,9 @@ void set_cuda_device(CudaDevice& cuda_device);
 // Gets the current active GPU (DLA will not show up through this)
 CudaDevice get_current_device();
 
+c10::optional<CudaDevice> get_most_compatible_device(const CudaDevice& target_device);
+std::vector<CudaDevice> find_compatible_devices(const CudaDevice& target_device);
+
 std::string serialize_device(CudaDevice& cuda_device);
 CudaDevice deserialize_device(std::string device_info);
 
diff --git a/tests/core/conversion/evaluators/test_aten_evaluators.cpp b/tests/core/conversion/evaluators/test_aten_evaluators.cpp
@@ -3,6 +3,7 @@
 #include "gtest/gtest.h"
 #include "tests/util/util.h"
 #include "torch/csrc/jit/ir/irparser.h"
+#include "torch/torch.h"
 
 TEST(Evaluators, DivIntEvaluatesCorrectly) {
   const auto graph = R"IR(
diff --git a/tests/cpp/BUILD b/tests/cpp/BUILD
@@ -94,10 +94,10 @@ cc_test(
     name = "test_multi_gpu_serdes",
     srcs = ["test_multi_gpu_serdes.cpp"],
     data = [
-        ":jit_models",
+        "//tests/modules:jit_models",
     ],
     deps = [
-        ":module_test",
+        ":cpp_api_test",
     ],
 )
 
diff --git a/tests/cpp/test_multi_gpu_serdes.cpp b/tests/cpp/test_multi_gpu_serdes.cpp
@@ -23,11 +23,12 @@ TEST_P(CppAPITests, CompiledModuleIsClose) {
   trt_results.push_back(trt_results_ivalues.toTensor());
 
   for (size_t i = 0; i < trt_results.size(); i++) {
-    ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i]), 2e-5));
+    ASSERT_TRUE(trtorch::tests::util::almostEqual(
+        jit_results[i], trt_results[i].reshape_as(jit_results[i]).to(torch::Device("cuda:0")), 2e-5));
   }
 }
 
 INSTANTIATE_TEST_SUITE_P(
     CompiledModuleForwardIsCloseSuite,
     CppAPITests,
-    testing::Values(PathAndInSize({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}})));
+    testing::Values(PathAndInSize({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}, 2e-5})));

Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,9 @@ TRTEngine::TRTEngine(std::vector<std::string> serialized_info) {`
`38`	`38`	`}`
`39`	`39`
`40`	`40`	`TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device) {`
`41`		`- device_info = cuda_device;`
	`41`	`+ auto most_compatible_device = get_most_compatible_device(cuda_device);`
	`42`	`+ TRTORCH_CHECK(most_compatible_device, "No compatible device was found for instantiating TensorRT engine");`
	`43`	`+ device_info = most_compatible_device.value();`
`42`	`44`	`set_cuda_device(device_info);`
`43`	`45`
`44`	`46`	`rt = std::shared_ptr<nvinfer1::IRuntime>(nvinfer1::createInferRuntime(util::logging::get_logger()));`
Original file line number	Diff line number	Diff line change
`@@ -23,11 +23,12 @@ TEST_P(CppAPITests, CompiledModuleIsClose) {`
`23`	`23`	`trt_results.push_back(trt_results_ivalues.toTensor());`
`24`	`24`
`25`	`25`	`for (size_t i = 0; i < trt_results.size(); i++) {`
`26`		`- ASSERT_TRUE(trtorch::tests::util::almostEqual(jit_results[i], trt_results[i].reshape_as(jit_results[i]), 2e-5));`
	`26`	`+ ASSERT_TRUE(trtorch::tests::util::almostEqual(`
	`27`	`+ jit_results[i], trt_results[i].reshape_as(jit_results[i]).to(torch::Device("cuda:0")), 2e-5));`
`27`	`28`	`}`
`28`	`29`	`}`
`29`	`30`
`30`	`31`	`INSTANTIATE_TEST_SUITE_P(`
`31`	`32`	`CompiledModuleForwardIsCloseSuite,`
`32`	`33`	`CppAPITests,`
`33`		`- testing::Values(PathAndInSize({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}})));`
	`34`	`+ testing::Values(PathAndInSize({"tests/modules/resnet18_traced.jit.pt", {{1, 3, 224, 224}}, 2e-5})));`