diff --git a/include/tvm/ir/function.h b/include/tvm/ir/function.h index 5ee719f9964f8..e466cde097ac1 100644 --- a/include/tvm/ir/function.h +++ b/include/tvm/ir/function.h @@ -191,24 +191,24 @@ constexpr const char* kTarget = "target"; constexpr const char* kGlobalSymbol = "global_symbol"; /*! - * \brief The device type which will hold each of the functions parameters. + * \brief The SEScope which will hold each of the function's parameters. * * Only supported on Relay \p Functions. Generally added by the \p PlanDevices pass, but * may be included as an annotation on user programs. * - * Type: Array<Integer> (but interpreted as Array<DLDeviceType>) + * Type: Array<SEScope> */ -constexpr const char* kParamDeviceTypes = "param_device_types"; +constexpr const char* kParamSEScopes = "param_se_scopes"; /*! - * \brief The device type which will hold the function result. + * \brief The SEScope which will hold the function result. * * Only supported on Relay \p Functions. Generally added by the \p PlanDevices pass, but * may be included as an annotation on user programs. * - * Type: Integer (but interpreted as DLDeviceType) + * Type: SEScope */ -constexpr const char* kResultDeviceType = "result_device_type"; +constexpr const char* kResultSEScope = "result_se_scope"; } // namespace attr } // namespace tvm diff --git a/include/tvm/relay/attrs/annotation.h b/include/tvm/relay/attrs/annotation.h index 85ac3f36ff607..30839d725aab0 100644 --- a/include/tvm/relay/attrs/annotation.h +++ b/include/tvm/relay/attrs/annotation.h @@ -25,74 +25,13 @@ #define TVM_RELAY_ATTRS_ANNOTATION_H_ #include +#include #include namespace tvm { namespace relay { -/*! - * \brief Attributes for the "on_device" special operator. - * - * The Relay call (aka 'annotation'): - * \code - * on_device(sub_expr, device_type=2) - * \endcode - * constrains \p sub_expr to execute and store its result on a device with \p DLDeviceType \p 2 - * (i.e. a \p kDLCuda device). However the annotation itself may appear in an expression to be - * executed and stored on a different device. If so the compiler will automatically insert a - * "device_copy" call to mediate the transition between devices. - * - * E.g.: Assuming %x and %y reside on the GPU and %z on the CPU then: - * \code - * multiply(on_device(add(%x, %y), device_type=2), %z) - * \endcode - * indicates the \p add should execute on the GPU but the \p multiply should execute on the CPU. - * The compiler will rewrite this to: - * \code - * multiply(device_copy(add(%x, %y), src_dev_type=2, dst_dev_type=1), %z) - * \endcode - * - * The Relay call - * \code - * on_device(sub_expr, device_type=2, is_fixed=True) - * \endcode - * is similar to the above, however the annotation itself must appear in an expression on the - * same device. The compiler will check the devices are consistent, and will not insert any - * "device_copy" call. This form of annotation shouldn't be necessary in user programs. However - * it is needed by the \p PlanDevices pass to fully specify the results of device planning so that - * the pass is idempotent. - * - * E.g.: The following program is equivalent to the above: - * \code - * let %a = on_device(add(%x, %y), device_type=2, is_fixed=True) - * multiply(device_copy(%a, src_dev_type=2, dst_dev_type=1), %z) - * \endcode - * The "on_device" annotation with \p is_fixed=True indicates unambiguously that \p %a is stored - * on the GPU. - */ -struct OnDeviceAttrs : public tvm::AttrsNode<OnDeviceAttrs> { - // TODO(mbs): Replace device types with TargetDevice. - /*!
\brief Device type on which argument expression should be evaluated. */ - int device_type = kInvalidDeviceType; - /*! - * \brief If true, the result device must also be \p device_type and device planning should - * not insert any "device_copy" calls to respect this annotation. - * - * This is used by the device planning pass itself when annotating the planned program. - */ - bool is_fixed = false; - - TVM_DECLARE_ATTRS(OnDeviceAttrs, "relay.attrs.OnDeviceAttrs") { - TVM_ATTR_FIELD(device_type) - .describe("The type of the virtual device which should hold the expression result.") - .set_default(0); - TVM_ATTR_FIELD(is_fixed) - .describe("If true, do not insert a \"device_copy\" call to respect this annotation.") - .set_default(false); - } -}; - /*! * \brief Annotate an expression to be cast into specific data type. */ @@ -118,6 +57,8 @@ struct CompilerAttrs : public tvm::AttrsNode<CompilerAttrs> { /*! * \brief Metadata for calls to TIR functions, useful for program analysis crossing Relay and TIR. + * + * TODO(mbs): Replace with typed fields once attributes have stabilized. */ struct TIRCallAttrs : public tvm::AttrsNode<TIRCallAttrs> { /*! \brief The metadata attached to the call node. */ diff --git a/include/tvm/relay/attrs/device_copy.h b/include/tvm/relay/attrs/device_copy.h index f7b0a04f45fa8..6d97ab79be4a2 100644 --- a/include/tvm/relay/attrs/device_copy.h +++ b/include/tvm/relay/attrs/device_copy.h @@ -25,6 +25,7 @@ #define TVM_RELAY_ATTRS_DEVICE_COPY_H_ #include +#include #include @@ -35,17 +36,14 @@ namespace relay { * \brief Options for the device copy operators. */ struct DeviceCopyAttrs : public tvm::AttrsNode<DeviceCopyAttrs> { - // TODO(mbs): Should be TargetDevice. - int dst_dev_type; - int src_dev_type; + SEScope src_se_scope = SEScope::FullyUnconstrained(); + SEScope dst_se_scope = SEScope::FullyUnconstrained(); TVM_DECLARE_ATTRS(DeviceCopyAttrs, "relay.attrs.DeviceCopyAttrs") { - TVM_ATTR_FIELD(src_dev_type) - .describe("The virtual device/context type where the op copies data from.") - .set_default(0); - TVM_ATTR_FIELD(dst_dev_type) - .describe("The virtual device/context type where the op copies data to.") - .set_default(0); + TVM_ATTR_FIELD(src_se_scope) + .describe("The (virtual) device and scope where the op copies data from."); + TVM_ATTR_FIELD(dst_se_scope) + .describe("The (virtual) device and scope where the op copies data to."); } }; diff --git a/include/tvm/relay/attrs/memory.h b/include/tvm/relay/attrs/memory.h index 85462c087cee0..952d4affc5849 100644 --- a/include/tvm/relay/attrs/memory.h +++ b/include/tvm/relay/attrs/memory.h @@ -26,6 +26,7 @@ #include #include +#include #include #include @@ -42,15 +43,13 @@ Expr ToTupleType(const Type& t, const std::vector<Expr>& exprs); */ struct AllocStorageAttrs : public tvm::AttrsNode<AllocStorageAttrs> { DataType dtype; - int device_id; - int device_type; + SEScope se_scope = SEScope::FullyUnconstrained(); TVM_DECLARE_ATTRS(AllocStorageAttrs, "relay.attrs.AllocStorageAttrs") { TVM_ATTR_FIELD(dtype) .describe("The dtype of the tensor to allocate.") .set_default(DataType::Float(32, 1)); - TVM_ATTR_FIELD(device_id).describe("The device id on which to allocate memory."); - TVM_ATTR_FIELD(device_type).describe("The device type on which to allocate memory."); + TVM_ATTR_FIELD(se_scope).describe("The SEScope on which to allocate memory."); } }; diff --git a/include/tvm/relay/attrs/on_device.h b/include/tvm/relay/attrs/on_device.h new file mode 100644 index 0000000000000..405926e209c69 --- /dev/null +++ b/include/tvm/relay/attrs/on_device.h @@ -0,0 +1,101 @@ +/* + * Licensed to the Apache
Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/attrs/on_device.h + * \brief Attribute for the on device annotation. + */ +#ifndef TVM_RELAY_ATTRS_ON_DEVICE_H_ +#define TVM_RELAY_ATTRS_ON_DEVICE_H_ + +#include +#include + +#include + +namespace tvm { +namespace relay { + +/*! + * \brief Attributes for the "on_device" special operator. + * + * The Relay call (aka 'annotation'): + * \code + * on_device(sub_expr, se_scope=S) + * \endcode + * constrains \p sub_expr to execute and store its result on the \p SEScope \p S. + * However the annotation itself may appear in an expression to be executed and stored on a + * different \p SEScope. If so the compiler will automatically insert a "device_copy" call to + * mediate the transition between \p SEScopes. + * + * E.g.: Assuming %x and %y reside on the GPU and %z on the CPU then: + * \code + * multiply(on_device(add(%x, %y), se_scope=GPU), %z) + * \endcode + * indicates the \p add should execute on the GPU but the \p multiply should execute on the CPU. + * The compiler will rewrite this to: + * \code + * multiply(device_copy(add(%x, %y), src_se_scope=GPU, dst_se_scope=CPU), %z) + * \endcode + * + * The Relay call + * \code + * on_device(sub_expr, se_scope=S, is_fixed=True) + * \endcode + * is similar to the above, however the annotation itself must appear in an expression on the + * same \p SEScope \p S. The compiler will check the \p SEScopes are consistent, and will not + * insert any "device_copy" call. This form of annotation shouldn't be necessary in user programs. + * However it is needed by the \p PlanDevices pass to fully specify the results of device planning + * so that the pass is idempotent. + * + * E.g.: The following program is equivalent to the above: + * \code + * let %a = on_device(add(%x, %y), se_scope=GPU, is_fixed=True) + * multiply(device_copy(%a, src_se_scope=GPU, dst_se_scope=CPU), %z) + * \endcode + * The "on_device" annotation with \p is_fixed=True indicates unambiguously that \p %a is stored + * on the GPU. + */ +struct OnDeviceAttrs : public tvm::AttrsNode<OnDeviceAttrs> { + /*! + * \brief (Virtual) \p SEScope on which the result of the argument expression should be stored. + */ + SEScope se_scope = SEScope::FullyUnconstrained(); + /*! + * \brief If true, the result \p SEScope must also be \p se_scope, and device planning should + * not insert any "device_copy" calls to respect this annotation. + * + * This is used by the device planning pass itself when annotating the planned program.
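+ *
+ * For reference, a downstream pass can recover this annotation with the usual TVM attribute
+ * idiom. A minimal sketch only (not code in this header; \p call names an arbitrary CallNode):
+ * \code
+ * if (const auto* attrs = call->attrs.as<OnDeviceAttrs>()) {
+ *   // attrs->se_scope constrains where the argument's result is stored;
+ *   // attrs->is_fixed == true means device planning may not insert a "device_copy".
+ * }
+ * \endcode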
+ */ + bool is_fixed = false; + + TVM_DECLARE_ATTRS(OnDeviceAttrs, "relay.attrs.OnDeviceAttrs") { + TVM_ATTR_FIELD(se_scope) + .describe("The (virtual) device and scope holding the expression result.") + .set_default(SEScope::FullyUnconstrained()); + TVM_ATTR_FIELD(is_fixed) + .describe("If true, do not insert a \"device_copy\" call to respect this annotation.") + .set_default(false); + } +}; + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_ATTRS_ON_DEVICE_H_ diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index e740776d6d4f4..aa9d3b41554c5 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -30,6 +30,8 @@ #include #include #include +#include +#include #include #include @@ -437,23 +439,27 @@ TVM_DLL Pass RelayToTIRTargetHook(); * \brief A pass for manifesting explicit memory allocations and rewriting * specific dialects. * - * \param target_host The target used by the host for compilation. - * \param targets The device type and target pairs for compilation. + * \param cpu_se_scope SEScope for computations and data which must reside on a CPU, such as + * shapes and shape functions. * * \return The pass. */ -TVM_DLL Pass ManifestAlloc(Target target_host, Map targets); +TVM_DLL Pass ManifestAlloc(SEScope cpu_se_scope); /*! - * \brief Uses existing "on_device" and "device_copy" CallNodes to infer the device on which - * every Relay sub-expression should run (and the result stored). Captures the result of that - * analysis using new "on_device" and "device_copy" CallNodes. See - * tvm::relay::transform::{LexicalOnDeviceMixin,DeviceAwareExprVisitor,DeviceAwareExprMutator} + * \brief Uses existing "on_device" and "device_copy" CallNodes to infer the \p SEScope on which + * every Relay sub-expression should run and the result stored. Captures the result of that + * analysis using new "on_device" and "device_copy" CallNodes. + * + * See tvm::relay::transform::{LexicalOnDeviceMixin,DeviceAwareExprVisitor,DeviceAwareExprMutator} * for help recovering the device for an arbitrary sub-expression in downstream transformations. * - * \param default_device_type DLDeviceType for default device. + * \param config Describes the targets and default \p SEScope for all primitive operators and + * host sub-expressions. + * + * \return The pass. */ -TVM_DLL Pass PlanDevices(DLDeviceType default_device_type); +TVM_DLL Pass PlanDevices(CompilationConfig config); } // namespace transform diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h index 72a557fa93b1e..a2a64d76ce869 100644 --- a/include/tvm/runtime/vm/bytecode.h +++ b/include/tvm/runtime/vm/bytecode.h @@ -176,6 +176,7 @@ struct Instruction { RegName object; } get_tag; struct /* AllocADT Operands */ { + // TODO(mbs): Needs a DeviceAndScope. /*! \brief The datatype's constructor tag. */ Index constructor_tag; /*! \brief The number of fields to store in the datatype. */ @@ -184,6 +185,7 @@ struct Instruction { RegName* datatype_fields; }; struct /* AllocClosure Operands */ { + // TODO(mbs): Needs a DeviceAndScope. /*! \brief The index into the function table. */ Index clo_index; /*! \brief The number of free variables to capture. */ @@ -198,8 +200,8 @@ struct Instruction { Index alignment; /*! \brief The hint of the dtype. */ DLDataType dtype_hint; - /*! \brief The device type of the allocation. */ - Index device_type; + /*! \brief The index of the device on which the allocation will be made. 
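+ * The index refers to an entry in the executable's \p virtual_devices vector (see
+ * \p Executable::virtual_devices in include/tvm/runtime/vm/executable.h).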
*/ + Index device_index; } alloc_storage; struct /* ShapeOf Operands */ { RegName tensor; @@ -210,11 +212,11 @@ } reshape_tensor; struct /* DeviceCopy Operands */ { RegName src; - /*! \brief The source device type. */ - Index src_device_type; - /*! \brief The destination device type. */ - Index dst_device_type; - }; + /*! \brief The index of the source device to copy from. */ + Index src_device_index; + /*! \brief The index of the destination device to copy to. */ + Index dst_device_index; + } device_copy; }; /*! @@ -352,12 +354,12 @@ * \param size The size of the allocation. * \param alignment The allocation's alignment. * \param dtype_hint The data type hint for the allocator. - * \param device_type The device type for the allocator. + * \param device_index The index of the device to allocate on. * \param dst The destination to place the storage. * \return The alloc storage instruction. */ static Instruction AllocStorage(RegName size, Index alignment, DLDataType dtype_hint, - Index device_type, RegName dst); + Index device_index, RegName dst); /*! * \brief Get the shape of an input tensor. * \param tensor The input tensor. @@ -376,12 +378,12 @@ /*! * \brief Copy tensor cross different devices. * \param src The source register. - * \param src_device_type The device type of the tensor for the source register. - * \param dst_device_type The device type of the tensor ofr the destination register. + * \param src_device_index The index of the device holding the tensor in the source register. + * \param dst_device_index The index of the device to hold the tensor in the destination register. * \param dst The destination register to store the copied tensor. * \return The device copy instruction. */ - static Instruction DeviceCopy(RegName src, Index src_device_type, Index dst_device_type, + static Instruction DeviceCopy(RegName src, Index src_device_index, Index dst_device_index, RegName dst); Instruction(); diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 6e564fd623802..311667904df61 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -132,12 +132,18 @@ class Executable : public ModuleNode { std::string GetBytecode() const; /*! - * \brief Returns a description of all the contants in the executable in human-readable + * \brief Returns a description of all the constants in the executable in human-readable * format. Not intended to be machine readable, but rather to help with debugging and * diffing generated code. */ std::string GetConstants() const; + /*! + * \brief Returns a description of all the (virtual) devices in the executable in human-readable + * format. + */ + std::string GetVirtualDevices() const; + /*! * \brief Print the detailed statistics of the given code, i.e. number of * globls and constants, etc. @@ -183,6 +189,16 @@ const char* type_key() const final { return "VMExecutable"; } + /*! + * \brief The (compile-time, virtual) devices corresponding to each device index. + * Currently we only support at most one device per device type. + */ + std::vector<Device> virtual_devices; + /*! + * \brief The device index corresponding to the 'host' device, which holds and evaluates + * shape-related data and code. + */ + int host_device_index = -1; /*! \brief The global constant pool. */ std::vector<ObjectRef> constants; /*! \brief A map from globals (as strings) to their index in the function map.
*/ @@ -195,38 +211,52 @@ std::map<Index, Map<String, ObjectRef>> op_attrs; /*! \brief The virtual machine's function table. */ std::vector<VMFunction> functions; - /*! \brief The device type for each constant. */ - std::vector<Index> const_device_type; + /*! \brief The index of the device holding each constant. */ + std::vector<Index> const_device_indexes; private: + /*! + * \brief Save the virtual devices. + * + * \param strm The output stream. + */ + void SaveVirtualDevicesSection(dmlc::Stream* strm); + /*! * \brief Save the globals. * - * \param strm The input stream. + * \param strm The output stream. */ void SaveGlobalSection(dmlc::Stream* strm); /*! * \brief Save the constant pool. * - * \param strm The input stream. + * \param strm The output stream. */ void SaveConstantSection(dmlc::Stream* strm); /*! * \brief Save primitive op names. * - * \param strm The input stream. + * \param strm The output stream. */ void SavePrimitiveOpNames(dmlc::Stream* strm); /*! * \brief Save the vm functions. * - * \param strm The input stream. + * \param strm The output stream. */ void SaveCodeSection(dmlc::Stream* strm); + /*! + * \brief Load the virtual devices. + * + * \param strm The input stream. + */ + void LoadVirtualDevicesSection(dmlc::Stream* strm); + /*! * \brief Load the globals. * diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h index ece73fcfda34d..604c97330d995 100644 --- a/include/tvm/runtime/vm/vm.h +++ b/include/tvm/runtime/vm/vm.h @@ -82,20 +82,20 @@ struct VMFunction { /*! \brief The instructions representing the function. */ std::vector<Instruction> instructions; /*! \brief The size of the frame for this function */ - Index register_file_size; - /*! \brief The device type of each parameter for this function. */ - std::vector<Index> params_device_type; - - VMFunction(const std::string& name, std::vector<std::string> params, - const std::vector<Instruction>& instructions, Index register_file_size, - const std::vector<Index> params_device_type = {}) - : name(name), - params(params), - instructions(instructions), + Index register_file_size = 0; + /*! \brief The indexes for the device holding each function parameter. */ + std::vector<Index> param_device_indexes; + + VMFunction(std::string name, std::vector<std::string> params, + std::vector<Instruction> instructions, Index register_file_size, + std::vector<Index> param_device_indexes) + : name(std::move(name)), + params(std::move(params)), + instructions(std::move(instructions)), register_file_size(register_file_size), - params_device_type(params_device_type) {} + param_device_indexes(std::move(param_device_indexes)) {} - VMFunction() {} + VMFunction() = default; friend std::ostream& operator<<(std::ostream& os, const VMFunction&); }; @@ -239,17 +239,19 @@ Index output_size, const std::vector<ObjectRef>& args); /*! - * \brief Initialize the virtual machine for a set of devices. - * \param devices The set of TVM devices. + * \brief Initialize the virtual machine for a set of (physical) devices. + * \param physical_devices The set of TVM devices. * \param alloc_types The allocator types for each device. */ - void Init(const std::vector<Device>& devices, const std::vector<AllocatorType>& alloc_types); + void Init(const std::vector<Device>& physical_devices, + const std::vector<AllocatorType>& alloc_types); /*! \brief Run VM dispatch loop. */ void RunLoop(); - /*! \brief Get device from the device list based on a given device type. */ - Device GetDevice(Index device_type) const; + /*! \brief Get device from the device list based on a given device index.
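+ * The index must be in range for the (physical) devices vector supplied to \p Init.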
*/ + Device GetDevice(Index device_index) const; + Allocator* GetAllocator(Index device_index) const; /*! * \brief Invoke a global setting up the VM state to execute. @@ -301,9 +303,13 @@ class VirtualMachine : public runtime::ModuleNode { const Executable* exec_; /*! \brief The function name to inputs mapping. */ std::unordered_map> inputs_; - /*! \brief The set of TVM devices the VM is currently executing on. */ + /*! + * \brief The "physical" devices the VM can execute primitives on. All "device indexes" + * are w.r.t. this vector. Each entry in this vector must match the corresponding entry + * in the executable's "virtual" devices vector. + */ std::vector devices_; - /*! \brief The cached memory allocators. */ + /*! \brief The cached memory allocators, one per device. */ std::vector allocators_; /*! * \brief The constant pool for runtime. It caches the device dependent diff --git a/python/tvm/micro/contrib/stm32/emitter.py b/python/tvm/micro/contrib/stm32/emitter.py index 8453ea78e012a..aec5912871fd5 100644 --- a/python/tvm/micro/contrib/stm32/emitter.py +++ b/python/tvm/micro/contrib/stm32/emitter.py @@ -44,7 +44,7 @@ def _fix_name(node_name): - """ Replace ':' with '_' in names like 'InputImg:0' """ + """Replace ':' with '_' in names like 'InputImg:0'""" return node_name.replace(":", "_") @@ -116,7 +116,7 @@ def _get_tensor_size_bytes(dims, dltype): def _preprocess_code(src): - """ Hack the C code implementing the model. """ + """Hack the C code implementing the model.""" dst = "#include \n" "#include \n\n" dst = dst + src return dst @@ -193,7 +193,7 @@ def __init__(self, include_activations=True, include_inputs=True, include_output self._quantization = {} def _extract_quantization_info(self, quantization): - """ Build dictionary with quantization infos.""" + """Build dictionary with quantization infos.""" for dl_tensor_name in self._input_data: if dl_tensor_name in quantization: @@ -258,7 +258,7 @@ def _get_tensor_from_node(self, nid, idx): return tensor def _compute_data_placement(self): - """ Compute inputs, outputs, weight, activation sizes""" + """Compute inputs, outputs, weight, activation sizes""" self._inputs = self._arg_nodes.copy() @@ -548,7 +548,7 @@ def parse_module(self, module, quantization=None): self._parse_model(quantization) def _emit_params_data(self, name, out_h, out_c): - """ Emits the network_data[c,h] files with parameters.""" + """Emits the network_data[c,h] files with parameters.""" name_upper = name.upper() @@ -674,7 +674,7 @@ def _emit_open(self, name, out_h, out_c): ) def _emit_close(self, name, out_h, out_c): - """ Emits the ai_model_info structure. """ + """Emits the ai_model_info structure.""" name_upper = name.upper() @@ -794,7 +794,7 @@ def _emit_tensor_quant(self, dl_tensor_name, out_c): return None def _emit_tensor_init(self, dl_tensor_name, tensor, out_c): - """ Emits the tensor instantiation code. 
""" + """Emits the tensor instantiation code.""" dltype = tensor["dltype"] dims = tensor["dims"] @@ -838,7 +838,7 @@ def _emit_tensor_init(self, dl_tensor_name, tensor, out_c): def _emit_activation_buffers(self, name, out_c): # pylint: disable=unused-argument - """ Emits activation tensors, including inputs/outputs.""" + """Emits activation tensors, including inputs/outputs.""" out_c.write( textwrap.dedent( @@ -905,7 +905,7 @@ def _emit_activation_buffers(self, name, out_c): out_c.write(f"\n") def _emit_params_buffers(self, name, out_c): - """ Emits all parameter tensors.""" + """Emits all parameter tensors.""" out_c.write( textwrap.dedent( @@ -922,7 +922,7 @@ def _emit_params_buffers(self, name, out_c): out_c.write(f"\n") def _emit_network(self, name, out_c): - """ Emits prototypes for the network operator functions.""" + """Emits prototypes for the network operator functions.""" out_c.write( textwrap.dedent( @@ -967,7 +967,7 @@ def _emit_tensor_activation(self, dl_tensor_name, tensor, out_c): ) def _emit_activation_init(self, name, out_c): - """ Emits buffer initialization code for activation tensors.""" + """Emits buffer initialization code for activation tensors.""" out_c.write( textwrap.dedent( @@ -1015,7 +1015,7 @@ def _emit_activation_init(self, name, out_c): ) def _emit_params_init(self, name, out_c): - """ Emits buffer initialization code for params tensors.""" + """Emits buffer initialization code for params tensors.""" out_c.write( textwrap.dedent( @@ -1063,13 +1063,13 @@ def _emit_params_init(self, name, out_c): ) def _emit_init(self, name, out_c): - """ Emits buffer initialization code.""" + """Emits buffer initialization code.""" self._emit_activation_init(name, out_c) self._emit_params_init(name, out_c) def _emit_run(self, name, out_h, out_c): - """ Emits the run function code.""" + """Emits the run function code.""" out_h.write( textwrap.dedent( @@ -1230,7 +1230,7 @@ def _emit_run(self, name, out_h, out_c): out_c.write(f"\n") def _emit_create_destroy(self, name, out_h, out_c): - """ Emits the create/destroy functions.""" + """Emits the create/destroy functions.""" out_h.write( textwrap.dedent( @@ -1296,7 +1296,7 @@ def _emit_create_destroy(self, name, out_h, out_c): ) def emit_code(self, dest_dir, model_name): - """ Emits the C code implementing the model. """ + """Emits the C code implementing the model.""" # Build the directory structure if os.path.exists(dest_dir): diff --git a/python/tvm/relay/op/annotation/annotation.py b/python/tvm/relay/op/annotation/annotation.py index f5f8870ab0153..cf70dc6e267e5 100644 --- a/python/tvm/relay/op/annotation/annotation.py +++ b/python/tvm/relay/op/annotation/annotation.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. """Annotation operations.""" +from tvm import target from tvm.runtime import ndarray as _nd from tvm.runtime import Device as _Device @@ -22,11 +23,11 @@ from .. import op as reg -def _device_to_int(device): +def _make_se_scope(device): if isinstance(device, _Device): - return device.device_type + return target.make_se_scope(device) if isinstance(device, str): - return _nd.device(device).device_type + return target.make_se_scope(_nd.device(device)) raise ValueError("expecting a Device or device name, but received a %s" % (type(device))) @@ -39,7 +40,7 @@ def on_device(data, device, is_fixed=False): The expression to be annotated. device : Union[:py:class:`Device`, str] - The device to annotate with. Only the device's type is significant. + The device to annotate with. 
is_fixed : bool If false (the default), a device_copy @@ -52,7 +53,7 @@ def on_device(data, device, is_fixed=False): result : tvm.relay.Expr The annotated expression. """ - return _make.on_device(data, _device_to_int(device), is_fixed) + return _make.OnDevice(data, _make_se_scope(device), is_fixed) def function_on_device(function, param_devices, result_device): @@ -65,18 +66,18 @@ def function_on_device(function, param_devices, result_device): The function to be annotated. param_devices : Array[Union[:py:class:`Device`, str]] - The devices for each parameter. Only the device types are significant. + The devices for each parameter. result_device: Union[:py:class:`Device`, str] - The device for the function result. Only the device type is significant. + The device for the function result. Returns ------- - result : tvm.rleay.Function + result : tvm.relay.Function The annotated function. """ - return _make.function_on_device( - function, [_device_to_int(d) for d in param_devices], _device_to_int(result_device) + return _make.FunctionOnDevice( + function, [_make_se_scope(d) for d in param_devices], _make_se_scope(result_device) ) diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index e615bbf21b864..d9847a4535694 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -16,6 +16,7 @@ # under the License. """Basic tensor operations.""" # pylint: disable=redefined-builtin, unused-argument +from tvm import target from tvm.runtime import ndarray as _nd from tvm.runtime import Device as _Device from tvm.te.hybrid import script @@ -26,6 +27,14 @@ from . import op as reg +def _make_se_scope(device): + if isinstance(device, _Device): + return target.make_se_scope(device) + if isinstance(device, str): + return target.make_se_scope(_nd.device(device)) + raise ValueError("expecting a Device or device name, but received a %s" % (type(device))) + + # We create a wrapper function for each operator in the # python side to call into the positional _make.OpName function. # @@ -1181,7 +1190,7 @@ def copy_shape_func(attrs, inputs, _): return [_copy_shape_func(inputs[0])] -def device_copy(data, src_dev, dst_dev): +def device_copy(data, src_device, dst_device): """Copy data from the source device to the destination device. This operator helps data transferring between difference devices for heterogeneous execution. @@ -1191,10 +1200,10 @@ def device_copy(data, src_dev, dst_dev): data : tvm.relay.Expr The tensor to be copied. - src_dev : Union[:py:class:`Device`, str] + src_device : Union[:py:class:`Device`, str] The source device where the data is copied from. - dst_dev : Union[:py:class:`Device`, str] + dst_device : Union[:py:class:`Device`, str] The destination device where the data is copied to. Returns @@ -1202,26 +1211,7 @@ def device_copy(data, src_dev, dst_dev): result : tvm.relay.Expr The copied result. 
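+
+    Examples
+    --------
+    A minimal sketch; the device names here assume both CPU and CUDA runtimes are enabled:
+
+    .. code-block:: python
+
+        copied = relay.device_copy(data, "cpu", "cuda")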
""" - if isinstance(src_dev, _Device): - src_dev = src_dev.device_type - elif isinstance(src_dev, str): - src_dev = _nd.device(src_dev).device_type - else: - raise ValueError( - "src_dev is expected to be the type of Device or " - "str, but received %s" % (type(src_dev)) - ) - - if isinstance(dst_dev, _Device): - dst_dev = dst_dev.device_type - elif isinstance(dst_dev, str): - dst_dev = _nd.device(dst_dev).device_type - else: - raise ValueError( - "dst_dev is expected to be the type of Device or " - "str, but received %s" % (type(dst_dev)) - ) - return _make.device_copy(data, src_dev, dst_dev) + return _make.DeviceCopy(data, _make_se_scope(src_device), _make_se_scope(dst_device)) def shape_of(data, dtype="int32"): diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index 0dc07944836dc..01473a82fb3ac 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -891,6 +891,7 @@ def __init__(self, *args, **kwargs): # initialize handle in cass pass_cls creation failed.fg self.handle = None inst = pass_cls(*args, **kwargs) + # it is important not to capture self to # avoid a cyclic dependency def _pass_func(func, mod, ctx): @@ -1146,14 +1147,26 @@ def SimplifyExpr(): return _ffi_api.SimplifyExpr() -def PlanDevices(default_device): +def PlanDevices(config): """ - Uses existing "on_device" and "device_copy" CallNodes to infer the device on which - every Relay sub-expression should run (and the result stored). Captures the result of that - analysis using new "on_device" and "device_copy" CallNodes. Note that the device_id of - the default_device is ignored. + Uses existing "on_device" and "device_copy" CallNodes to infer the SEScope on which + every Relay sub-expression should run and the result stored. Captures the result of that + analysis using new "on_device" and "device_copy" CallNodes. Sub-expressions which are + not otherwise constrained are assigned to the default_primitive_se_scope. However data and + computations which must be hosted on a CPU (such as shapes and shape functions) use the + cpu_se_scope. + + Parameters + ---------- + config : tvm.CompilationConfig + The compilation configuration, specifying available targets and default devices. + + Returns + ------- + ret : tvm.transforms.Pass + The pass. """ - return _ffi_api.PlanDevices(default_device) + return _ffi_api.PlanDevices(config) def FoldExplicitPadding(): diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index c1cbc966acdc6..365e38c6e06c0 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -72,6 +72,7 @@ def __init__(self, mod): self._get_lib = self.mod["get_lib"] self._get_bytecode = self.mod["get_bytecode"] self._get_constants = self.mod["get_constants"] + self._get_virtual_devices = self.mod["get_virtual_devices"] self._get_stats = self.mod["get_stats"] self._get_function_arity = self.mod["get_function_arity"] self._get_function_param_name = self.mod["get_function_param_name"] @@ -251,6 +252,11 @@ def constants(self): Useful for debugging and diffing generated executables in unit tests.""" return self._get_constants() + @property + def virtual_devices(self): + """Returns a human-readable description of all the (virtual) devices in the executable.""" + return self._get_virtual_devices() + @property def globals(self): """Get the globals used by the Relay VM executable. @@ -295,7 +301,8 @@ class VirtualMachine(object): The VM executable. 
device : tvm.runtime.Device or List[tvm.runtime.Device] - The device to deploy the module + The device(s) on which the model will run. + Currently at most one device per device type is supported. memory_cfg : str or Dict[tvm.runtime.Device, str], optional Config the type of memory allocator. The allocator type can be ["naive", @@ -363,10 +370,7 @@ def _setup_device(self, dev, memory_cfg): devs = dev if not isinstance(dev, (list, tuple)): if not isinstance(dev, tvm.runtime.Device): - raise TypeError( - "dev is expected to be Device or \ - List[Device]" - ) + raise TypeError("dev is expected to be Device or List[Device]") devs = [dev] # CPU is required for executing shape functions diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc index 7e5702296542b..a266f185e9962 100644 --- a/src/relay/backend/aot_executor_codegen.cc +++ b/src/relay/backend/aot_executor_codegen.cc @@ -109,18 +109,18 @@ class AOTOnDemandAllocator : public transform::DeviceAwareExprVisitor { void VisitExpr_(const TupleNode* op) final { std::vector<int64_t> storage_ids; - std::vector<DLDeviceType> device_types; + std::vector<SEScope> se_scopes; std::vector<int64_t> storage_sizes_in_bytes; Expr expr = GetRef<Expr>(op); for (Expr field : op->fields) { auto sid = GetStorage(field); storage_ids.insert(storage_ids.end(), sid->storage_ids.begin(), sid->storage_ids.end()); - device_types.insert(device_types.end(), sid->device_types.begin(), sid->device_types.end()); + se_scopes.insert(se_scopes.end(), sid->se_scopes.begin(), sid->se_scopes.end()); storage_sizes_in_bytes.insert(storage_sizes_in_bytes.end(), sid->storage_sizes_in_bytes.begin(), sid->storage_sizes_in_bytes.end()); } - storage_device_map_[expr] = StorageInfo(storage_ids, device_types, storage_sizes_in_bytes); + storage_device_map_[expr] = StorageInfo(storage_ids, se_scopes, storage_sizes_in_bytes); AssignReturnSid(expr); } @@ -129,7 +129,7 @@ auto sids = GetStorage(op->tuple); ICHECK_LT(static_cast<size_t>(op->index), sids->storage_ids.size()); storage_device_map_[expr] = - StorageInfo({sids->storage_ids[op->index]}, {sids->device_types[op->index]}, + StorageInfo({sids->storage_ids[op->index]}, {sids->se_scopes[op->index]}, {sids->storage_sizes_in_bytes[op->index]}); AssignReturnSid(expr); } @@ -163,7 +163,7 @@ * \param prototype The prototype token. * \return The required memory size. * - * TODO(mbs): Cf CalculateRelayExprSizeBytes in utils.cc + * TODO(mbs): Cf CalculateRelayExprSizeBytes in utils.cc, GetMemorySize in graph_plan_memory.cc */ size_t GetMemorySizeBytes(const TensorType& ttype) { size_t size = 1; @@ -195,24 +195,25 @@ */ void CreateStorage(const ExprNode* op) { Expr expr = GetRef<Expr>(op); - return CreateStorage(expr, GetInScopeDeviceType(expr)); + return CreateStorage(expr, GetSEScope(expr)); } /*! - * \brief Create storage to hold the result of evaluating \p expr on \p device_type. + * \brief Create storage to hold the result of evaluating \p expr in \p se_scope.
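+ * One storage id is assigned for each tensor in the flattened tuple type of \p expr.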
*/ - void CreateStorage(const Expr& expr, DLDeviceType device_type) { - ICHECK(device_type != kInvalidDeviceType) << "invalid device type for expr:" << std::endl + void CreateStorage(const Expr& expr, SEScope se_scope) { + ICHECK(!se_scope->IsFullyUnconstrained()) << "invalid SEScope for expr:" << std::endl << PrettyPrint(expr); std::vector storage_ids; - std::vector device_types; + std::vector se_scopes; std::vector storage_sizes_in_bytes; for (const auto& ttype : FlattenTupleType(expr->checked_type())) { storage_ids.push_back(next_available_sid_++); - device_types.push_back(device_type); + se_scopes.push_back(se_scope); storage_sizes_in_bytes.push_back(GetMemorySizeBytes(ttype)); } - storage_device_map_[expr] = StorageInfo(storage_ids, device_types, storage_sizes_in_bytes); + storage_device_map_[expr] = StorageInfo(std::move(storage_ids), std::move(se_scopes), + std::move(storage_sizes_in_bytes)); } /*! \brief mapping of expression -> storageInfo */ @@ -589,7 +590,7 @@ class AOTExecutorCodegen : public MixedModeVisitor { mod = WithAttr(mod, "main_func_info", func_info); } - IRModule lowered_mod = tec::LowerTEPass(targets_, mod_name, [this](Function func) { + IRModule lowered_mod = tec::LowerTEPass(mod_name, [this](Function func) { // We need to maintain the constant map for external // functions so we pass this processing function which // allows us to process each function as we lower it. diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index 78978e192f0e4..cd9c7d68366d1 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -161,6 +162,8 @@ std::unique_ptr MakeExecutorCodegen(String executor_str) { */ class RelayBuildModule : public runtime::ModuleNode { public: + RelayBuildModule() = default; + /*! * \brief Get member function to front-end * \param name The name of the function. 
@@ -207,7 +210,7 @@ class RelayBuildModule : public runtime::ModuleNode { } else if (name == "optimize") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { ICHECK_EQ(args.num_args, 2); - *rv = this->Optimize(args[0], args[1], this->params_); + *rv = this->Optimize(args[0], args[1]); }); } else { LOG(FATAL) << "Unknown packed function: " << name; @@ -274,26 +277,16 @@ class RelayBuildModule : public runtime::ModuleNode { * \brief Build relay IRModule for graph executor * * \param mod Relay IRModule - * \param target Target device + * \param targets Target devices * \param target_host Host target device */ void Build(IRModule mod, const TargetMap& targets, const tvm::Target& target_host, const String executor, const String mod_name) { - for (const auto& pair : targets) { - VLOG(0) << "Build target " << pair.first << " = " << pair.second->str(); - } - if (target_host.defined()) { - VLOG(0) << "Build target_host = " << target_host->str(); - } - VLOG(0) << "Build executor = '" << executor << "'"; - VLOG(0) << "Build mod_name = '" << mod_name << "'"; - - // Create protected variable targets_ from ground up - targets_ = targets; - target_host_ = target_host; + VLOG_CONTEXT << "Build"; executor_ = executor; - CheckAndUpdateHostConsistency(&targets_, &target_host_); - BuildRelay(mod, params_, mod_name); + config_ = CompilationConfig(PassContext::Current(), targets, target_host); + + BuildRelay(std::move(mod), mod_name); } protected: @@ -302,95 +295,58 @@ class RelayBuildModule : public runtime::ModuleNode { * * \param relay_module The input IRModule where optmization will be applied on. * \param targets The device type to `Target` mapping. - * \param params The param name to value mapping. * * \return relay::IRModule The updated Relay IR module after optimization. */ - IRModule Optimize(IRModule relay_module, const TargetMap& targets, - const std::unordered_map& params) { - targets_ = targets; - // No target_host setup it seems. - return OptimizeImpl(relay_module, params); + IRModule Optimize(IRModule relay_module, const TargetMap& targets) { + VLOG_CONTEXT << "Optimize"; + // TODO(mbs): executor_ will be whatever was left over from last Build. Note that + // the empty executor string will CHECK fail, so how are folks using this API? 
+ config_ = CompilationConfig(transform::PassContext::Current(), targets, + /*optional_host_target=*/Target()); + return OptimizeImpl(std::move(relay_module)); } - IRModule OptimizeImpl(IRModule relay_module, - const std::unordered_map& params) { + IRModule OptimizeImpl(IRModule relay_module) { ICHECK(relay_module.defined()) << "The IRModule must be defined for the Relay compiler."; - if (params.size()) { + if (!params_.empty()) { ICHECK(relay_module->ContainGlobalVar("main")) << "Missing the main entry function"; GlobalVar main_glb_var = relay_module->GetGlobalVar("main"); Function main_func = Downcast(relay_module->Lookup(main_glb_var)); - auto new_main = BindParamsByName(main_func, params); + auto new_main = BindParamsByName(main_func, params_); IRModuleNode* relay_module_ptr = relay_module.CopyOnWrite(); relay_module_ptr->Update(main_glb_var, new_main); } - Array pass_seqs = GetPassPrefix(targets_, false); + Array pass_seqs = GetPassPrefix( + /*is_homogenous=*/config_->optional_homogeneous_target.defined(), /*is_vm=*/false); transform::PassContext pass_ctx = PassContext::Current(); - // TODO(mbs): Centralize this logic and reconcile with similar in relay/backend/vm/compiler.cc - DLDeviceType default_device_type; - if (targets_.size() == 1) { - // Homogenous execution. - default_device_type = static_cast((*targets_.begin()).first->value); - const auto& target = (*targets_.begin()).second; - - // This pass currently only supports the homogeneous case. - pass_seqs.push_back( - transform::SplitArgs(target->GetAttr("max_function_args", -1).value())); - } else { - // Heterogeneous execution. - Optional opt_fallback_dev = - pass_ctx->GetConfig("relay.fallback_device_type"); - if (opt_fallback_dev) { - default_device_type = static_cast(opt_fallback_dev.value()->value); - Integer integer(static_cast(default_device_type)); - CHECK_GT(default_device_type, 0U) - << "The 'relay.fallback_device_type' is set to an invalid device type."; - if (targets_.count(integer) == 0) { - LOG(WARNING) - << "The 'relay.fallback_device_type' has been set to " << default_device_type - << " however no target has been given for that device type in the targets map. " - "Creating an appropriate default target."; - targets_.Set(integer, CreateDefaultTarget(default_device_type)); - } - } else { - default_device_type = kDLCPU; - Integer integer(static_cast(default_device_type)); - if (targets_.count(integer) == 0) { - LOG(WARNING) << "Using the default device type of kDLCPU, however no target has been " - "given for that device type in the targets map. Creating an appropriate " - "default target."; - targets_.Set(integer, CreateDefaultTarget(default_device_type)); - } - } - } - // Always plan devices so the remaining passes don't need to distinguish homogeneous vs // hetrogenous execution. - pass_seqs.push_back(transform::PlanDevices(default_device_type)); + pass_seqs.push_back(transform::PlanDevices(config_)); // Fuse the operations if it is needed. pass_seqs.push_back(transform::FuseOps()); // Create a sequential pass and perform optimizations. transform::Pass seq = transform::Sequential(pass_seqs); - if (targets_.size() == 1) { - With tctx((*targets_.begin()).second); + if (config_->optional_homogeneous_target.defined()) { + With tctx(config_->optional_homogeneous_target); relay_module = seq(relay_module); } else { relay_module = seq(relay_module); } // Do layout rewrite for auto-scheduler. 
- if (backend::IsAutoSchedulerEnabled() && targets_.size() == 1) { - const auto& target = (*targets_.begin()).second; + if (backend::IsAutoSchedulerEnabled() && config_->optional_homogeneous_target.defined()) { Pass major_pass = transform::AutoSchedulerLayoutRewrite(); bool enable_layout_rewrite_targets = - target->kind->device_type == kDLCPU || target->GetAttr("device", "") == "mali"; + config_->optional_homogeneous_target->kind->device_type == kDLCPU || + config_->optional_homogeneous_target->GetAttr("device", "") == "mali"; if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) { - With tctx(target); + With tctx(config_->optional_homogeneous_target); relay_module = major_pass(relay_module); // Defuse ops to fold constants, then fuse them again relay_module = transform::DefuseOps()(relay_module); @@ -416,45 +372,22 @@ class RelayBuildModule : public runtime::ModuleNode { return relay_module; } - /*! - * \brief Returns a default target to represent \p device_type. - */ - static Target CreateDefaultTarget(DLDeviceType device_type) { - std::string name = runtime::DeviceName(device_type); - if (name == "cpu") { - return Target("llvm"); - } else { - return Target(name); - } - } - /*! * \brief Compile a Relay IR module to runtime module. * * \param relay_module The Relay IR module. * \param params The parameters. */ - void BuildRelay(IRModule relay_module, - const std::unordered_map& params, - const String mod_name) { - Target target_host = GetTargetHost(); - // If no target_host has been set, we choose a default one, which is - // llvm if "codegen.LLVMModuleCreate" is accessible. - const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); - if (!target_host.defined()) target_host = (pf != nullptr) ? Target("llvm") : Target("stackvm"); - - // Update all the targets in the targets_ TargetMap - CheckAndUpdateHostConsistency(&targets_, &target_host); - + void BuildRelay(IRModule relay_module, const String& mod_name) { // Relay IRModule -> IRModule optimizations. - relay_module = OptimizeImpl(relay_module, params); + relay_module = OptimizeImpl(std::move(relay_module)); // Get the updated function. auto func = Downcast(relay_module->Lookup("main")); // Generate code for the updated function. executor_codegen_ = MakeExecutorCodegen(executor_); - executor_codegen_->Init(nullptr, targets_); + executor_codegen_->Init(nullptr, config_->legacy_target_map); executor_codegen_->Codegen(func, mod_name); executor_codegen_->UpdateOutput(&ret_); ret_.params = executor_codegen_->GetParams(); @@ -467,9 +400,12 @@ class RelayBuildModule : public runtime::ModuleNode { lowered_funcs.Set(ext_dev, IRModule()); } + const runtime::PackedFunc* pf = runtime::Registry::Get("codegen.LLVMModuleCreate"); + // Generate a placeholder function that attaches linked params as its arguments. 
- if (target_host->GetAttr("link-params").value_or(Bool(false))) { - CHECK(pf != nullptr) << "Unable to link-params with no target_host and no llvm codegen."; + const Target& host_target = config_->host_se_scope->target; + if (host_target->GetAttr("link-params").value_or(Bool(false))) { + CHECK(pf != nullptr) << "Unable to link-params without llvm codegen."; auto param_ids = executor_codegen_->GetParamIds(); auto link_params = Map(); for (auto param : ret_.params) { @@ -482,18 +418,19 @@ class RelayBuildModule : public runtime::ModuleNode { DictAttrs attrs{dict}; auto prim = tir::PrimFunc(Array(), tir::SeqStmt(Array()), VoidType(), Map(), attrs); - if (lowered_funcs.find(target_host) == lowered_funcs.end()) { - lowered_funcs.Set(target_host, IRModule(Map({}))); + if (lowered_funcs.find(host_target) == lowered_funcs.end()) { + lowered_funcs.Set(host_target, IRModule(Map({}))); } - lowered_funcs[target_host]->Add(GlobalVar(::tvm::runtime::symbol::tvm_lookup_linked_param), + lowered_funcs[host_target]->Add(GlobalVar(::tvm::runtime::symbol::tvm_lookup_linked_param), prim); } // When there is no lowered_funcs due to reasons such as optimization. if (lowered_funcs.size() == 0) { - if (target_host.defined() && target_host->kind->name == "llvm") { + if (host_target->kind->name == "llvm") { + CHECK(pf != nullptr) << "Unable to create empty module for llvm without llvm codegen."; // If we can decide the target is LLVM, we then create an empty LLVM module. - ret_.mod = (*pf)(target_host->str(), "empty_module"); + ret_.mod = (*pf)(host_target->str(), "empty_module"); } else { // If we cannot decide the target is LLVM, we create an empty CSourceModule. // The code content is initialized with ";" to prevent complaining @@ -501,11 +438,11 @@ class RelayBuildModule : public runtime::ModuleNode { ret_.mod = tvm::codegen::CSourceModuleCreate(";", "", Array{}); } } else { - ret_.mod = tvm::build(lowered_funcs, target_host_); + ret_.mod = tvm::build(lowered_funcs, host_target); } auto ext_mods = executor_codegen_->GetExternalModules(); - ret_.mod = tvm::codegen::CreateMetadataModule(ret_.params, ret_.mod, ext_mods, GetTargetHost(), + ret_.mod = tvm::codegen::CreateMetadataModule(ret_.params, ret_.mod, ext_mods, host_target, executor_codegen_->GetMetadata()); // Remove external params which were stored in metadata module. for (tvm::runtime::Module mod : ext_mods) { @@ -522,26 +459,8 @@ class RelayBuildModule : public runtime::ModuleNode { } } - private: - Target GetTargetHost() { - Target target_host = target_host_; - if (!target_host_.defined()) { - for (const auto& it : targets_) { - if (it.second->kind->device_type == kDLCPU) { - target_host = it.second; - break; - } - } - } - return target_host; - } - protected: std::unique_ptr executor_codegen_; - /*! \brief target device */ - TargetMap targets_; - /*! \brief target host device */ - tvm::Target target_host_; /*! \brief parameters */ std::unordered_map params_; /*! \brief building output */ @@ -552,6 +471,8 @@ class RelayBuildModule : public runtime::ModuleNode { * - aot: use the aot executor */ String executor_; + /*! \brief Collects all the targets and scopes we need during compilation. 
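+ * Established from the current \p PassContext and the user-supplied targets on each call to
+ * \p Build or \p Optimize.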
*/ + CompilationConfig config_; }; runtime::Module RelayBuildCreate() { diff --git a/src/relay/backend/graph_executor_codegen.cc b/src/relay/backend/graph_executor_codegen.cc index d32ded3796886..1fe8a5c953558 100644 --- a/src/relay/backend/graph_executor_codegen.cc +++ b/src/relay/backend/graph_executor_codegen.cc @@ -203,7 +203,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatorattrs_["storage_id"] = std::move(storage_ids); // type std::vector device_types; - for (auto v : storage_info->device_types) { - device_types.push_back(static_cast(v)); + for (const auto& se_scope : storage_info->se_scopes) { + // TODO(mbs): Keeping only the device type. + ICHECK_GT(se_scope->device_type(), 0); + device_types.push_back(se_scope->device_type()); } size_t num_unknown_devices = std::count(device_types.begin(), device_types.end(), 0); if (num_unknown_devices != 0 && num_unknown_devices != device_types.size()) { @@ -446,7 +448,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslator VisitExpr_(const CallNode* call_node) override { relay::Call call = GetRef(call_node); - auto props = GetOnDeviceProps(call_node); + OnDeviceProps props = GetOnDeviceProps(call_node); if (props.body.defined()) { // See through "on_device" calls. return VisitExpr(props.body); @@ -472,6 +474,7 @@ class GraphExecutorCodegen : public backend::MemoizedExprTranslatortuple); return {vtuple[op->index]}; } + std::vector VisitExpr_(const OpNode* op) override { LOG(FATAL) << "All OpNodes should have been expanded"; return {}; diff --git a/src/relay/backend/graph_plan_memory.cc b/src/relay/backend/graph_plan_memory.cc index 961252a14fa76..c92caba0862fe 100644 --- a/src/relay/backend/graph_plan_memory.cc +++ b/src/relay/backend/graph_plan_memory.cc @@ -51,21 +51,19 @@ struct StorageToken { size_t max_bytes{0}; /*! \brief The corresponding tensor type. */ TensorType ttype{nullptr}; - /*! \brief Device on which memory will reside. */ - Device device{kInvalidDeviceType, -1}; + /*! \brief SEScope on which the memory will reside. */ + SEScope se_scope = SEScope::FullyUnconstrained(); /*! \brief The storage id */ int64_t storage_id{-1}; - bool is_valid() const { return device.device_type != kInvalidDeviceType; } + bool is_valid() const { return !se_scope->IsFullyUnconstrained(); } - bool is_compatible(const StorageToken& that) const { - return device.device_type == that.device.device_type; - } + bool is_compatible(const StorageToken& that) const { return se_scope == that.se_scope; } std::string ToString() const { std::ostringstream os; - os << "{id: " << storage_id << ", bytes: " << max_bytes << ", type: " << PrettyPrint(ttype) - << ", device: " << device.device_type << "}"; + os << "{storage_id: " << storage_id << ", max_bytes: " << max_bytes + << ", ttype: " << PrettyPrint(ttype) << ", se_scope: " << se_scope << "}"; return os.str(); } }; @@ -160,14 +158,14 @@ class StorageAllocaBaseVisitor : public transform::DeviceAwareExprVisitor { * the result of evaluating \p op. */ void CreateToken(const ExprNode* op, bool can_realloc) { - return CreateTokenOnDevice(op, GetInScopeDeviceType(GetRef(op)), can_realloc); + return CreateTokenOnDevice(op, GetSEScope(GetRef(op)), can_realloc); } /*! * \brief Allocates (or reuses if \p can_realloc is true) a storage token for holding * the result of evaluating \p op on \p device_type. 
*/ - virtual void CreateTokenOnDevice(const ExprNode* op, DLDeviceType device_type, + virtual void CreateTokenOnDevice(const ExprNode* op, const SEScope& se_scope, bool can_realloc) = 0; }; @@ -186,16 +184,13 @@ class StorageAllocaInit : protected StorageAllocaBaseVisitor { protected: using StorageAllocaBaseVisitor::VisitExpr_; - void CreateTokenOnDevice(const ExprNode* op, DLDeviceType device_type, - bool can_realloc) override { + void CreateTokenOnDevice(const ExprNode* op, const SEScope& se_scope, bool can_realloc) override { ICHECK(!token_map_.count(op)); std::vector tokens; for (const auto& ttype : FlattenTupleType(op->checked_type())) { - StorageToken* token = arena_->make(); + auto* token = arena_->make(); token->ttype = ttype; - // TODO(mbs): Should be TargetDevice. - token->device.device_type = device_type; - token->device.device_id = 0; + token->se_scope = se_scope; tokens.push_back(token); } token_map_[op] = tokens; @@ -251,8 +246,11 @@ class StorageAllocator : public StorageAllocaBaseVisitor { for (const auto& kv : token_map_) { std::vector storage_ids; - std::vector device_types; + storage_ids.reserve(kv.second.size()); + std::vector se_scopes; + se_scopes.reserve(kv.second.size()); std::vector sid_sizes_byte; + sid_sizes_byte.reserve(kv.second.size()); for (StorageToken* tok : kv.second) { VLOG(1) << "token: " << tok->ToString(); @@ -261,10 +259,11 @@ class StorageAllocator : public StorageAllocaBaseVisitor { } num_nodes++; storage_ids.push_back(tok->storage_id); - device_types.push_back(static_cast(tok->device.device_type)); + se_scopes.push_back(tok->se_scope); sid_sizes_byte.push_back(GetMemorySize(tok)); } - auto storage_info = backend::StorageInfo(storage_ids, device_types, sid_sizes_byte); + auto storage_info = backend::StorageInfo(std::move(storage_ids), std::move(se_scopes), + std::move(sid_sizes_byte)); smap.Set(GetRef(kv.first), storage_info); } // Either all or none of the nodes should be annotated. @@ -279,20 +278,20 @@ class StorageAllocator : public StorageAllocaBaseVisitor { protected: // override create token by getting token as prototype requirements. - void CreateTokenOnDevice(const ExprNode* op, DLDeviceType device_type, bool can_realloc) final { + void CreateTokenOnDevice(const ExprNode* op, const SEScope& se_scope, bool can_realloc) final { ICHECK(!token_map_.count(op)); auto it = prototype_.find(op); ICHECK(it != prototype_.end()); std::vector tokens; for (StorageToken* tok : it->second) { - ICHECK_EQ(tok->device.device_type, device_type); + ICHECK(tok->se_scope == se_scope); if (can_realloc) { tokens.push_back(Request(tok)); } else { // Allocate a new token, StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok)); - allocated_tok->device = tok->device; + allocated_tok->se_scope = tok->se_scope; // ensure it never get de-allocated. allocated_tok->ref_counter += 1; tokens.push_back(allocated_tok); @@ -363,7 +362,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor { * \param size The original size. * \param word_size The element size. */ - static size_t DivRoundUp(size_t size, size_t word_size) { + static int64_t DivRoundUp(int64_t size, int64_t word_size) { return (size + word_size - 1) / word_size; } /*! @@ -390,16 +389,19 @@ class StorageAllocator : public StorageAllocaBaseVisitor { * \brief Get the memory requirement. * \param prototype The prototype token. * \return The required memory size. 
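+ *
+ * For example (illustrative only): a static tensor of shape (2, 3) with dtype float32 needs
+ * 2 * 3 * DivRoundUp(32 * 1, 8) = 24 bytes.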
+ * + * TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc, + * CalculateRelayExprSizeBytes in utils.cc */ - size_t GetMemorySize(StorageToken* prototype) { + static int64_t GetMemorySize(StorageToken* prototype) { TensorType ttype = prototype->ttype; ICHECK(ttype.defined()); - size_t size = 1; + int64_t size = 1; for (IndexExpr dim : ttype->shape) { const int64_t* pval = tir::as_const_int(dim); ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape " << ttype->shape; ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative shape" << *pval; - size *= static_cast(pval[0]); + size *= pval[0]; } size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8); return size; diff --git a/src/relay/backend/interpreter.cc b/src/relay/backend/interpreter.cc index 13b855624461a..ecca1fac03d97 100644 --- a/src/relay/backend/interpreter.cc +++ b/src/relay/backend/interpreter.cc @@ -908,16 +908,12 @@ class Interpreter : public ExprFunctor, * functions needed by the rewritten module. */ IRModule Prepare(IRModule mod, CompilationConfig config) { - tec::TargetMap tec_target_map; - for (const auto& pair : config->legacy_target_map) { - tec_target_map.emplace(static_cast(pair.first->value), pair.second); - } // Run minimal transforms on module to establish invariants needed by interpreter. transform::Sequential seq( {transform::SimplifyInference(), // Figure out which devices should be used to execute. // TODO(mbs): Should ignore all existing annotations when constant folding - transform::PlanDevices(config->default_primitive_se_scope->device_type()), + transform::PlanDevices(std::move(config)), // FuseOps will mark wrapped calls to prim-ops with the 'Primitive' // attribute. transform::FuseOps(/*fuse_opt_level=*/0), @@ -927,8 +923,7 @@ IRModule Prepare(IRModule mod, CompilationConfig config) { transform::EtaExpand( /*expand_constructor=*/true, /*expand_global_var=*/false), transform::InferType(), - tec::LowerTEPass(tec_target_map, /*module_name=*/"intrp", - [](Function func) { /* no-op */ })}); + tec::LowerTEPass(/*module_name=*/"intrp", [](Function func) { /* no-op */ })}); transform::PassContext pass_ctx = transform::PassContext::Current(); With ctx(pass_ctx); diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index b284fc8bc0ca6..4d1515de2010c 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -43,6 +43,7 @@ #include #include "../op/annotation/annotation.h" +#include "../op/memory/device_copy.h" #include "../transforms/device_aware_visitors.h" #include "./te_compiler_cache.h" #include "./utils.h" @@ -359,21 +360,6 @@ TVM_REGISTER_GLOBAL("relay.backend._TECompilerListItems").set_body_typed([](TECo using AnalysisRemapping = std::unordered_map; -std::tuple IsDeviceCopy(const Function& func) { - if (auto call_node = func->body.as()) { - if (auto op_node = call_node->op.as()) { - if (op_node->name == "device_copy") { - auto attrs = call_node->attrs.as(); - auto dst = attrs->dst_dev_type; - auto src = attrs->src_dev_type; - return std::tuple(true, src, dst); - } - } - } - - return std::tuple(false, -1, -1); -} - /*! * \brief Rewrites call expressions to Relay functions marked as 'primitive' * to calls to the corresponding TIR primitive for the appropriate target.
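Note: a worked example of GetMemorySize above: a [2, 3, 4] float32 tensor has 2 * 3 * 4 = 24 elements and DivRoundUp(32 * 1, 8) = 4 bytes per element, hence 96 bytes, while sub-byte dtypes still round up to one byte per element. A standalone rendering of the arithmetic, with TVM's types elided:

#include <cassert>
#include <cstdint>
#include <vector>

// Same rounding helper as in graph_plan_memory.cc.
int64_t DivRoundUp(int64_t size, int64_t word_size) {
  return (size + word_size - 1) / word_size;
}

// Product of the static dims times the rounded-up element size in bytes.
int64_t MemoryBytes(const std::vector<int64_t>& shape, int bits, int lanes) {
  int64_t size = 1;
  for (int64_t dim : shape) size *= dim;
  return size * DivRoundUp(static_cast<int64_t>(bits) * lanes, 8);
}

int main() {
  assert(MemoryBytes({2, 3, 4}, /*bits=*/32, /*lanes=*/1) == 96);
  assert(MemoryBytes({5}, /*bits=*/1, /*lanes=*/1) == 5);  // 1-bit bools: 1 byte each
  return 0;
}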
@@ -415,11 +401,10 @@ std::tuple IsDeviceCopy(const Function& func) { */ class LowerTensorExprMutator : public DeviceAwareExprMutator { public: - LowerTensorExprMutator(const IRModule& module, const TargetMap& targets, ProcessFn process_fn, - const String& module_name, TECompiler compiler) + LowerTensorExprMutator(const IRModule& module, ProcessFn process_fn, const String& module_name, + TECompiler compiler) : DeviceAwareExprMutator(module), module_(module), - targets_(targets), process_fn_(process_fn), module_name_(module_name), compiler_(compiler), @@ -484,7 +469,8 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { } // Non-External Relay Function - VLOG(1) << "lowering to target '" << target->str() << "' for primitive:\n" << PrettyPrint(func); + VLOG(1) << "lowering to target " << target->ToDebugString() << " for primitive:" << std::endl + << PrettyPrint(func); CCacheKey key = CCacheKey(func, target); CachedFunc lowered_func = compiler_->Lower(key, module_name_); VLOG(1) << "lowered primitive bound to '" << PrettyPrint(lowered_func->prim_fn_var) << "'"; @@ -514,14 +500,12 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { tir_call_attrs->metadata.Set(attr::kReshapeOnly, tvm::Integer(1)); } - auto device_copy = IsDeviceCopy(func); - if (std::get<0>(device_copy)) { - // Record that device copy source and destination devices so the device planner can - // still follow along. - auto source_device = std::get<1>(device_copy); - auto dst_device = std::get<2>(device_copy); - tir_call_attrs->metadata.Set("source_device", tvm::Integer(source_device)); - tir_call_attrs->metadata.Set("dst_device", tvm::Integer(dst_device)); + DeviceCopyProps props = GetDeviceCopyProps(func); + if (props.body.defined()) { + // Record the device copy source and destination SEScopes so the device planner can + // still follow along even after lowering. + tir_call_attrs->metadata.Set("src_se_scope", props.src_se_scope); + tir_call_attrs->metadata.Set("dst_se_scope", props.dst_se_scope); } tir_call_attrs->metadata.Set("relay_attrs", func->attrs); @@ -534,8 +518,8 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { // on the host cpu irrespective of where the primitive runs. // TODO(mbs): Cleanup target handling. Target shape_target("llvm"); - VLOG(1) << "lowering to target '" << shape_target->str() - << "' for dynamic shape function for primitive"; + VLOG(1) << "lowering to target " << shape_target->ToDebugString() + << " for dynamic shape function for primitive"; CCacheKey shape_key(func, shape_target); CachedFunc lowered_shape_func = compiler_->LowerShapeFunc(shape_key); // Capture the shape function's global var and parameters 'states' in call @@ -617,9 +601,10 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { target = Target("ext_dev"); } else { // The target corresponding to the call_node expression's annotation. - DLDeviceType device_type = GetInScopeDeviceType(call); - // TODO(mbs): Replace device_type with target so this lookup is unnecessary. - target = GetTargetFromInteger(device_type, targets_); + SEScope se_scope = GetSEScope(call); + ICHECK(!se_scope->IsFullyUnconstrained()); + target = se_scope->target; + ICHECK(target.defined()); } // Lower the primitive function for that target. @@ -639,7 +624,6 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { } IRModule module_; - TargetMap targets_; ProcessFn process_fn_; // Map from in-scope let-bound variables to Relay functions known to be // primitive. 
We'll rewrite these to the fresh global vars bound to the lowered @@ -685,11 +669,11 @@ Target GetTargetFromInteger(DLDeviceType dev_type, tec::TargetMap targets) { } } -Pass LowerTensorExpr(TargetMap targets, const String& module_name, TECompiler compiler, +Pass LowerTensorExpr(const String& module_name, TECompiler compiler, std::function process_fn) { runtime::TypedPackedFunc pass_func = [=](Function func, IRModule module, PassContext ctx) { - LowerTensorExprMutator lower_te(module, targets, process_fn, module_name, compiler); + LowerTensorExprMutator lower_te(module, process_fn, module_name, compiler); return Downcast(lower_te.Mutate(func)); }; return CreateFunctionPass(pass_func, 0, "LowerTensorExpr", {}); @@ -706,6 +690,7 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa } // This is a Map> + // TODO(mbs): Collapsing SEScopes to just device type. std::unordered_map, backend::EnumClassHash> sid_workspace; // This is a Map @@ -716,15 +701,15 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa // Initialize the mapping from all storage identifiers to workspace sizes, // the amount of device io, and the device constants. for (const auto& kv : storage_info_map) { - backend::StorageInfo storage_info = kv.second; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; - - CHECK_EQ(storage_ids.size(), devices.size()); - for (uint32_t i = 0; i < devices.size(); i++) { - sid_workspace[devices[i]][storage_ids[i]] = 0; - device_io[devices[i]] = 0; - device_consts[devices[i]] = 0; + const backend::StorageInfo& storage_info = kv.second; + const std::vector& storage_ids = storage_info->storage_ids; + const std::vector& se_scopes = storage_info->se_scopes; + CHECK_EQ(storage_ids.size(), se_scopes.size()); + for (uint32_t i = 0; i < se_scopes.size(); i++) { + DLDeviceType device_type = se_scopes[i]->device_type(); + sid_workspace[device_type][storage_ids[i]] = 0; + device_io[device_type] = 0; + device_consts[device_type] = 0; } } @@ -753,18 +738,20 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa << PrettyPrint(expr->checked_type()) << std::endl << "has size " << size_bytes << " and storage info:" << std::endl << storage_info; - std::vector storage_ids = storage_info->storage_ids; - std::vector devices = storage_info->device_types; + const std::vector& storage_ids = storage_info->storage_ids; + const std::vector& se_scopes = storage_info->se_scopes; if (expr->IsInstance()) { - for (const auto& dev : devices) { - ICHECK_EQ(device_consts.count(dev), 1); - device_consts[dev] += size_bytes; + for (const auto& se_scope : se_scopes) { + DLDeviceType device_type = se_scope->device_type(); + ICHECK_EQ(device_consts.count(device_type), 1); + device_consts[device_type] += size_bytes; } } else if (expr->IsInstance() || expr.same_as(func->body)) { - CHECK_GE(devices.size(), 1) << "must be at least one device"; - for (const auto& dev : devices) { - device_io[dev] += size_bytes; + CHECK_GE(se_scopes.size(), 1) << "must be at least one device"; + for (const auto& se_scope : se_scopes) { + DLDeviceType device_type = se_scope->device_type(); + device_io[device_type] += size_bytes; } } else { // TODO(@electriclilies): This code is never being called which means sid_workspace is not @@ -774,8 +761,9 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa // Here we record the largest size of the tensor // that share the same storage id, 
because storage_id will // be shared between multiple tensors that are not live simultaneously. - if (size_bytes > sid_workspace[devices[i]][storage_ids[i]]) { - sid_workspace[devices[i]][storage_ids[i]] = size_bytes; + DLDeviceType device_type = se_scopes[i]->device_type(); + if (size_bytes > sid_workspace[device_type][storage_ids[i]]) { + sid_workspace[device_type][storage_ids[i]] = size_bytes; } } } @@ -820,8 +808,9 @@ backend::FunctionInfo UpdateMainWorkspaceSize(const IRModule& mod, tec::TargetMa constant_sizes.Set(tgt, dev_and_size.second); } - backend::FunctionInfo func_info(workspace_sizes, io_sizes, constant_sizes, tir_primfuncs, - relay_primfuncs); + backend::FunctionInfo func_info(std::move(workspace_sizes), std::move(io_sizes), + std::move(constant_sizes), std::move(tir_primfuncs), + std::move(relay_primfuncs)); VLOG(1) << "func_info: " << func_info; return std::move(func_info); } @@ -879,6 +868,7 @@ void UpdateFunctionMetadata(Function relay_func, workspace_sizes.Set(prim_fn_target, workspace_size); // Calculating size for I/O + // TODO(mbs): See also the other three utils for calculating tensor bytesize. for (auto const& param : prim_fn->params) { auto p_shape = prim_fn->buffer_map[param]->shape; int num_of_elements = 1; @@ -899,8 +889,9 @@ void UpdateFunctionMetadata(Function relay_func, relay_primfuncs.Set(prim_fn_target, relay_func); } - backend::FunctionInfo fi = backend::FunctionInfo(workspace_sizes, io_sizes, constant_sizes, - tir_primfuncs, relay_primfuncs); + backend::FunctionInfo fi = backend::FunctionInfo( + std::move(workspace_sizes), std::move(io_sizes), std::move(constant_sizes), + std::move(tir_primfuncs), std::move(relay_primfuncs)); VLOG(1) << "FunctionInfo: " << prim_fn_var.value()->name_hint << " = " << PrettyPrint(fi); @@ -909,11 +900,11 @@ void UpdateFunctionMetadata(Function relay_func, function_metadata.Set(prim_fn_var.value()->name_hint, fi); } -IRModule LowerTE(const IRModule& module, TargetMap targets, const String& module_name, +IRModule LowerTE(const IRModule& module, const String& module_name, std::function process_fn) { TECompiler compiler; - auto updated_module = LowerTensorExpr(targets, module_name, compiler, process_fn)(module); + auto updated_module = LowerTensorExpr(module_name, compiler, process_fn)(module); backend::UpdateAutoSchedulerOpWeights(compiler); @@ -958,12 +949,9 @@ Map GetPerTargetModules(IRModule mod) { return per_target_modules; } -Pass LowerTEPass(TargetMap targets, const String& module_name, - std::function process_fn) { - runtime::TypedPackedFunc pass_func = [=](IRModule module, - PassContext ctx) { - return LowerTE(module, targets, module_name, process_fn); - }; +Pass LowerTEPass(const String& module_name, std::function process_fn) { + runtime::TypedPackedFunc pass_func = + [=](IRModule module, PassContext ctx) { return LowerTE(module, module_name, process_fn); }; return tvm::transform::Sequential({tvm::relay::transform::RelayToTIRTargetHook(), tvm::transform::CreateModulePass(pass_func, 0, "LowerTE", {}), diff --git a/src/relay/backend/te_compiler.h b/src/relay/backend/te_compiler.h index d0401e9605f7f..da7333d64d463 100644 --- a/src/relay/backend/te_compiler.h +++ b/src/relay/backend/te_compiler.h @@ -173,7 +173,6 @@ Map GetPerTargetModules(IRModule mod); * to TE expressions, schedules them, and then to TIR. * * \param module The IRModule. - * \param targets The mapping for devices to targets. 
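Note: the sid_workspace update above is a max-then-sum: tensors sharing a storage id are never live at the same time, so each (device type, storage id) slot records only its largest tensor, and the per-device workspace is the sum over slots. A small standalone check of that aggregation:

#include <cassert>
#include <cstdint>
#include <map>

int main() {
  // sid_workspace[device_type][storage_id] -> largest tensor bytes seen so far.
  std::map<int, std::map<int, int64_t>> sid_workspace;
  auto record = [&](int dev, int sid, int64_t bytes) {
    if (bytes > sid_workspace[dev][sid]) sid_workspace[dev][sid] = bytes;
  };
  record(1, 0, 64);
  record(1, 0, 128);  // same slot: only the max survives
  record(1, 1, 32);
  int64_t device_workspace = 0;
  for (const auto& kv : sid_workspace[1]) device_workspace += kv.second;
  assert(device_workspace == 160);  // max(64, 128) + 32
  return 0;
}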
* \param memory_plan The memory plan used during lowering * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process @@ -181,8 +180,8 @@ Map GetPerTargetModules(IRModule mod); * \return The lowered module, see above. */ IRModule LowerTE( - const IRModule& module, TargetMap targets, backend::StaticMemoryPlan memory_plan, - const String& module_name, ProcessFn process_fn = [](Function f) {}); + const IRModule& module, backend::StaticMemoryPlan memory_plan, const String& module_name, + ProcessFn process_fn = [](Function f) {}); /*! \brief Pass to lower an IRModule's primitive functions to TIR. * @@ -190,14 +189,12 @@ IRModule LowerTE( * to TE expressions, schedules them, and then to TIR. It annotates all functions * with their target. * - * \param targets The mapping for devices to targets. * \param module_name The name of this module * \param process_fn Callback allowing one-level up code generators to process * each function that we lower * \returns The pass which lowers primitive functions to TIR */ -transform::Pass LowerTEPass(TargetMap targets, const String& module_name, - std::function process_fn); +transform::Pass LowerTEPass(const String& module_name, std::function process_fn); } // namespace tec } // namespace relay } // namespace tvm diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc index 02caf56c66e65..9a1c428482e2d 100644 --- a/src/relay/backend/utils.cc +++ b/src/relay/backend/utils.cc @@ -34,49 +34,56 @@ namespace backend { TVM_REGISTER_NODE_TYPE(StorageInfoNode); -StorageInfo::StorageInfo(std::vector storage_ids, std::vector device_types, - std::vector storage_sizes_in_bytes) { - auto n = make_object(); - n->storage_ids = std::move(storage_ids); - n->device_types = std::move(device_types); - n->storage_sizes_in_bytes = std::move(storage_sizes_in_bytes); - data_ = std::move(n); -} - TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { const auto* node = ref.as(); - p->stream << "StorageInfoNode(\n" - << " storage_ids=["; + p->stream << "StorageInfoNode(" + << "storage_ids=["; for (auto id : node->storage_ids) { - p->stream << id << ", "; + p->stream << id << ","; } - p->stream << "],\n device_types=["; - for (auto device_type : node->device_types) { - p->stream << device_type << ", "; + p->stream << "], se_scopes=["; + for (const auto& se_scope : node->se_scopes) { + p->stream << se_scope << ","; } - p->stream << "],\n storage_size_in_bytes=["; + p->stream << "], storage_size_in_bytes=["; for (auto bytes : node->storage_sizes_in_bytes) { - p->stream << bytes << ", "; + p->stream << bytes << ","; } p->stream << "])"; }); +StorageInfo::StorageInfo(std::vector storage_ids, std::vector se_scopes, + std::vector storage_sizes_in_bytes) { + ICHECK_EQ(storage_ids.size(), se_scopes.size()); + ICHECK_EQ(storage_ids.size(), storage_sizes_in_bytes.size()); + auto node = make_object(); + node->storage_ids = std::move(storage_ids); + node->se_scopes = std::move(se_scopes); + node->storage_sizes_in_bytes = std::move(storage_sizes_in_bytes); + data_ = std::move(node); +} + +// This is the legacy interface for devices as DLDeviceTypes (represented by integers) TVM_REGISTER_GLOBAL("relay.ir.StorageInfo") - .set_body_typed([](const Array& sids, const Array& dev_types, + .set_body_typed([](const Array& sids, const Array& device_types, const Array& sizes_in_bytes) { - std::vector sids_v, sizes_v; - std::vector dev_types_v; + std::vector sids_v; + 
sids_v.reserve(sids.size()); for (auto s : sids) { sids_v.push_back(s); } - for (auto d : dev_types) { - dev_types_v.push_back(static_cast(static_cast(d))); + std::vector se_scopes_v; + se_scopes_v.reserve(device_types.size()); + for (const auto& device_type : device_types) { + se_scopes_v.emplace_back(SEScope::ForDeviceType(device_type)); } + std::vector size_in_bytes_v; + size_in_bytes_v.reserve(sizes_in_bytes.size()); for (auto s : sizes_in_bytes) { - sizes_v.push_back(s); + size_in_bytes_v.push_back(s); } - return StorageInfo(sids_v, dev_types_v, sizes_v); + return StorageInfo(std::move(sids_v), std::move(se_scopes_v), std::move(size_in_bytes_v)); }); TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageIds").set_body_typed([](StorageInfo si) { @@ -87,10 +94,11 @@ TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageIds").set_body_typed([](StorageI return ids; }); +// This is the legacy interface for devices as DLDeviceTypes (represented by integers) TVM_REGISTER_GLOBAL("relay.ir.StorageInfoDeviceTypes").set_body_typed([](StorageInfo si) { Array device_types; - for (auto id : si->device_types) { - device_types.push_back(id); + for (const auto& se_scope : si->se_scopes) { + device_types.push_back(se_scope->device_type()); } return device_types; }); @@ -116,7 +124,8 @@ TVM_REGISTER_GLOBAL("relay.ir.StaticMemoryPlan") return StaticMemoryPlan(expr_to_storage_info); }); -// TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc +// TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc, GetMemorySize in +// graph_plan_memory.cc int64_t CalculateRelayExprSizeBytes(const Type& expr_type) { if (expr_type->IsInstance()) { auto tuple_type = Downcast(expr_type); @@ -166,7 +175,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) << ",\n relay_primfuncs=" << node->relay_primfuncs << ")"; }); -Array GetPassPrefix(const Map& targets, bool is_vm) { +Array GetPassPrefix(bool is_homogeneous, bool is_vm) { Array pass_seqs; Array entry_functions{"main"}; pass_seqs.push_back(transform::RemoveUnusedFunctions(entry_functions)); @@ -175,7 +184,7 @@ Array GetPassPrefix(const Map& targets, bool is pass_seqs.push_back(relay::qnn::transform::Legalize()); // Legalize pass is restricted to homogeneous execution for now. - if (targets.size() == 1) { + if (is_homogeneous) { pass_seqs.push_back(transform::Legalize()); } @@ -217,8 +226,8 @@ Array GetPassPrefix(const Map& targets, bool is pass_seqs.push_back(transform::CanonicalizeCast()); pass_seqs.push_back(transform::CanonicalizeOps()); - // Alter layout transformation is only applied to homogeneous execution yet. - if (targets.size() == 1) { + // Alter layout transformation is currently only applied to homogeneous execution. + if (is_homogeneous) { if (!is_vm) { pass_seqs.push_back(transform::InferType()); } diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index 16cbe0e8dbcae..4224a99c26285 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -57,15 +58,17 @@ namespace backend { using Pass = tvm::transform::Pass; /*! - * \brief The static storage information produced by memory planning. + * \brief The static storage information for each Tensor in the result of a Relay expression + * (as per relay::FlattenTupleType). */ class StorageInfoNode : public Object { public: + // TODO(mbs): Switch from struct-of-array to array-of-struct repr throughout. /*! \brief The set of storage ids where the expression is stored.
*/ std::vector storage_ids; - /* \brief The type of "virtual devices" these expressions are stored on. */ - std::vector device_types; - /* \brief The sizes of each storage element. */ + /* \brief The SEScopes these expressions are stored within. */ + std::vector se_scopes; + /* \brief The sizes of each storage element, in bytes. */ std::vector storage_sizes_in_bytes; // TODO(@jroesch): expose the fields @@ -78,7 +81,7 @@ class StorageInfoNode : public Object { /*! \brief The storage information for a single expression. */ class StorageInfo : public ObjectRef { public: - StorageInfo(std::vector storage_ids, std::vector device_types, + StorageInfo(std::vector storage_ids, std::vector se_scopes, std::vector storage_sizes_in_bytes); TVM_DEFINE_OBJECT_REF_METHODS(StorageInfo, ObjectRef, StorageInfoNode); }; @@ -442,11 +445,11 @@ inline bool IsMetaScheduleEnabled() { * difference. This function unifies the shared optimization pass prefix between vm and graph * runtime, and returns the pass prefix given the backend type. * - * \param targets The device type to `Target` mapping. - * \param is_vm A boolean indicating if the passes are used for vm or graph runtime. + * \param is_homogeneous True if all primitives are to be executed on the same device and target. + * \param is_vm True if passes are to be used for the vm executor. * \return An array of passes. */ -Array GetPassPrefix(const TargetMap& targets, bool is_vm); +Array GetPassPrefix(bool is_homogeneous, bool is_vm); /*! \brief Target hash function */ struct TargetStrHash { diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index c4c50c6c5646d..f4bce3c417a8f 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -81,6 +81,9 @@ using namespace tvm::runtime; using namespace tvm::runtime::vm; using namespace relay::transform; +/*! \brief The host device is always stored at device index 0. */ +constexpr Index kHostDeviceIndex = 0; + // (@jroesch): VM passes, eventually declare as passes. bool IsClosure(const Function& func); @@ -93,9 +96,9 @@ using MatchValuePtr = std::shared_ptr; // A runtime object that resides in a register struct RegisterValue : MatchValue { // The register num - RegName rergister_num; + RegName register_num; - explicit RegisterValue(RegName reg) : rergister_num(reg) {} + explicit RegisterValue(RegName reg) : register_num(reg) {} ~RegisterValue() {} }; @@ -227,44 +230,17 @@ std::vector ToAllocTensorShape(NDArray shape) { return raw_shape; } -/*! - * \brief Create a default type. - * \param device_type The device type index. - * \return the default target for the device.
- */ -Target CreateDefaultTarget(int device_type) { - std::string name = runtime::DeviceName(device_type); - if (name == "cpu") return Target("llvm"); - if (name == "cuda") return Target("cuda"); - return Target(name); -} - -int GetFallbackDevice() { - transform::PassContext pass_ctx = PassContext::Current(); - Optional opt_fallback_dev = - pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast(kDLCPU))); - auto fallback_dev = opt_fallback_dev.value(); - ICHECK_GT(fallback_dev->value, 0U); - return fallback_dev->value; -} - class VMFunctionCompiler : DeviceAwareExprFunctor { public: - VMFunctionCompiler(VMCompilerContext* context, TargetMap targets, Target target_host) + VMFunctionCompiler(VMCompilerContext* context, SEScope host_se_scope) : DeviceAwareExprFunctor(context->module), last_register_(0), registers_num_(0), context_(context), - target_host_(target_host) { - CheckAndUpdateHostConsistency(&targets, &target_host); - for (const auto& it : targets) { - targets_[it.first->value] = it.second; - } - target_host_ = target_host; - } + host_se_scope_(std::move(host_se_scope)) {} VMFunction Compile(const GlobalVar& var, const Function& func) { - std::vector params_device_type; + std::vector param_device_indexes; if (IsClosure(func)) { // After lifting we'll have functions of the form: // fn(closure args) { fn(lifted function args) { body } } @@ -273,16 +249,21 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { // Do that flattening on-the-fly here. Function inner_func = Downcast(func->body); std::vector params; - std::vector param_device_types; + std::vector param_se_scopes; params.reserve(func->params.size() + inner_func->params.size()); - param_device_types.reserve(func->params.size() + inner_func->params.size()); + param_se_scopes.reserve(func->params.size() + inner_func->params.size()); + param_device_indexes.reserve(func->params.size() + inner_func->params.size()); for (size_t i = 0; i < func->params.size(); ++i) { params.emplace_back(func->params[i]); - params_device_type.push_back(GetFunctionParamDeviceType(func.get(), i)); + SEScope param_se_scope = GetFunctionParamSEScope(func.get(), i); + param_se_scopes.push_back(param_se_scope); + param_device_indexes.push_back(GetDeviceIndex(param_se_scope)); } for (size_t i = 0; i < inner_func->params.size(); ++i) { params.emplace_back(inner_func->params[i]); - params_device_type.push_back(GetFunctionParamDeviceType(inner_func.get(), i)); + SEScope param_se_scope = GetFunctionParamSEScope(inner_func.get(), i); + param_se_scopes.push_back(param_se_scope); + param_device_indexes.push_back(GetDeviceIndex(param_se_scope)); } std::vector type_params; type_params.reserve(func->type_params.size() + inner_func->type_params.size()); @@ -294,22 +275,17 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { } Function flattened_func = Function(params, inner_func->body, inner_func->ret_type, type_params, func->attrs, func->span); - VisitExpr(MaybeFunctionOnDevice(flattened_func, params_device_type, - GetFunctionResultDeviceType(inner_func.get()))); + VisitExpr(MaybeFunctionOnDevice(flattened_func, param_se_scopes, + GetFunctionResultSEScope(inner_func.get()))); } else { - params_device_type.reserve(func->params.size()); + param_device_indexes.reserve(func->params.size()); for (size_t i = 0; i < func->params.size(); ++i) { - params_device_type.push_back(GetFunctionParamDeviceType(func.get(), i)); + param_device_indexes.push_back(GetDeviceIndex(GetFunctionParamSEScope(func.get(), i))); } VisitExpr(func); } - std::vector 
params_device_type_index; - params_device_type_index.reserve(params_device_type.size()); - for (auto device_type : params_device_type) { - params_device_type_index.push_back(static_cast(device_type)); - } return VMFunction(var->name_hint, params_, instructions_, registers_num_, - params_device_type_index); + std::move(param_device_indexes)); } /*! \brief Attrs objects for each op. */ @@ -352,6 +328,48 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { instructions_.push_back(instr); } + /*! + * \brief Returns the "device index" to represent \p se_scope for primitives + * in emitted code. Note that the host device is always at index 0. + */ + Index GetDeviceIndex(const SEScope& se_scope) { + VLOG(2) << "getting device index for " << se_scope; + auto itr = std::find(context_->se_scopes_.begin(), context_->se_scopes_.end(), se_scope); + if (itr != context_->se_scopes_.end()) { + VLOG(2) << "reusing existing scope"; + return std::distance(context_->se_scopes_.begin(), itr); + } + + ICHECK_GT(context_->se_scopes_.size(), 0); + ICHECK_NE(se_scope, host_se_scope_); + + if (se_scope->device_type() == context_->se_scopes_.front()->device_type()) { + // It's ok if we see distinct scopes which share the host device type. This is because + // we allow the SEScope for the host to be different from the SEScope for primitive + // operations which happen to be, eg, on the CPU. + return 0; + } + + // However, otherwise we allow at most one SEScope per device type. + // TODO(mbs): This will eventually need to account for memory scopes somehow so device_copy + // instructions can do the right thing. + itr = std::find_if(context_->se_scopes_.begin() + 1, context_->se_scopes_.end(), + [&se_scope](const SEScope& existing_se_scope) { + return existing_se_scope->device_type() == se_scope->device_type(); + }); + CHECK(itr == context_->se_scopes_.end()) + << "The VM does not currently support using more than one device with the same device type " + "for primitives, however the program is using the distinct scopes " + << se_scope << " and " << *itr << " of device type " << se_scope->device_type(); + + ICHECK(se_scope != host_se_scope_); + Index index = context_->se_scopes_.size(); + VLOG(2) << "adding new scope"; + context_->se_scopes_.push_back(se_scope); + + return index; + } + using DeviceAwareExprFunctor::VisitExpr_; void VisitExpr_(const ConstantNode* const_node) final { @@ -359,7 +377,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { NDArray data = const_node->data; size_t konst_idx = context_->constants.size(); auto con = GetRef(const_node); - context_->const_device_type.push_back(GetInScopeDeviceType(con)); + context_->const_device_indexes.push_back(GetDeviceIndex(GetSEScope(con))); context_->constants.push_back(const_node->data); Emit(Instruction::LoadConst(konst_idx, NewRegister())); } @@ -463,7 +481,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { void EmitShapeFunc(Function func, Array inputs, Array outputs) { // Lower shape function - tec::CCacheKey key(func, target_host_); + tec::CCacheKey key(func, host_se_scope_->target); auto cfunc = context_->compiler->LowerShapeFunc(key); int op_index = -1; // pick the only function inside the context @@ -498,7 +516,8 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { argument_registers)); } - void EmitInvokeTVMOp(const Function& func, const Expr& inputs, const Expr& outputs) { + void EmitInvokeTVMOp(const Function& func, const Expr& inputs, const Expr& outputs, + SEScope se_scope) { std::vector argument_registers; 
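Note: GetDeviceIndex above interns SEScopes into context_->se_scopes_ under three rules: the host scope is pre-pushed at index 0, any scope sharing the host's device type folds onto index 0, and otherwise at most one scope per device type is admitted (the CHECK rejects a second distinct scope of the same type). A toy model of the interning, with the failure path omitted (Scope is a stand-in):

#include <cassert>
#include <cstddef>
#include <vector>

struct Scope {
  int device_type;
  int virtual_device_id;
  bool operator==(const Scope& o) const {
    return device_type == o.device_type && virtual_device_id == o.virtual_device_id;
  }
};

int GetDeviceIndex(std::vector<Scope>* scopes, const Scope& s) {
  for (std::size_t i = 0; i < scopes->size(); ++i) {
    if ((*scopes)[i] == s) return static_cast<int>(i);  // reuse an existing entry
  }
  if (s.device_type == scopes->front().device_type) return 0;  // folds onto the host
  scopes->push_back(s);  // first scope for this device type gets a fresh index
  return static_cast<int>(scopes->size()) - 1;
}

int main() {
  std::vector<Scope> scopes{{1, 0}};             // host (e.g. kDLCPU) at index 0
  assert(GetDeviceIndex(&scopes, {2, 0}) == 1);  // first cuda scope
  assert(GetDeviceIndex(&scopes, {2, 0}) == 1);  // reused thereafter
  assert(GetDeviceIndex(&scopes, {1, 3}) == 0);  // cpu-typed scope maps to the host
  return 0;
}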
ICHECK(func->HasNonzeroAttr(attr::kPrimitive)) @@ -531,13 +550,9 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { if (func->GetAttr(attr::kCompiler).defined()) { target = Target("ext_dev"); } else { - int dev_type = GetInScopeDeviceType(func); - if (targets_.count(dev_type) == 0) { - target = CreateDefaultTarget(dev_type); - } else { - target = targets_[dev_type]; - } + target = se_scope->target; } + ICHECK(target.defined()) << "No target for function:" << std::endl << PrettyPrint(func); tec::CCacheKey key(func, target); auto mangle_fn = [](String name) { return name; }; @@ -577,9 +592,11 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { OpMatch matcher; matcher .Match("vm.invoke_tvm_op", - [this](const Array& args, const Attrs& attrs, const Array& type_arg) { + [this, call_node](const Array& args, const Attrs& attrs, + const Array& type_arg) { ICHECK_EQ(args.size(), 3); - EmitInvokeTVMOp(Downcast(args[0]), args[1], args[2]); + EmitInvokeTVMOp(Downcast(args[0]), args[1], args[2], + GetSEScope(GetRef(call_node))); }) .Match("memory.alloc_tensor", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { @@ -639,7 +656,8 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { auto dtype = alloc_attrs->dtype; Emit(Instruction::AllocStorage(size_register, alignment, dtype, - alloc_attrs->device_type, NewRegister())); + GetDeviceIndex(alloc_attrs->se_scope), + NewRegister())); }) .Match("vm.shape_func", [this](const Array& args, const Attrs& attrs, const Array& type_arg) { @@ -671,17 +689,17 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { Emit(Instruction::ReshapeTensor(tensor_reg, shape_reg, NewRegister())); }) .Match("device_copy", - [this](const Array& args, const Attrs& attrs, const Array& type_arg) { + [this, call_node](const Array& args, const Attrs& attrs, + const Array& type_arg) { ICHECK_EQ(args.size(), 1U); this->VisitExpr(args[0]); auto src_reg = last_register_; auto device_copy_attrs = attrs.as(); ICHECK(device_copy_attrs != nullptr) << "Must be the device copy attrs"; - Index src_device_type = device_copy_attrs->src_dev_type; - Index dst_device_type = device_copy_attrs->dst_dev_type; - Emit(Instruction::DeviceCopy(src_reg, src_device_type, dst_device_type, - NewRegister())); + Emit(Instruction::DeviceCopy( + src_reg, GetDeviceIndex(device_copy_attrs->src_se_scope), + GetDeviceIndex(device_copy_attrs->dst_se_scope), NewRegister())); }) .Match("memory.kill", [](const Array& args, const Attrs& attrs, const Array& type_arg) { @@ -781,7 +799,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { RegName CompileMatchValue(MatchValuePtr val) { if (std::dynamic_pointer_cast(val)) { auto r = std::dynamic_pointer_cast(val); - return r->rergister_num; + return r->register_num; } else { auto path = std::dynamic_pointer_cast(val); auto p = CompileMatchValue(path->parent); @@ -858,18 +876,15 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { size_t registers_num_; /*! \brief Global shared meta data */ VMCompilerContext* context_; - /*! \brief Target devices. */ - std::unordered_map targets_; - /*! \brief Host target. */ - Target target_host_; + /*! \brief SEScope for data and computation which must reside on a CPU. 
*/ + SEScope host_se_scope_; }; PackedFunc VMCompiler::GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { if (name == "lower") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { ICHECK_EQ(args.num_args, 3); - IRModule mod = args[0]; - this->Lower(mod, args[1], args[2]); + this->Lower(args[0], args[1], args[2]); }); } else if (name == "codegen") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { @@ -909,15 +924,16 @@ void VMCompiler::SetParam(const std::string& name, runtime::NDArray data_in) { params_[name] = data_in; } -void VMCompiler::Lower(IRModule mod, const tvm::TargetMap& targets, - const tvm::Target& target_host) { +void VMCompiler::Lower(IRModule mod, TargetMap targets, tvm::Target target_host) { exec_ = make_object(); - targets_ = targets; - target_host_ = target_host; - CheckAndUpdateHostConsistency(&targets_, &target_host_); + config_ = CompilationConfig(PassContext::Current(), std::move(targets), std::move(target_host)); + + // The first device is always for the host. + CHECK(context_.se_scopes_.empty()); + context_.se_scopes_.push_back(config_->host_se_scope); // Run the optimizations necessary to target the VM. - context_.module = OptimizeModule(mod, targets_, target_host_); + context_.module = OptimizeModuleImpl(std::move(mod)); // Populate the global map. // @@ -933,7 +949,7 @@ void VMCompiler::Lower(IRModule mod, const tvm::TargetMap& targets, auto gvar = named_func.first; if (auto* n = named_func.second.as()) { auto func = GetRef(n); - VMFunctionCompiler func_compiler(&context_, targets_, target_host_); + VMFunctionCompiler func_compiler(&context_, config_->host_se_scope); auto vm_func = func_compiler.Compile(gvar, func); size_t func_index = context_.global_map.at(gvar); @@ -947,17 +963,27 @@ void VMCompiler::Lower(IRModule mod, const tvm::TargetMap& targets, } } + // Populate virtual devices and the host device index. + for (const auto& se_scope : context_.se_scopes_) { + ICHECK(!se_scope->IsFullyUnconstrained()); + ICHECK_GT(se_scope->device_type(), 0); + // TODO(mbs): We forget the memory scope. + exec_->virtual_devices.push_back( + Device{/*device_type=*/se_scope->device_type(), /*device_id=*/se_scope->virtual_device_id}); + } + exec_->host_device_index = kHostDeviceIndex; + // populate constants - for (auto data : context_.constants) { + for (const auto& data : context_.constants) { exec_->constants.push_back(data); } - for (auto i : context_.const_device_type) { - exec_->const_device_type.push_back(i); + for (auto index : context_.const_device_indexes) { + exec_->const_device_indexes.push_back(index); } // update global function map - for (auto gv : context_.global_map) { + for (const auto& gv : context_.global_map) { exec_->global_map.insert({gv.first->name_hint, gv.second}); } @@ -976,13 +1002,13 @@ void VMCompiler::Lower(IRModule mod, const tvm::TargetMap& targets, backend::UpdateAutoSchedulerOpWeights(context_.compiler); } -transform::Sequential MemoryOpt(tvm::Target host_target, tvm::TargetMap targets) { +transform::Sequential MemoryOpt(const SEScope& cpu_se_scope) { Array pass_seqs; // Remove unused functions Array entry_functions{"main"}; pass_seqs.push_back(transform::RemoveUnusedFunctions(entry_functions)); // Manifest the allocations. - pass_seqs.push_back(transform::ManifestAlloc(host_target, targets)); + pass_seqs.push_back(transform::ManifestAlloc(cpu_se_scope)); // Compute away possibly introduced constant computation. 
pass_seqs.push_back(transform::FoldConstant()); @@ -991,7 +1017,7 @@ transform::Sequential MemoryOpt(tvm::Target host_target, tvm::TargetMap targets) pass_seqs.push_back(transform::FuseOps()); // Manifest the allocations needed for the shape functions. - pass_seqs.push_back(transform::ManifestAlloc(host_target, targets)); + pass_seqs.push_back(transform::ManifestAlloc(cpu_se_scope)); // Fuse the shape functions. pass_seqs.push_back(transform::FuseOps()); @@ -1009,7 +1035,7 @@ transform::Sequential MemoryOpt(tvm::Target host_target, tvm::TargetMap targets) pass_seqs.push_back(transform::FuseOps()); // Create allocations for math introduced by dynamic region math. - pass_seqs.push_back(transform::ManifestAlloc(host_target, targets)); + pass_seqs.push_back(transform::ManifestAlloc(cpu_se_scope)); // Compute away possibly introduced constant computation. pass_seqs.push_back(transform::FoldConstant()); @@ -1019,15 +1045,22 @@ transform::Sequential MemoryOpt(tvm::Target host_target, tvm::TargetMap targets) // instructions need to access to constant // pass_seqs.push_back(transform::LiftConstants()); - return transform::Sequential(pass_seqs); + return transform::Sequential(std::move(pass_seqs)); } -IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetMap& targets_arg, - const Target& target_host_arg) { +IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetMap& targets, + const Target& target_host) { + config_ = CompilationConfig(PassContext::Current(), targets, target_host); + // The first device always corresponds to the host. + CHECK(context_.se_scopes_.empty()); + context_.se_scopes_.push_back(config_->host_se_scope); + // TODO(mbs): exec_ is not allocated. What is the API here? + CHECK(exec_ == nullptr); + return OptimizeModuleImpl(std::move(mod)); +} + +IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) { VLOG_CONTEXT << "VMCompiler::OptimizeModule"; - TargetMap targets = targets_arg; - Target target_host = target_host_arg; - CheckAndUpdateHostConsistency(&targets, &target_host); if (params_.size()) { BaseFunc base_func = mod->Lookup("main"); ICHECK(base_func->IsInstance()) @@ -1037,29 +1070,24 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetMap& targets_arg, mod->Add(gvar, f); } - Array pass_seqs = relay::backend::GetPassPrefix(targets, true); + Array pass_seqs = relay::backend::GetPassPrefix( + /*is_homogeneous=*/config_->optional_homogeneous_target.defined(), /*is_vm=*/true); - // TODO(mbs): Reconcile with relay/backend/build_module.cc - DLDeviceType default_device_type; - if (targets_arg.size() == 1) { - default_device_type = - static_cast(static_cast((*targets_arg.begin()).first->value)); - } else { - default_device_type = static_cast(GetFallbackDevice()); - } - pass_seqs.push_back(PlanDevices(default_device_type)); + // Always plan devices so the remaining passes don't need to distinguish homogeneous vs + // heterogeneous execution. + pass_seqs.push_back(transform::PlanDevices(config_)); pass_seqs.push_back(transform::FuseOps()); // Do layout rewrite for auto-scheduler.
transform::PassContext pass_ctx = PassContext::Current(); - if (backend::IsAutoSchedulerEnabled() && targets.size() == 1) { - const auto& target = (*targets.begin()).second; + if (backend::IsAutoSchedulerEnabled() && config_->optional_homogeneous_target.defined()) { Pass major_pass = transform::AutoSchedulerLayoutRewrite(); bool enable_layout_rewrite_targets = - target->kind->device_type == kDLCPU || target->GetAttr("device", "") == "mali"; + config_->optional_homogeneous_target->kind->device_type == kDLCPU || + config_->optional_homogeneous_target->GetAttr("device", "") == "mali"; if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) { - With tctx(target); + With tctx(config_->optional_homogeneous_target); pass_seqs.push_back(major_pass); // Defuse ops to fold constants, then fuse them again pass_seqs.push_back(transform::DefuseOps()); @@ -1080,18 +1108,18 @@ IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetMap& targets_arg, // external codegen. pass_seqs.push_back(transform::Inline()); - pass_seqs.push_back(MemoryOpt(target_host, targets)); + pass_seqs.push_back(MemoryOpt(config_->host_se_scope)); pass_seqs.push_back(transform::InferType()); pass_seqs.push_back(transform::LabelOps()); transform::Sequential seq(pass_seqs); tvm::With ctx(pass_ctx); - if (targets.size() == 1) { - const auto& it = targets.begin(); - With tctx((*it).second); - return seq(mod); + if (config_->optional_homogeneous_target.defined()) { + With tctx(config_->optional_homogeneous_target); + return seq(std::move(mod)); + } else { + return seq(std::move(mod)); } - return seq(mod); } void VMCompiler::PopulateGlobalMap() { @@ -1138,13 +1166,14 @@ void VMCompiler::Codegen() { runtime::Module lib; if (funcs.size() > 0) { - lib = tvm::build(funcs, target_host_); + lib = tvm::build(funcs, config_->host_target); } else { // There is no function handled by TVM. We create a virtual main module // to make sure a DSO module will be also available. lib = codegen::CSourceModuleCreate(";", "", Array{}); } - lib = codegen::CreateMetadataModule(params_, lib, ext_mods, target_host_, runtime::Metadata()); + lib = codegen::CreateMetadataModule(params_, lib, ext_mods, config_->host_target, + runtime::Metadata()); exec_->SetLib(lib); } diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 5b51d7821d78b..2edec70d5c3be 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -78,17 +78,21 @@ struct VMCompilerContext { tec::TECompiler compiler; // List of constants std::vector constants; - // Device type for constants - std::vector const_device_type; + // Device indexes for constants + std::vector const_device_indexes; // List of cached functions std::vector cached_funcs; // The functions that have been lowered. std::unordered_map seen_funcs; + // The SEScopes corresponding to each device index. The first device always corresponds + // to the host device, and all remaining devices are for the primitive operations. + std::vector se_scopes_; }; class VMCompiler : public runtime::ModuleNode { public: - virtual ~VMCompiler() {} + VMCompiler() = default; + virtual ~VMCompiler() = default; virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); @@ -110,7 +114,7 @@ class VMCompiler : public runtime::ModuleNode { * to target mapping. For homogeneous compilation, it is a singleton build target. * \param target_host Host compilation target, if target is device. 
*/ - void Lower(IRModule mod, const TargetMap& targets, const tvm::Target& target_host); + void Lower(IRModule mod, TargetMap targets, Target target_host); /*! \brief Generate the machine code for lowered functions. */ void Codegen(); @@ -128,6 +132,8 @@ class VMCompiler : public runtime::ModuleNode { */ IRModule OptimizeModule(IRModule mod, const TargetMap& targets, const Target& target_host); + IRModule OptimizeModuleImpl(IRModule mod); + /*! * \brief Populate the global function names in a map where the value is used * as the index by the VMFunctions. @@ -135,10 +141,8 @@ class VMCompiler : public runtime::ModuleNode { void PopulateGlobalMap(); protected: - /*! \brief Target devices. */ - TargetMap targets_; - /*! \brief Target host device. */ - tvm::Target target_host_; + /*! \brief Targets and scopes needed for compilation. */ + CompilationConfig config_; /*! \brief Global shared meta data */ VMCompilerContext context_; /*! \brief Compiled executable. */ diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index d9a2b8b91fa35..ffd0e466eb24e 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -112,7 +112,7 @@ class LambdaLifter : public transform::DeviceAwareExprMutator { auto free_type_vars = FreeTypeVars(func, module_); Array captured_vars; - std::vector captured_var_device_types; + std::vector captured_var_se_scopes; bool recursive = false; for (const auto& var : free_vars) { if (!letrec_.empty() && var == letrec_.back()) { @@ -120,7 +120,7 @@ class LambdaLifter : public transform::DeviceAwareExprMutator { continue; } captured_vars.push_back(var); - captured_var_device_types.push_back(GetInScopeDeviceType(var)); + captured_var_se_scopes.push_back(GetSEScope(var)); } // Freshen all the captured vars. 
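Note: LambdaLifter below now records an SEScope for each captured variable and for the result, and reattaches them through MaybeFunctionOnDevice, which leaves the function unannotated only when every scope is fully unconstrained. A toy version of that guard, with illustrative types:

#include <cassert>
#include <vector>

struct S {
  int device_type = 0;
  bool IsFullyUnconstrained() const { return device_type == 0; }
};

// Annotate only if at least one parameter scope or the result scope is constrained.
bool NeedsDeviceAttrs(const std::vector<S>& param_scopes, const S& result_scope) {
  for (const auto& s : param_scopes) {
    if (!s.IsFullyUnconstrained()) return true;
  }
  return !result_scope.IsFullyUnconstrained();
}

int main() {
  assert(!NeedsDeviceAttrs({S{}, S{}}, S{}));  // nothing known: leave the function bare
  assert(NeedsDeviceAttrs({S{}, S{2}}, S{}));  // one constrained capture forces attrs
  return 0;
}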
@@ -132,7 +132,7 @@ class LambdaLifter : public transform::DeviceAwareExprMutator { rebinding_map.Set(free_var, var); } - DLDeviceType result_device_type = GetInScopeDeviceType(func_node->body); + SEScope result_se_scope = GetSEScope(func_node->body); if (recursive) { if (!captured_vars.empty()) { @@ -195,8 +195,7 @@ class LambdaLifter : public transform::DeviceAwareExprMutator { lifted_func = Function(typed_captured_vars, rebound_body, /*ret_type=*/func->func_type_annotation(), free_type_vars, /*attrs=*/{}, func->span); - lifted_func = - MaybeFunctionOnDevice(lifted_func, captured_var_device_types, result_device_type); + lifted_func = MaybeFunctionOnDevice(lifted_func, captured_var_se_scopes, result_se_scope); lifted_func = MarkClosure(lifted_func); } diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index 9a2297a759626..e9441f1b3e58e 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -32,6 +32,7 @@ #include #include "../op/annotation/annotation.h" +#include "../op/memory/on_device.h" namespace tvm { namespace relay { @@ -529,11 +530,11 @@ Expr Bind(const Expr& expr, const tvm::Map& args_map) { if (const FunctionNode* func = expr.as()) { Expr new_body = ExprBinder(args_map).VisitExpr(func->body); Array new_params; - std::vector new_param_device_types; + std::vector new_param_se_scopes; for (size_t i = 0; i < func->params.size(); ++i) { if (!args_map.count(func->params[i])) { new_params.push_back(func->params[i]); - new_param_device_types.push_back(GetFunctionParamDeviceType(func, i)); + new_param_se_scopes.push_back(GetFunctionParamSEScope(func, i)); } } if (new_body.same_as(func->body) && new_params.size() == func->params.size()) { @@ -541,7 +542,7 @@ Expr Bind(const Expr& expr, const tvm::Map& args_map) { } auto ret = Function(new_params, new_body, func->ret_type, func->type_params, func->attrs, func->span); - ret = MaybeFunctionOnDevice(ret, new_param_device_types, GetFunctionResultDeviceType(func)); + ret = MaybeFunctionOnDevice(ret, new_param_se_scopes, GetFunctionResultSEScope(func)); std::unordered_set set; for (const auto& v : FreeVars(expr)) { set.insert(v); @@ -549,19 +550,19 @@ Expr Bind(const Expr& expr, const tvm::Map& args_map) { for (const auto& v : FreeVars(ret)) { if (set.count(v) == 0) { new_params.push_back(v); - if (GetFunctionResultDeviceType(func) != kInvalidDeviceType) { + if (!GetFunctionResultSEScope(func)->IsFullyUnconstrained()) { // TODO(mbs): The function has been annotated with a device, which means we are supposed // to be preserving device annotations on every transformation. However there's no // such context for the free vars in args_map. 
LOG(WARNING) << "introduced free var '" << PrettyPrint(v) << "' into function body but no device is known for it"; } - new_param_device_types.push_back(kInvalidDeviceType); + new_param_se_scopes.push_back(SEScope::FullyUnconstrained()); } } ret = Function(new_params, new_body, func->ret_type, func->type_params, func->attrs, func->span); - ret = MaybeFunctionOnDevice(ret, new_param_device_types, GetFunctionResultDeviceType(func)); + ret = MaybeFunctionOnDevice(ret, new_param_se_scopes, GetFunctionResultSEScope(func)); ICHECK_EQ(FreeVars(expr).size(), FreeVars(ret).size()); return std::move(ret); } else { diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 27b61333c9eb4..bd3162dfde869 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -38,158 +38,6 @@ namespace tvm { namespace relay { -TVM_REGISTER_NODE_TYPE(OnDeviceAttrs); - -const Op& OnDeviceOp() { - static const Op& op = Op::Get("on_device"); - return op; -} - -Expr OnDevice(Expr expr, DLDeviceType device_type, bool is_fixed) { - auto attrs = make_object(); - attrs->device_type = device_type; - attrs->is_fixed = is_fixed; - Span span = expr->span; - return Call(OnDeviceOp(), {std::move(expr)}, Attrs(std::move(attrs)), /*type_args=*/{}, span); -} - -Expr MaybeOnDevice(Expr expr, DLDeviceType device_type, bool is_fixed) { - if (device_type == kInvalidDeviceType) { - // Undefined signals no annotation is required. - return expr; - } - if (expr->IsInstance() || expr->IsInstance()) { - // These operators are device polymorphic so no annotation is required. - // TODO(mbs): The device planning pass does NOT currently support device polymorphism for - // constructors, so we could remove them from this condition. However most constructors - // accept type parameters, and it is not well-formed Relay to simply wrap such a - // constructor in an "on_device" call. So we'll pretend they are device polymorphic to - // avoid that difficultly. Overall ADTs need more work to be fully supported. - return expr; - } - if (expr->IsInstance() || expr->IsInstance()) { - // The device can be recovered from the binding site of the global or local variable. - return expr; - } - if (expr->IsInstance()) { - // If a primitive function then it is device polymorphic. Otherwise the device is captured - // by the function's attributes. - return expr; - } - OnDeviceProps props = GetOnDeviceProps(expr); - if (props.body.defined()) { - // Don't nest on_devices. - // If the inner and outer device types differ then we need to be careful: - // - If the inner on_device is_fixed then it disagrees with the outer. - // - If the outer on_device is_fixed then it implies a hidden device_copy - // Otherwise just use the inner device type and ignore the outer. 
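Note: the "on_device" machinery deleted from annotation.cc here is relocated rather than dropped (cf. the new include of "../op/memory/on_device.h" in expr_functor.cc above). Its nesting rule keeps the innermost annotation's device and ORs the is_fixed flags, which can be rendered as a tiny standalone sketch:

#include <cassert>

struct Props {
  int device;     // the innermost annotation wins
  bool is_fixed;  // sticky across nesting
};

Props Flatten(const Props& inner, int outer_device, bool outer_fixed) {
  (void)outer_device;  // ignored: the inner annotation's device is kept
  return {inner.device, inner.is_fixed || outer_fixed};
}

int main() {
  // on_device(on_device(e, device=2), device=1, is_fixed=True) behaves as {e, 2, true}.
  Props flat = Flatten({2, false}, 1, true);
  assert(flat.device == 2 && flat.is_fixed);
  return 0;
}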
- ICHECK(props.device_type == device_type || (!is_fixed && !props.is_fixed)); - return OnDevice(props.body, device_type, is_fixed || props.is_fixed); - } - return OnDevice(expr, device_type, is_fixed); -} - -TVM_REGISTER_GLOBAL("relay.op.annotation._make.on_device") - .set_body_typed([](Expr expr, int device_type, bool is_fixed) { - return OnDevice(expr, static_cast(device_type), is_fixed); - }); - -RELAY_REGISTER_OP("on_device") - .describe(R"code(Annotate an expression with device type)code" TVM_ADD_FILELINE) - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input data.") - .set_support_level(10) - .add_type_rel("Identity", IdentityRel) - .set_attrs_type_key("relay.attrs.OnDeviceAttrs") - .set_attr("TOpPattern", kOpaque) - .set_attr("TOpIsStateful", false) - .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) - .set_attr("TNonComputational", true); - -OnDeviceProps GetOnDeviceProps(const CallNode* call_node) { - if (call_node->op == OnDeviceOp()) { - ICHECK_EQ(call_node->args.size(), 1) << "on_device expects one argument"; - ICHECK(call_node->attrs.defined()) << "on_device requires attributes"; - const auto* on_device_attrs = call_node->attrs.as(); - ICHECK(on_device_attrs != nullptr) << "on_device requires OnDeviceAttrs"; - auto device_type = static_cast(on_device_attrs->device_type); - // Follow nesting: - // on_device(on_device(expr, device_type=1), device_type=2) == {expr, 1} - auto inner = GetOnDeviceProps(call_node->args[0]); - if (inner.body.defined()) { - return {inner.body, inner.device_type, on_device_attrs->is_fixed || inner.is_fixed}; - } else { - return {call_node->args[0], device_type, on_device_attrs->is_fixed}; - } - } - return {}; -} - -OnDeviceProps GetOnDeviceProps(const Expr& expr) { - if (const auto* call_node = expr.as()) { - return GetOnDeviceProps(call_node); - } - return {}; -} - -Function FunctionOnDevice(Function function, Array param_device_types, - Integer result_device_type) { - return WithAttrs(std::move(function), {{tvm::attr::kParamDeviceTypes, param_device_types}, - {tvm::attr::kResultDeviceType, result_device_type}}); -} - -Function FunctionOnDevice(Function function, const std::vector& param_device_types, - DLDeviceType result_device_type) { - Array arr; - arr.reserve(param_device_types.size()); - for (const auto device_type : param_device_types) { - arr.push_back(static_cast(device_type)); - } - return FunctionOnDevice(std::move(function), std::move(arr), - static_cast(result_device_type)); -} - -Function MaybeFunctionOnDevice(Function function, - const std::vector& param_device_types, - DLDeviceType result_device_type) { - if (std::all_of(param_device_types.begin(), param_device_types.end(), - [](DLDeviceType type) { return type == kInvalidDeviceType; }) && - result_device_type == kInvalidDeviceType) { - return function; - } - return FunctionOnDevice(function, param_device_types, result_device_type); -} - -TVM_REGISTER_GLOBAL("relay.op.annotation._make.function_on_device") - .set_body_typed([](Function function, Array param_device_types, - int result_device_type) { - return FunctionOnDevice(function, param_device_types, - static_cast(result_device_type)); - }); - -DLDeviceType GetFunctionResultDeviceType(const FunctionNode* function_node) { - auto opt_integer = function_node->GetAttr(tvm::attr::kResultDeviceType); - if (!opt_integer) { - // No annotation. 
- return kInvalidDeviceType; - } - return static_cast(opt_integer.value()->value); -} - -DLDeviceType GetFunctionParamDeviceType(const FunctionNode* function_node, size_t i) { - ICHECK_LT(i, function_node->params.size()) - << "param index " << i << " out of range for function of arity " - << function_node->params.size(); - auto opt_array = function_node->GetAttr>(tvm::attr::kParamDeviceTypes); - if (!opt_array) { - // No annotation. - return kInvalidDeviceType; - } - ICHECK_EQ(opt_array.value().size(), function_node->params.size()) - << "annotation parameters do not match function arity"; - return static_cast(opt_array.value()[i]->value); -} - Expr StopFusion(Expr data) { static const Op& op = Op::Get("annotation.stop_fusion"); return Call(op, {data}, Attrs{}, {}); diff --git a/src/relay/op/annotation/annotation.h b/src/relay/op/annotation/annotation.h index d772df9b023a3..1675b7281ebb6 100644 --- a/src/relay/op/annotation/annotation.h +++ b/src/relay/op/annotation/annotation.h @@ -34,112 +34,6 @@ namespace tvm { namespace relay { -/*! \brief Returns the "on_device" operator. */ -const Op& OnDeviceOp(); - -/*! - * \brief Wraps \p expr in an "on_device" CallNode for \p device_type and \p is_fixed. - * - * See \p OnDeviceAttrs for an overview. - */ -Expr OnDevice(Expr expr, DLDeviceType device_type, bool is_fixed); - -/*! - * \brief Wraps \p expr in an "on_device" CallNode for \p device_type and \p is_fixed if the - * device for \p expr cannot otherwise be recovered by the lexical scoping convention. This means - * we will NOT wrap if: - * - \p device_type is \p kInvalidDeviceType, which signals there are no device annotations - * already in play. - * - \p expr is an operator or primitive function literal. These are device polymorphic. - * - \p expr is a non-primitive function literal. The device is captured by the - * "result_device_type" attribute on the function itself. - * - \p expr is a global var. The device is on the function attributes the global is bound to. - * - \p expr is a local var. The device is tracked by the device aware visitors for us. - * - \p expr is a constructor. These should eventually be device polymorphic but are currently - * in an in-between state at the moment. - */ -Expr MaybeOnDevice(Expr expr, DLDeviceType device_type, bool is_fixed); - -/*! \brief Result of \p GetOnDeviceProps. */ -struct OnDeviceProps { - Expr body; // = null - DLDeviceType device_type = kInvalidDeviceType; - bool is_fixed = false; - - OnDeviceProps() = default; - - OnDeviceProps(const Expr& body, DLDeviceType deviceType, bool isFixed) - : body(body), device_type(deviceType), is_fixed(isFixed) {} -}; - -/*! - * \brief Returns the body expression, device type and is_fixed field for \p call_node if it is - * an "on_device" CallNode. Otherwise returns the null expression, \p kInvalidDeviceType and \p - * false. - */ -OnDeviceProps GetOnDeviceProps(const CallNode* call_node); - -/*! - * \brief Returns the body expression, device type and is_fixed field for \p expr if it is an - * "on_device" CallNode. Otherwise returns the null expression, \p kInvalidDeviceType and \p false. - */ -OnDeviceProps GetOnDeviceProps(const Expr& expr); - -/*! - * \brief Returns the body of \p expr if it is an "on_device" annotation, otherwise returns - * \p expr directly. - */ -inline Expr IgnoreOnDevice(const Expr& expr) { - OnDeviceProps props = GetOnDeviceProps(expr); - return props.body.defined() ? props.body : expr; -} - -/*! - * \brief Returns \p expr as \p NodeType, or null if it is not of that type. 
Looks through - * any "on_device" annotations. - */ -template -const NodeType* AsIgnoringOnDevice(const Expr& expr) { - const auto* node = expr.as(); - if (node != nullptr) { - return node; - } - OnDeviceProps props = GetOnDeviceProps(expr); - if (!props.body.defined()) { - return nullptr; - } - return props.body.as(); -} - -/*! - * \brief Returns \p function annotated with "param_device_types" and "result_device_type" - * attributes capturing parameter and result devices types respectively. - */ -Function FunctionOnDevice(Function function, Array param_device_types, - Integer body_device_type); -Function FunctionOnDevice(Function function, const std::vector& param_device_types, - DLDeviceType body_device_type); - -/*! - * \brief As for \p FunctionOnDevice, but returns \p function unchanged if all parameters and - * result device types are \p kInvalidDeviceType. - */ -Function MaybeFunctionOnDevice(Function function, - const std::vector& param_device_types, - DLDeviceType result_device_type); - -/*! - * \brief Returns the device type for the resut of \p function_node, or \p kInvalidDeviceType - * if function does not have "result_device_type" annotation. - */ -DLDeviceType GetFunctionResultDeviceType(const FunctionNode* function_node); - -/*! - * \brief Returns the device type for the \p i'th parameter of \p function_node, or - * \p kInvalidDeviceType if function does not have "param_device_types" annotation. - */ -DLDeviceType GetFunctionParamDeviceType(const FunctionNode* function_node, size_t i); - /*! \brief Wraps \p data in a "stop_fusion" annotation. */ Expr StopFusion(Expr data); diff --git a/src/relay/op/memory/device_copy.cc b/src/relay/op/memory/device_copy.cc index dce89aa91b65a..d086eb1dc1840 100644 --- a/src/relay/op/memory/device_copy.cc +++ b/src/relay/op/memory/device_copy.cc @@ -30,6 +30,8 @@ #include #include +#include + #include "../../transforms/infer_layout_utils.h" #include "../type_relations.h" @@ -44,29 +46,27 @@ const Op& DeviceCopyOp() { return op; } -Expr DeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type) { +Expr DeviceCopy(Expr expr, SEScope src_se_scope, SEScope dst_se_scope) { + ICHECK(!src_se_scope->IsFullyUnconstrained()); + ICHECK(!dst_se_scope->IsFullyUnconstrained()); auto attrs = make_object(); - attrs->src_dev_type = src_dev_type; - attrs->dst_dev_type = dst_dev_type; + attrs->src_se_scope = std::move(src_se_scope); + attrs->dst_se_scope = std::move(dst_se_scope); Span span = expr->span; - return Call(DeviceCopyOp(), {std::move(expr)}, Attrs(attrs), /*type_args=*/{}, span); + return Call(DeviceCopyOp(), {std::move(expr)}, Attrs(std::move(attrs)), /*type_args=*/{}, + std::move(span)); } -Expr OptDeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type) { - if (src_dev_type == dst_dev_type) { +TVM_REGISTER_GLOBAL("relay.op._make.DeviceCopy").set_body_typed(DeviceCopy); + +Expr MaybeDeviceCopy(Expr expr, SEScope src_se_scope, SEScope dst_se_scope) { + if (src_se_scope == dst_se_scope) { + // No copy needed. 
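+    // (Reviewer sketch, not part of this change: given any two constrained
+    // SEScopes s and t,
+    //   MaybeDeviceCopy(x, s, s)  // returns x unchanged
+    //   MaybeDeviceCopy(x, s, t)  // s != t: wraps x in a "device_copy" call
+    // so rewriting passes can call it unconditionally.)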
   return expr;
 }
-  ICHECK_NE(src_dev_type, kInvalidDeviceType);
-  ICHECK_NE(dst_dev_type, kInvalidDeviceType);
-  return DeviceCopy(expr, src_dev_type, dst_dev_type);
+  return DeviceCopy(std::move(expr), std::move(src_se_scope), std::move(dst_se_scope));
 }
 
-TVM_REGISTER_GLOBAL("relay.op._make.device_copy")
-    .set_body_typed([](Expr expr, int src_dev_type, int dst_dev_type) {
-      return DeviceCopy(expr, static_cast<DLDeviceType>(src_dev_type),
-                        static_cast<DLDeviceType>(dst_dev_type));
-    });
-
 RELAY_REGISTER_OP("device_copy")
     .describe(R"code(
 Copy data from one tensor to another. The source and destination might be
@@ -92,16 +92,14 @@ DeviceCopyProps GetDeviceCopyProps(const CallNode* call_node) {
     ICHECK(call_node->attrs.defined()) << "device_copy requires attributes";
     const auto* device_copy_attrs = call_node->attrs.as<DeviceCopyAttrs>();
     ICHECK(device_copy_attrs != nullptr) << "device_copy requires DeviceCopyAttrs";
-    auto src_dev_type = static_cast<DLDeviceType>(device_copy_attrs->src_dev_type);
-    auto dst_dev_type = static_cast<DLDeviceType>(device_copy_attrs->dst_dev_type);
     // Follow nesting:
-    // device_copy(device_copy(expr, src_dev_type=1, dst_dev_type=2),
-    //             src_dev_type=2, dst_dev_type=3) ==> {expr, 1, 3}
+    // device_copy(device_copy(expr, src_se_scope=S, dst_se_scope=T),
+    //             src_se_scope=T, dst_se_scope=U) ==> {expr, S, U}
     auto inner = GetDeviceCopyProps(call_node->args[0]);
     if (inner.body.defined()) {
-      return {inner.body, inner.src_dev_type, inner.dst_dev_type};
+      return {inner.body, inner.src_se_scope, device_copy_attrs->dst_se_scope};
     } else {
-      return {call_node->args[0], src_dev_type, dst_dev_type};
+      return {call_node->args[0], device_copy_attrs->src_se_scope, device_copy_attrs->dst_se_scope};
     }
   }
   return {};
diff --git a/src/relay/op/memory/device_copy.h b/src/relay/op/memory/device_copy.h
index d21fdb6abe198..3b40f410e53ba 100644
--- a/src/relay/op/memory/device_copy.h
+++ b/src/relay/op/memory/device_copy.h
@@ -28,6 +28,8 @@
 #include 
 #include 
+#include <tvm/target/se_scope.h>
+
 namespace tvm {
 namespace relay {
 
@@ -35,41 +37,43 @@ namespace relay {
 const Op& DeviceCopyOp();
 
 /*!
- * \brief Wraps \p expr in a "device_copy" CallNode indicating it should be evaluated on
- * a device of type \p src_dev_type but then copied to a device of type \p dst_dev_type.
+ * \brief Wraps \p expr in a "device_copy" CallNode indicating it should be evaluated and
+ * stored at \p src_se_scope but then copied to \p dst_se_scope.
  */
-Expr DeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type);
+Expr DeviceCopy(Expr expr, SEScope src_se_scope, SEScope dst_se_scope);
 
 /*!
- * \brief Wraps \p expr in a "device_copy" CallNode indicating it should be evaluated on
- * a device of type \p src_dev_type but then copied to a device of type \p dst_dev_type.
- * However, return \p expr directly if \p src_dev_type equals \p dst_dev_type.
+ * \brief Wraps \p expr in a "device_copy" CallNode indicating it should be evaluated and
+ * stored at \p src_se_scope but then copied to \p dst_se_scope. However, return \p expr
+ * directly if \p src_se_scope and \p dst_se_scope are (structurally) the same.
  */
-Expr MaybeDeviceCopy(Expr expr, DLDeviceType src_dev_type, DLDeviceType dst_dev_type);
+Expr MaybeDeviceCopy(Expr expr, SEScope src_se_scope, SEScope dst_se_scope);
 
 /*! \brief Result of \p GetDeviceCopyProps.
 */
 struct DeviceCopyProps {
   Expr body;  // = null
-  DLDeviceType src_dev_type = kInvalidDeviceType;
-  DLDeviceType dst_dev_type = kInvalidDeviceType;
+  SEScope src_se_scope = SEScope::FullyUnconstrained();
+  SEScope dst_se_scope = SEScope::FullyUnconstrained();
 
   DeviceCopyProps() = default;
 
-  DeviceCopyProps(const Expr& body, DLDeviceType srcDevType, DLDeviceType dstDevType)
-      : body(body), src_dev_type(srcDevType), dst_dev_type(dstDevType) {}
+  DeviceCopyProps(Expr body, SEScope src_se_scope, SEScope dst_se_scope)
+      : body(std::move(body)),
+        src_se_scope(std::move(src_se_scope)),
+        dst_se_scope(std::move(dst_se_scope)) {}
 };
 
 /*!
- * \brief Returns the body expression, source, and destination device types for \p call_node if it
- * is a "device_copy" CallNode. Otherwise returns the null expression and \p kInvalidDeviceType
- * device types.
+ * \brief Returns the body expression, source, and destination \p SEScopes for \p call_node
+ * if it is a "device_copy" CallNode. Otherwise returns the null expression and unconstrained
+ * \p SEScopes.
  */
 DeviceCopyProps GetDeviceCopyProps(const CallNode* call_node);
 
 /*!
- * \brief Returns the body expression, source, and destination device types for \p expr if it
- * is a "device_copy" CallNode. Otherwise returns the null expression and \p kInvalidDeviceType
- * device types.
+ * \brief Returns the body expression, source, and destination \p SEScopes for \p expr if it
+ * is a "device_copy" Call. Otherwise returns the null expression and unconstrained
+ * \p SEScopes.
  */
 DeviceCopyProps GetDeviceCopyProps(const Expr& expr);
 
diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc
index 08e92b31965e4..0574fd50f4b67 100644
--- a/src/relay/op/memory/memory.cc
+++ b/src/relay/op/memory/memory.cc
@@ -32,12 +32,14 @@
 #include 
 #include 
+#include <tvm/target/se_scope.h>
 #include 
 #include 
 
 #include "../../transforms/infer_layout_utils.h"
 #include "../annotation/annotation.h"
 #include "../op_common.h"
 #include "../type_relations.h"
+#include "on_device.h"
 
 namespace tvm {
 namespace relay {
@@ -48,13 +50,12 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs);
 
 // The passing value in attrs and args doesn't seem super great.
 // We should consider a better solution, i.e the type relation
 // being able to see the arguments as well?
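 // (Reviewer sketch, not part of this change: the new call shape folds the old
 // device_id/device_type pair into a single constrained SEScope argument, e.g.
 //   Expr storage = AllocStorage(size, alignment, se_scope, DataType::Float(32));
 // for any se_scope canonicalized by the CompilationConfig used later in this PR.)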
-Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint) { +Expr AllocStorage(Expr size, Expr alignment, SEScope se_scope, DataType dtype_hint) { auto attrs = make_object(); attrs->dtype = dtype_hint; - attrs->device_id = dev.device_id; - attrs->device_type = dev.device_type; + attrs->se_scope = std::move(se_scope); static const Op& op = Op::Get("memory.alloc_storage"); - return Call(op, {size, alignment}, Attrs(attrs), {}); + return Call(op, {std::move(size), std::move(alignment)}, Attrs(std::move(attrs)), {}); } TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage); diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h index 558c409782f57..618044a9f2ca3 100644 --- a/src/relay/op/memory/memory.h +++ b/src/relay/op/memory/memory.h @@ -25,6 +25,8 @@ #ifndef TVM_RELAY_OP_MEMORY_MEMORY_H_ #define TVM_RELAY_OP_MEMORY_MEMORY_H_ +#include + #include #include "tvm/relay/expr.h" @@ -32,7 +34,7 @@ namespace tvm { namespace relay { -Expr AllocStorage(Expr size, Expr alignment, Device dev, DataType dtype_hint); +Expr AllocStorage(Expr size, Expr alignment, SEScope se_scope, DataType dtype_hint); Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, Array assert_shape); Expr ToTupleType(const Type& ty, const std::vector& exprs); diff --git a/src/relay/op/memory/on_device.cc b/src/relay/op/memory/on_device.cc new file mode 100644 index 0000000000000..9541d4122a2f9 --- /dev/null +++ b/src/relay/op/memory/on_device.cc @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * + * \file src/relay/op/memory/on_device.cc + * \brief Helpers for working with the "on_device" 'annotation' call. + */ + +#include "./on_device.h" + +#include +#include +#include +#include +#include +#include + +#include "../../transforms/infer_layout_utils.h" +#include "../type_relations.h" + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(OnDeviceAttrs); + +const Op& OnDeviceOp() { + static const Op& op = Op::Get("on_device"); + return op; +} + +Expr OnDevice(Expr expr, SEScope se_scope, bool is_fixed) { + ICHECK(!se_scope->IsFullyUnconstrained()); + auto attrs = make_object(); + attrs->se_scope = std::move(se_scope); + attrs->is_fixed = is_fixed; + Span span = expr->span; + return Call(OnDeviceOp(), {std::move(expr)}, Attrs(std::move(attrs)), /*type_args=*/{}, + std::move(span)); +} + +TVM_REGISTER_GLOBAL("relay.op.annotation._make.OnDevice").set_body_typed(OnDevice); + +Expr MaybeOnDevice(Expr expr, SEScope se_scope, bool is_fixed) { + if (se_scope->IsFullyUnconstrained()) { + // Nothing to annotate with. 
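+  // (Reviewer note: so MaybeOnDevice(e, SEScope::FullyUnconstrained(), false)
+  // is the identity, while a constrained scope yields
+  // on_device(e, se_scope=..., is_fixed=false) unless one of the special
+  // cases below applies.)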
+    return expr;
+  }
+  if (expr->IsInstance<OpNode>() || expr->IsInstance<ConstructorNode>()) {
+    // These operators are device polymorphic so no annotation is required.
+    return expr;
+  }
+  if (expr->IsInstance<GlobalVarNode>() || expr->IsInstance<VarNode>()) {
+    // The device can be recovered from the binding site of the global or local variable.
+    return expr;
+  }
+  if (expr->IsInstance<FunctionNode>()) {
+    // If a primitive function then it is device polymorphic. Otherwise the device is captured
+    // by the function's "result_se_scope" attribute.
+    return expr;
+  }
+  OnDeviceProps props = GetOnDeviceProps(expr);
+  if (props.body.defined()) {
+    // Don't nest on_devices.
+    // If the inner and outer device types differ then we need to be careful:
+    //  - If the inner on_device is_fixed then it disagrees with the outer.
+    //  - If the outer on_device is_fixed then it implies a hidden device_copy.
+    // Otherwise just use the inner device type and ignore the outer.
+    ICHECK(props.se_scope == se_scope || (!is_fixed && !props.is_fixed));
+    return OnDevice(props.body, se_scope, is_fixed || props.is_fixed);
+  }
+  return OnDevice(expr, std::move(se_scope), is_fixed);
+}
+
+RELAY_REGISTER_OP("on_device")
+    .describe(R"code(Annotate an expression with device type)code" TVM_ADD_FILELINE)
+    .set_num_inputs(1)
+    .add_argument("data", "Tensor", "The input data.")
+    .set_support_level(10)
+    .add_type_rel("Identity", IdentityRel)
+    .set_attrs_type_key("relay.attrs.OnDeviceAttrs")
+    .set_attr<TOpPattern>("TOpPattern", kOpaque)
+    .set_attr<TOpIsStateful>("TOpIsStateful", false)
+    .set_attr<FInferCorrectLayout>("FInferCorrectLayout", ElemwiseArbitraryLayout)
+    .set_attr<TNonComputational>("TNonComputational", true);
+
+OnDeviceProps GetOnDeviceProps(const CallNode* call_node) {
+  if (call_node->op == OnDeviceOp()) {
+    ICHECK_EQ(call_node->args.size(), 1) << "on_device expects one argument";
+    ICHECK(call_node->attrs.defined()) << "on_device requires attributes";
+    const auto* on_device_attrs = call_node->attrs.as<OnDeviceAttrs>();
+    ICHECK(on_device_attrs != nullptr) << "on_device requires OnDeviceAttrs";
+    // Follow nesting:
+    //   on_device(on_device(expr, se_scope=S), se_scope=T) == {expr, S}
+    auto inner = GetOnDeviceProps(call_node->args[0]);
+    if (inner.body.defined()) {
+      return {inner.body, inner.se_scope, on_device_attrs->is_fixed || inner.is_fixed};
+    } else {
+      return {call_node->args[0], on_device_attrs->se_scope, on_device_attrs->is_fixed};
+    }
+  }
+  return {};
+}
+
+OnDeviceProps GetOnDeviceProps(const Expr& expr) {
+  if (const auto* call_node = expr.as<CallNode>()) {
+    return GetOnDeviceProps(call_node);
+  }
+  return {};
+}
+
+Function FunctionOnDevice(Function function, Array<SEScope> param_se_scopes,
+                          SEScope result_se_scope) {
+  return WithAttrs(std::move(function), {{tvm::attr::kParamSEScopes, std::move(param_se_scopes)},
+                                         {tvm::attr::kResultSEScope, std::move(result_se_scope)}});
+}
+
+TVM_REGISTER_GLOBAL("relay.op.annotation._make.FunctionOnDevice").set_body_typed(FunctionOnDevice);
+
+Function MaybeFunctionOnDevice(Function function, Array<SEScope> param_se_scopes,
+                               SEScope result_se_scope) {
+  if (std::all_of(param_se_scopes.begin(), param_se_scopes.end(),
+                  [](const SEScope& se_scope) { return se_scope->IsFullyUnconstrained(); }) &&
+      result_se_scope->IsFullyUnconstrained()) {
+    // Nothing to annotate.
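+    // (Reviewer sketch: for some constrained SEScopes cpu and gpu,
+    //   f = MaybeFunctionOnDevice(f, {cpu, gpu}, gpu);
+    //   GetFunctionParamSEScope(f.get(), 0);  // == cpu
+    //   GetFunctionResultSEScope(f.get());    // == gpu
+    // i.e. the attributes round-trip through the accessors below.)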
+    return function;
+  }
+  return FunctionOnDevice(function, std::move(param_se_scopes), std::move(result_se_scope));
+}
+
+SEScope GetFunctionResultSEScope(const FunctionNode* function_node) {
+  auto opt_se_scope = function_node->GetAttr<SEScope>(tvm::attr::kResultSEScope);
+  return opt_se_scope.value_or(SEScope::FullyUnconstrained());
+}
+
+SEScope GetFunctionParamSEScope(const FunctionNode* function_node, size_t i) {
+  ICHECK_LT(i, function_node->params.size())
+      << "param index " << i << " out of range for function of arity "
+      << function_node->params.size();
+  auto opt_array = function_node->GetAttr<Array<SEScope>>(tvm::attr::kParamSEScopes);
+  if (!opt_array) {
+    // No annotation.
+    return SEScope::FullyUnconstrained();
+  }
+  ICHECK_EQ(opt_array.value().size(), function_node->params.size())
+      << "annotation parameters do not match function arity";
+  return opt_array.value()[i];
+}
+
+}  // namespace relay
+}  // namespace tvm
diff --git a/src/relay/op/memory/on_device.h b/src/relay/op/memory/on_device.h
new file mode 100644
index 0000000000000..a7b6cb7cf52a5
--- /dev/null
+++ b/src/relay/op/memory/on_device.h
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file relay/op/memory/on_device.h
+ * \brief Helpers for working with the "on_device" 'annotation' call.
+ */
+#ifndef TVM_RELAY_OP_MEMORY_ON_DEVICE_H_
+#define TVM_RELAY_OP_MEMORY_ON_DEVICE_H_
+
+#include 
+#include 
+#include 
+#include 
+
+#include 
+#include 
+
+namespace tvm {
+namespace relay {
+
+/*! \brief Returns the "on_device" operator. */
+const Op& OnDeviceOp();
+
+/*!
+ * \brief Wraps \p expr in an "on_device" CallNode for \p se_scope and \p is_fixed.
+ *
+ * See \p OnDeviceAttrs for an overview.
+ */
+Expr OnDevice(Expr expr, SEScope se_scope, bool is_fixed);
+
+/*!
+ * \brief Wraps \p expr in an "on_device" CallNode for \p se_scope and \p is_fixed if the
+ * \p SEScope for \p expr cannot otherwise be recovered by the lexical scoping convention.
+ * This means we will NOT wrap if:
+ *  - \p se_scope is fully unconstrained, which signals there are no device annotations
+ *    already in play.
+ *  - \p expr is an operator or primitive function literal. These are device polymorphic.
+ *  - \p expr is a non-primitive function literal. The device is captured by the
+ *    "result_se_scope" attribute on the function itself.
+ *  - \p expr is a global var. The device is on the attributes of the function the global
+ *    is bound to.
+ *  - \p expr is a local var. The device is tracked by the device aware visitors for us.
+ *  - \p expr is a constructor. These are device polymorphic.
+ *
+ */
+Expr MaybeOnDevice(Expr expr, SEScope se_scope, bool is_fixed);
+
+/*! \brief Result of \p GetOnDeviceProps.
+struct OnDeviceProps {
+  Expr body;  // = null
+  SEScope se_scope = SEScope::FullyUnconstrained();
+  bool is_fixed = false;
+
+  OnDeviceProps() = default;
+
+  OnDeviceProps(Expr body, SEScope se_scope, bool isFixed)
+      : body(std::move(body)), se_scope(std::move(se_scope)), is_fixed(isFixed) {}
+};
+
+/*!
+ * \brief Returns the body expression, \p SEScope, and is_fixed field for \p call_node if it
+ * is an "on_device" CallNode. Otherwise returns the null expression, the unconstrained
+ * \p SEScope, and false.
+ */
+OnDeviceProps GetOnDeviceProps(const CallNode* call_node);
+
+/*!
+ * \brief Returns the body expression, \p SEScope, and is_fixed field for \p expr if it is an
+ * "on_device" CallNode. Otherwise returns the null expression, the unconstrained \p SEScope,
+ * and \p false.
+ */
+OnDeviceProps GetOnDeviceProps(const Expr& expr);
+
+/*!
+ * \brief Returns the body of \p expr if it is an "on_device" annotation, otherwise returns
+ * \p expr directly.
+ */
+inline Expr IgnoreOnDevice(const Expr& expr) {
+  OnDeviceProps props = GetOnDeviceProps(expr);
+  return props.body.defined() ? props.body : expr;
+}
+
+/*!
+ * \brief Returns \p expr as \p NodeType, or null if it is not of that type. Looks through
+ * any "on_device" annotations.
+ */
+template <typename NodeType>
+const NodeType* AsIgnoringOnDevice(const Expr& expr) {
+  const auto* node = expr.as<NodeType>();
+  if (node != nullptr) {
+    return node;
+  }
+  OnDeviceProps props = GetOnDeviceProps(expr);
+  if (!props.body.defined()) {
+    return nullptr;
+  }
+  return props.body.as<NodeType>();
+}
+
+/*!
+ * \brief Returns \p function annotated with "param_se_scopes" and "result_se_scope"
+ * attributes capturing parameter and result \p SEScopes respectively.
+ */
+Function FunctionOnDevice(Function function, Array<SEScope> param_se_scopes, SEScope body_se_scope);
+
+/*!
+ * \brief As for \p FunctionOnDevice, but returns \p function unchanged if all parameters and
+ * result \p SEScopes are unconstrained.
+ */
+Function MaybeFunctionOnDevice(Function function, Array<SEScope> param_se_scopes,
+                               SEScope result_se_scope);
+
+/*!
+ * \brief Returns the \p SEScope for the result of \p function_node, or the unconstrained
+ * \p SEScope if function does not have the "result_se_scope" annotation.
+ */
+SEScope GetFunctionResultSEScope(const FunctionNode* function_node);
+
+/*!
+ * \brief Returns the \p SEScope for the \p i'th parameter of \p function_node, or
+ * the unconstrained \p SEScope if function does not have the "param_se_scopes" annotation.
+ */
+SEScope GetFunctionParamSEScope(const FunctionNode* function_node, size_t i);
+
+}  // namespace relay
+}  // namespace tvm
+
+#endif  // TVM_RELAY_OP_MEMORY_ON_DEVICE_H_
diff --git a/src/relay/transforms/device_aware_visitors.cc b/src/relay/transforms/device_aware_visitors.cc
index 38c3305d31941..e3d5a821c58e4 100644
--- a/src/relay/transforms/device_aware_visitors.cc
+++ b/src/relay/transforms/device_aware_visitors.cc
@@ -38,41 +38,51 @@ LexicalOnDeviceMixin::LexicalOnDeviceMixin(const Optional<IRModule>& maybe_mod)
   if (maybe_mod) {
     for (const auto& pair : maybe_mod.value()->functions) {
       if (const auto* function_node = pair.second.as<FunctionNode>()) {
-        DLDeviceType device_type = GetFunctionResultDeviceType(function_node);
-        if (device_type != kInvalidDeviceType) {
-          global_var_device_types_.emplace(pair.first, device_type);
+        SEScope se_scope = GetFunctionResultSEScope(function_node);
+        if (!se_scope->IsFullyUnconstrained()) {
+          global_var_se_scopes_.emplace(pair.first, se_scope);
         }
       }
     }
   }
 }
 
-DLDeviceType LexicalOnDeviceMixin::GetInScopeDeviceType(const Expr& expr) const {
-  auto props = GetOnDeviceProps(expr);
+SEScope LexicalOnDeviceMixin::GetSEScope(const Expr& expr) const {
+  OnDeviceProps props = GetOnDeviceProps(expr);
   if (props.body.defined() && props.is_fixed) {
-    return props.device_type;
+    return props.se_scope;
   } else if (const auto* var_node = expr.as<VarNode>()) {
     // Lookup variable binding.
-    auto itr = var_device_types_.find(GetRef<Var>(var_node));
-    if (itr != var_device_types_.end()) {
+    auto itr = var_se_scopes_.find(GetRef<Var>(var_node));
+    if (itr != var_se_scopes_.end()) {
       return itr->second;
     }
-    // else: fallthrough to unknown
+    // else: fallthrough to unconstrained
  } else if (const auto* global_var_node = expr.as<GlobalVarNode>()) {
     // Lookup global variable.
-    auto itr = global_var_device_types_.find(GetRef<GlobalVar>(global_var_node));
-    if (itr != global_var_device_types_.end()) {
+    auto itr = global_var_se_scopes_.find(GetRef<GlobalVar>(global_var_node));
+    if (itr != global_var_se_scopes_.end()) {
       return itr->second;
     }
-    // else: fallthrough to unknown
+    // else: fallthrough to unconstrained
+  } else if (const auto* function_node = expr.as<FunctionNode>()) {
+    if (function_node->HasNonzeroAttr(attr::kPrimitive)) {
+      if (!expr_se_scopes_.empty()) {
+        // Use the currently in-scope device type.
+        return expr_se_scopes_.back();
+      }
+      // else: fallthrough to unconstrained
+    } else {
+      return GetFunctionResultSEScope(function_node);
+    }
   } else {
-    if (!expr_device_types_.empty()) {
+    if (!expr_se_scopes_.empty()) {
       // Use the currently in-scope device type.
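       // (Reviewer note: so the overall precedence is: fixed "on_device"
       // annotation, then let-bound var, then global var's function attribute,
       // then function literal (innermost lexical scope if primitive, otherwise
       // its "result_se_scope"), then the innermost enclosing scope, and
       // finally the fully-unconstrained SEScope.)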
- return expr_device_types_.back(); + return expr_se_scopes_.back(); } - // else: fallthrough to unknown + // else: fallthrough to unconstrained } - return kInvalidDeviceType; + return SEScope::FullyUnconstrained(); } void LexicalOnDeviceMixin::EnterFunctionBody() { ++function_nesting_; } @@ -82,34 +92,34 @@ void LexicalOnDeviceMixin::ExitFunctionBody() { --function_nesting_; } -void LexicalOnDeviceMixin::PushDeviceType(DLDeviceType device_type) { - if (device_type == kInvalidDeviceType) { +void LexicalOnDeviceMixin::PushSEScope(const SEScope& se_scope) { + if (se_scope->IsFullyUnconstrained()) { return; } - expr_device_types_.emplace_back(device_type); + expr_se_scopes_.emplace_back(se_scope); } -void LexicalOnDeviceMixin::PopDeviceType() { - if (expr_device_types_.empty()) { +void LexicalOnDeviceMixin::PopSEScope() { + if (expr_se_scopes_.empty()) { return; } - expr_device_types_.pop_back(); + expr_se_scopes_.pop_back(); } -void LexicalOnDeviceMixin::PushBoundVar(Var var, DLDeviceType device_type) { - if (device_type == kInvalidDeviceType) { +void LexicalOnDeviceMixin::PushBoundVar(Var var, const SEScope& se_scope) { + if (se_scope->IsFullyUnconstrained()) { return; } - ICHECK(var_device_types_.find(var) == var_device_types_.end()); - var_device_types_.emplace(std::move(var), device_type); + ICHECK(var_se_scopes_.find(var) == var_se_scopes_.end()); + var_se_scopes_.emplace(std::move(var), se_scope); } void LexicalOnDeviceMixin::PopBoundVar(const Var& var) { - auto itr = var_device_types_.find(var); - if (itr == var_device_types_.end()) { + auto itr = var_se_scopes_.find(var); + if (itr == var_se_scopes_.end()) { return; } - var_device_types_.erase(itr); + var_se_scopes_.erase(itr); } // TODO(mbs): We'd probably have less tedious code duplication if we redefined the memoizing @@ -122,17 +132,17 @@ void DeviceAwareExprVisitor::VisitExpr_(const FunctionNode* function_node) { } else { // Function parameters come into scope. for (size_t i = 0; i < function_node->params.size(); ++i) { - PushBoundVar(function_node->params[i], GetFunctionParamDeviceType(function_node, i)); + PushBoundVar(function_node->params[i], GetFunctionParamSEScope(function_node, i)); } // Entering scope of function body. - PushDeviceType(GetFunctionResultDeviceType(function_node)); + PushSEScope(GetFunctionResultSEScope(function_node)); EnterFunctionBody(); DeviceAwareVisitExpr_(function_node); // Leaving scope of function body. ExitFunctionBody(); - PopDeviceType(); + PopSEScope(); // Function parameters go out of scope. for (size_t i = 0; i < function_node->params.size(); ++i) { PopBoundVar(function_node->params[i]); @@ -147,7 +157,7 @@ void DeviceAwareExprVisitor::VisitExpr_(const LetNode* let_node) { while (const auto* inner_let_node = expr.as()) { // Let-bound var (in pre visited version) goes into scope. // (We'll just assume this is a letrec). - PushBoundVar(inner_let_node->var, GetInScopeDeviceType(inner_let_node->value)); + PushBoundVar(inner_let_node->var, GetSEScope(inner_let_node->value)); PreVisitLetBinding_(inner_let_node->var, inner_let_node->value); bindings.emplace_back(inner_let_node); expr = inner_let_node->body; @@ -164,13 +174,13 @@ void DeviceAwareExprVisitor::VisitExpr_(const LetNode* let_node) { } void DeviceAwareExprVisitor::VisitExpr_(const CallNode* call_node) { - auto props = GetOnDeviceProps(call_node); + OnDeviceProps props = GetOnDeviceProps(call_node); if (props.body.defined() && props.is_fixed) { // Entering lexical scope of fixed "on_device" call. 
- PushDeviceType(props.device_type); + PushSEScope(props.se_scope); VisitExpr(props.body); // Leaving lexical scope of "on_device" call. - PopDeviceType(); + PopSEScope(); } else { DeviceAwareVisitExpr_(call_node); } @@ -208,17 +218,17 @@ Expr DeviceAwareExprMutator::VisitExpr_(const FunctionNode* function_node) { } else { // Function parameters come into scope. for (size_t i = 0; i < function_node->params.size(); ++i) { - PushBoundVar(function_node->params[i], GetFunctionParamDeviceType(function_node, i)); + PushBoundVar(function_node->params[i], GetFunctionParamSEScope(function_node, i)); } // Entering scope of function body. - PushDeviceType(GetFunctionResultDeviceType(function_node)); + PushSEScope(GetFunctionResultSEScope(function_node)); EnterFunctionBody(); Expr result = DeviceAwareVisitExpr_(function_node); // Leaving scope of function body. ExitFunctionBody(); - PopDeviceType(); + PopSEScope(); // Function parameters go out of scope. for (size_t i = 0; i < function_node->params.size(); ++i) { PopBoundVar(function_node->params[i]); @@ -235,7 +245,7 @@ Expr DeviceAwareExprMutator::VisitExpr_(const LetNode* let_node) { while (const auto* inner_let_node = expr.as()) { // Let-bound var (in pre visited version) goes into scope. // (We'll just assume this is a letrec.) - PushBoundVar(inner_let_node->var, GetInScopeDeviceType(inner_let_node->value)); + PushBoundVar(inner_let_node->var, GetSEScope(inner_let_node->value)); std::pair pair = PreVisitLetBinding_(inner_let_node->var, inner_let_node->value); bindings.emplace_back(pair.first, pair.second, inner_let_node->span, inner_let_node); expr = inner_let_node->body; @@ -255,14 +265,14 @@ Expr DeviceAwareExprMutator::VisitExpr_(const LetNode* let_node) { } Expr DeviceAwareExprMutator::VisitExpr_(const CallNode* call_node) { - auto props = GetOnDeviceProps(call_node); + OnDeviceProps props = GetOnDeviceProps(call_node); if (props.body.defined() && props.is_fixed) { // Entering lexical scope of fixed "on_device" call. - PushDeviceType(props.device_type); + PushSEScope(props.se_scope); Expr expr = VisitExpr(props.body); // Leaving lexical scope of "on_device" call. - PopDeviceType(); - return MaybeOnDevice(expr, props.device_type, props.is_fixed); + PopSEScope(); + return MaybeOnDevice(expr, props.se_scope, props.is_fixed); } else { return DeviceAwareVisitExpr_(call_node); } diff --git a/src/relay/transforms/device_aware_visitors.h b/src/relay/transforms/device_aware_visitors.h index 3f4c5c24481e7..8cdf0db74ebd3 100644 --- a/src/relay/transforms/device_aware_visitors.h +++ b/src/relay/transforms/device_aware_visitors.h @@ -35,13 +35,14 @@ #include #include "../op/annotation/annotation.h" +#include "../op/memory/on_device.h" namespace tvm { namespace relay { namespace transform { /*! - * \brief Helper class for expression transformers which need to keep track of the device + * \brief Helper class for expression transformers which need to keep track of the \p SEScope * holding the results of expressions. This is recovered from function attributes and "on_device" * CallNodes added by the PlanDevices pass. * @@ -52,11 +53,11 @@ class LexicalOnDeviceMixin { explicit LexicalOnDeviceMixin(const Optional& maybe_mod); /*! - * \brief Returns the device type on which the result of \p expr should/will be stored, assuming - * Push/Pop DeviceType/BoundVar have been correctly called. May return \p kInvalidDeviceType if - * the device planning pass has not been run. 
+ * \brief Returns the \p SEScope on which the result of \p expr should/will be stored, assuming + * {Push,Pop}{SEScope,BoundVar} have been correctly called. May return the unconstrained + * \p SEScope if the device planning pass has not been run. */ - DLDeviceType GetInScopeDeviceType(const Expr& expr) const; + SEScope GetSEScope(const Expr& expr) const; /*! \brief Indicate a function body is being entered. */ void EnterFunctionBody(); @@ -64,19 +65,19 @@ class LexicalOnDeviceMixin { /*! \brief Indicate a function body has been processed. */ void ExitFunctionBody(); - /*! \brief Push a device type onto the lexical device stack. Ignore if \p kInvalidDeviceType. */ - void PushDeviceType(DLDeviceType device_type); + /*! \brief Push an \p SEScope onto the lexical SEScope stack. Ignore if unconstrained. */ + void PushSEScope(const SEScope& se_scope); - /*! \brief Pop a device type from the lexical device stack. Ignore if stack is empty. */ - void PopDeviceType(); + /*! \brief Pop an \p SEScope from the lexical SEScope stack. Ignore if stack is empty. */ + void PopSEScope(); - /*! \brief Remember that \p var will be stored on \p device_type. Ignore if \p kInvalidDeviceType. + /*! \brief Remember that \p var will be stored at \p se_scope. Ignore if unconstrained. * * CAUTION: Despite the name we don't support re-entering the same function body. */ - void PushBoundVar(Var var, DLDeviceType device_type); + void PushBoundVar(Var var, const SEScope& se_scope); - /*! \brief Remove the binding for \p var to it's device type. Ignore if var is not bound. */ + /*! \brief Remove the binding for \p var to its \p SEScope. Ignore if var is not bound. */ void PopBoundVar(const Var& var); /*! @@ -92,36 +93,36 @@ class LexicalOnDeviceMixin { int function_nesting_ = 0; /*! - * \brief The stack of lexically enclosing "on_device" devices types, from outermost to innermost. - * When visiting an expression other than a variable we can assume the expression's result is to - * be stored on device_type_.back(). + * \brief The stack of lexically enclosing "on_device" \p SEScopes, from outermost to + * innermost. When visiting an expression other than a variable we can assume the expression's + * result is to be stored on \p expr_se_scopes.back(). */ - std::vector expr_device_types_; + std::vector expr_se_scopes_; /*! - * \brief A map from in-scope local variables to their device types. We may assume the variable is - * only ever bound to a value stored on this device at runtime. + * \brief A map from in-scope local variables to their \p SEScopes. We may assume the variable is + * only ever bound to a value stored on this \p SEScope at runtime. * * Note: We're playing it safe and keying by object refs here just in case the Relay expression * being rewritten has no module or other global to keep it alive. */ - std::unordered_map - var_device_types_; + std::unordered_map var_se_scopes_; /*! - * \brief A map from global variables to their device types, ie the "result_device_type" of the - * function they are bound to in the module we are working on. We calculate this explicitly so - * that we don't neeed to hold on to any module, which is often in the process of being rewritten. + * \brief A map from global variables to their \p SEScopes, ie the "result_se_scope" of the + * function they are bound to in the module we are working on. We calculate and store this + * explicitly so that we don't need to hold on to any module, which is often in the process of + * being rewritten. 
*/ - std::unordered_map - global_var_device_types_; + std::unordered_map + global_var_se_scopes_; }; template class DeviceAwareExprFunctor; /*! - * \brief ExprFunctor which tracks devices. We only support 'visitor' style implementation + * \brief ExprFunctor which tracks \p SEScopes. We only support 'visitor' style implementation * with no additional arguments, thus this is equivalent to \p DeviceAwareExprVisitor without * any memoization. */ @@ -142,17 +143,17 @@ class DeviceAwareExprFunctor : public ExprFunctorparams.size(); ++i) { - PushBoundVar(function_node->params[i], GetFunctionParamDeviceType(function_node, i)); + PushBoundVar(function_node->params[i], GetFunctionParamSEScope(function_node, i)); } // Entering scope of function body. - PushDeviceType(GetFunctionResultDeviceType(function_node)); + PushSEScope(GetFunctionResultSEScope(function_node)); EnterFunctionBody(); DeviceAwareVisitExpr_(function_node); // Leaving scope of function body. ExitFunctionBody(); - PopDeviceType(); + PopSEScope(); // Function parameters go out of scope. for (size_t i = 0; i < function_node->params.size(); ++i) { PopBoundVar(function_node->params[i]); @@ -167,7 +168,7 @@ class DeviceAwareExprFunctor : public ExprFunctor()) { // Let-bound var (in pre visited version) goes into scope. // (We'll just assume this is a letrec.) - PushBoundVar(inner_let_node->var, GetInScopeDeviceType(inner_let_node->value)); + PushBoundVar(inner_let_node->var, GetSEScope(inner_let_node->value)); PreVisitLetBinding_(inner_let_node->var, inner_let_node->value); bindings.emplace_back(inner_let_node); expr = inner_let_node->body; @@ -185,20 +186,20 @@ class DeviceAwareExprFunctor : public ExprFunctor : public ExprFunctor& maybe_mod) @@ -255,7 +256,7 @@ class DeviceAwareExprVisitor : public ExprVisitor, public LexicalOnDeviceMixin { void VisitExpr_(const CallNode* call_node) final; /*! - * \brief These are as for VisitExpr_. Devices for expressions and function parameters will be + * \brief These are as for VisitExpr_. \p SEScopes for expressions and function parameters will be * tracked automatically. Default implementation defers to ExprMutator::VisitExpr_. For * functions the function_nesting count will already include that of \p function_node. */ @@ -269,7 +270,7 @@ class DeviceAwareExprVisitor : public ExprVisitor, public LexicalOnDeviceMixin { virtual void PreVisitLetBlock_(const LetNode* let_node); /*! - * \brief Visit a let-bound expression before the let body has been visited. Devices for the + * \brief Visit a let-bound expression before the let body has been visited. \p SEScopes for the * let-bound variable will be tracked automatically. Default implementation just visits var and * value. */ @@ -288,7 +289,7 @@ class DeviceAwareExprVisitor : public ExprVisitor, public LexicalOnDeviceMixin { virtual void PostVisitLetBlock_(const LetNode* let_node); }; -/*! \brief ExprMutator which tracks devices. */ +/*! \brief ExprMutator which tracks \p SEScopes. */ class DeviceAwareExprMutator : public ExprMutator, public LexicalOnDeviceMixin { public: explicit DeviceAwareExprMutator(const Optional& maybe_mod) @@ -299,7 +300,7 @@ class DeviceAwareExprMutator : public ExprMutator, public LexicalOnDeviceMixin { Expr VisitExpr_(const CallNode* call_node) final; /*! - * \brief These are as for VisitExpr_. Devices for expressions and function parameters will be + * \brief These are as for VisitExpr_. \p SEScopes for expressions and function parameters will be * tracked automatically. 
Default implementation defers to ExprMutator::VisitExpr_. For * functions the function_nesting count will already include that of \p function_node. */ @@ -313,7 +314,7 @@ class DeviceAwareExprMutator : public ExprMutator, public LexicalOnDeviceMixin { virtual void PreVisitLetBlock_(const LetNode* let_node); /*! - * \brief Visit a let-bound expression before the let body has been visited. Devices for the + * \brief Visit a let-bound expression before the let body has been visited. \p SEScopes for the * let-bound variable will be tracked automatically. Default implementation just visits var and * value. */ diff --git a/src/relay/transforms/device_domains.cc b/src/relay/transforms/device_domains.cc index 15784856edbf5..305ee3dddbc48 100644 --- a/src/relay/transforms/device_domains.cc +++ b/src/relay/transforms/device_domains.cc @@ -28,6 +28,7 @@ #include "../op/annotation/annotation.h" #include "../op/memory/device_copy.h" +#include "../op/memory/on_device.h" namespace tvm { namespace relay { @@ -35,11 +36,6 @@ namespace transform { namespace { -// Ye olde boost hash mixer. -constexpr size_t mix(size_t h1, size_t h2) { - return h1 ^ (h1 + 0x9e3779b9 + (h2 << 6) + (h2 >> 2)); -} - /*! * \brief As for GetDeviceCopyProps, but for the call to the lowered TIR primitives rather * than the original "device_copy" operator. @@ -51,77 +47,57 @@ DeviceCopyProps GetPrimitiveDeviceCopyProps(const CallNode* call_node) { if (tir_call_attrs == nullptr) { return {}; } - if (tir_call_attrs->metadata.count("source_device") != 1 || - tir_call_attrs->metadata.count("dst_device") != 1) { + if (tir_call_attrs->metadata.count("src_se_scope") != 1 || + tir_call_attrs->metadata.count("dst_se_scope") != 1) { return {}; } ICHECK_EQ(call_node->args.size(), 1) << "device_copy is of arity 1"; - return { - call_node->args[0], - static_cast( - Downcast(tir_call_attrs->metadata["source_device"])->value), - static_cast(Downcast(tir_call_attrs->metadata["dst_device"])->value)}; + return {call_node->args[0], Downcast(tir_call_attrs->metadata["src_se_scope"]), + Downcast(tir_call_attrs->metadata["dst_se_scope"])}; } } // namespace -// The following hash and equality helpers give each free first-order domain pointer its own -// distinct identity. - -size_t DeviceDomainHash::operator()(const DeviceDomainPtr& domain) const { - if (domain->is_free()) { - // Give each free first-order domain its own identity. - return static_cast(reinterpret_cast(domain.get())); - } else { - size_t h = domain->args_and_result_.size(); - h = mix(h, std::hash()(static_cast(domain->device_type_))); - for (const auto& sub_domain_ptr : domain->args_and_result_) { - h = mix(h, DeviceDomainHash()(sub_domain_ptr)); - } - return h; - } +DeviceDomains::DeviceDomains(CompilationConfig config) : config_(std::move(config)) { + host_domain_ = MakeFirstOrderDomain(config_->host_se_scope); } -bool DeviceDomainEqual::operator()(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) const { - if (lhs->args_and_result_.size() != rhs->args_and_result_.size()) { - // Mismatched arities are never equal. - // (Though we'll never ask to do such a comparison explicitly, the hash map - // may do so implicitly due to hash collisions.) - return false; - } - if (lhs->is_free() && rhs->is_free()) { - // Compare first-order free domains by their address. - return lhs.get() == rhs.get(); - } - if (lhs->args_and_result_.empty()) { - // Compare first-order domains by their device type -- free vs bound will compare as false. 
-      return lhs->device_type_ == rhs->device_type_;
-    } else {
-      // Compare higher-order domains pointwise.
-      for (size_t i = 0; i < lhs->args_and_result_.size(); ++i) {
-        if (!(*this)(lhs->args_and_result_[i], rhs->args_and_result_[i])) {
-          return false;
-        }
+DeviceDomainPtr DeviceDomains::MakeFirstOrderDomain(const SEScope& se_scope) {
+  if (se_scope->IsFullyConstrained()) {
+    auto itr = fully_constrained_se_scope_to_domain_.find(se_scope);
+    if (itr != fully_constrained_se_scope_to_domain_.end()) {
+      return itr->second;
     }
-    return true;
+    DeviceDomainPtr domain = std::make_shared<DeviceDomain>(se_scope);
+    fully_constrained_se_scope_to_domain_.emplace(se_scope, domain);
+    return domain;
+  } else {
+    return std::make_shared<DeviceDomain>(se_scope);
   }
 }
 
-/* static */
-DeviceDomainPtr DeviceDomains::MakeDomain(const Type& type, DLDeviceType device_type) {
+DeviceDomainPtr DeviceDomains::MakeDomain(const Type& type, const SEScope& se_scope) {
   if (const auto* func_type_node = type.as<FuncTypeNode>()) {
     std::vector<DeviceDomainPtr> args_and_result;
     args_and_result.reserve(func_type_node->arg_types.size() + 1);
     for (const auto& arg_type : func_type_node->arg_types) {
-      args_and_result.emplace_back(MakeDomain(arg_type, kInvalidDeviceType));
+      args_and_result.emplace_back(MakeDomain(arg_type, SEScope::FullyUnconstrained()));
     }
-    args_and_result.emplace_back(MakeDomain(func_type_node->ret_type, device_type));
+    args_and_result.emplace_back(MakeDomain(func_type_node->ret_type, se_scope));
     return std::make_shared<DeviceDomain>(std::move(args_and_result));
   } else {
-    return std::make_shared<DeviceDomain>(device_type);
+    return MakeFirstOrderDomain(se_scope);
   }
 }
 
+DeviceDomainPtr DeviceDomains::ForSEScope(const Type& type, const SEScope& non_canonical_se_scope) {
+  // Generally se_scope will have come from an annotation so resolve it to ensure we have
+  // its canonical representation.
+  SEScope se_scope = config_->CanonicalSEScope(non_canonical_se_scope);
+  ICHECK(!se_scope->IsFullyUnconstrained());
+  return MakeDomain(type, se_scope);
+}
+
 DeviceDomainPtr DeviceDomains::Lookup(DeviceDomainPtr domain) {
   DeviceDomainPtr root = domain;
   while (true) {
@@ -144,56 +120,82 @@ DeviceDomainPtr DeviceDomains::Lookup(DeviceDomainPtr domain) {
   return root;
 }
 
-DeviceDomainPtr DeviceDomains::Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) {
-  // TODO(mbs): Proper diagnostics.
+DeviceDomainPtr DeviceDomains::JoinOrNull(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) {
+  if (lhs == rhs) {
+    return lhs;
+  }
   ICHECK_EQ(lhs->args_and_result_.size(), rhs->args_and_result_.size())
       << "Device domains:" << std::endl
       << ToString(lhs) << std::endl
      << "and" << std::endl
      << ToString(rhs) << std::endl
      << "do not have the same kind and can't be unified.";
-  if (rhs->is_free()) {
-    return lhs;
-  } else if (lhs->is_free()) {
-    return rhs;
-  } else if (lhs->args_and_result_.empty()) {
-    // Must have consistent device types for first order domains.
-    if (lhs->device_type_ != rhs->device_type_) {
-      // TODO(mbs): Proper diagnostics.
-      std::ostringstream os;
-      os << "Inconsistent device types " << lhs->device_type_ << " and " << rhs->device_type_;
-      throw Error(os.str());
+  if (lhs->args_and_result_.empty()) {
+    // Directly compare first-order.
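+    // (Reviewer note: the first-order case behaves like a meet on constraints:
+    // joining a constrained scope with a fully-unconstrained one keeps the
+    // constrained side, while two incompatibly constrained scopes make
+    // SEScope::Join below return no value and this function return null.)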
+ if (rhs->se_scope_->IsFullyUnconstrained()) { + return lhs; } - return lhs; + if (lhs->se_scope_->IsFullyUnconstrained()) { + return rhs; + } + Optional joined_se_scope = SEScope::Join(lhs->se_scope_, rhs->se_scope_); + if (!joined_se_scope) { + return nullptr; + } + return MakeFirstOrderDomain(config_->CanonicalSEScope(joined_se_scope.value())); } else { // Recurse for higher-order. std::vector args_and_result; args_and_result.reserve(lhs->args_and_result_.size()); for (size_t i = 0; i < lhs->args_and_result_.size(); ++i) { - args_and_result.emplace_back(Unify(lhs->args_and_result_[i], rhs->args_and_result_[i])); + DeviceDomainPtr joined_domain = + UnifyOrNull(lhs->args_and_result_[i], rhs->args_and_result_[i]); + if (joined_domain == nullptr) { + return nullptr; + } + args_and_result.emplace_back(std::move(joined_domain)); } - return MakeDomain(std::move(args_and_result)); + return MakeHigherOrderDomain(std::move(args_and_result)); } } -DeviceDomainPtr DeviceDomains::Unify(DeviceDomainPtr lhs, DeviceDomainPtr rhs) { +DeviceDomainPtr DeviceDomains::UnifyOrNull(DeviceDomainPtr lhs, DeviceDomainPtr rhs) { + ICHECK_NOTNULL(lhs); + ICHECK_NOTNULL(rhs); lhs = Lookup(lhs); rhs = Lookup(rhs); - auto joined_domain = Join(lhs, rhs); - if (!DeviceDomainEqual()(lhs, joined_domain)) { + DeviceDomainPtr joined_domain = JoinOrNull(lhs, rhs); + if (joined_domain == nullptr) { + return nullptr; + } + if (lhs != joined_domain) { domain_to_equiv_.emplace(lhs, joined_domain); } - if (!DeviceDomainEqual()(rhs, joined_domain)) { + if (rhs != joined_domain) { domain_to_equiv_.emplace(rhs, joined_domain); } return joined_domain; } -void DeviceDomains::UnifyCollapsed(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) { - if (!lhs->is_higher_order() && rhs->is_higher_order()) { - Collapse(lhs, rhs); +bool DeviceDomains::CollapseOrFalse(const DeviceDomainPtr& first_order_domain, + const DeviceDomainPtr& higher_order_domain) { + ICHECK(!first_order_domain->is_higher_order()); + ICHECK(higher_order_domain->is_higher_order()); + for (size_t i = 0; i < higher_order_domain->function_arity(); ++i) { + if (UnifyOrNull(higher_order_domain->function_param(i), first_order_domain) == nullptr) { + return false; + } + } + return UnifyOrNull(higher_order_domain->function_result(), first_order_domain) != nullptr; +} + +bool DeviceDomains::UnifyCollapsedOrFalse(const DeviceDomainPtr& lhs_first_order, + const DeviceDomainPtr& rhs_maybe_higher_order) { + ICHECK(!lhs_first_order->is_higher_order()); + if (rhs_maybe_higher_order->is_higher_order()) { + return CollapseOrFalse(lhs_first_order, rhs_maybe_higher_order); } else { - Unify(lhs, rhs); + return UnifyOrNull(lhs_first_order, rhs_maybe_higher_order) != nullptr; } } @@ -215,49 +217,49 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { } std::vector args_and_result; - auto on_device_props = GetOnDeviceProps(call.get()); - auto device_copy_props = GetDeviceCopyProps(call.get()); + OnDeviceProps on_device_props = GetOnDeviceProps(call.get()); + DeviceCopyProps device_copy_props = GetDeviceCopyProps(call.get()); if (!device_copy_props.body.defined()) { + // Special case for the TIR-ified version of "device_copy". device_copy_props = GetPrimitiveDeviceCopyProps(call.get()); } if (on_device_props.body.defined()) { - // on_device(expr, device_type=, is_fixed=false) + // on_device(expr, se_scope=, is_fixed=false) // on_device : fn():?x? 
// - // on_device(expr, device_type=, is_fixed=true) + // on_device(expr, se_scope=, is_fixed=true) // on_device: fn(): args_and_result.emplace_back( - ForDeviceType(on_device_props.body->checked_type(), on_device_props.device_type)); + ForSEScope(on_device_props.body->checked_type(), on_device_props.se_scope)); if (on_device_props.is_fixed) { args_and_result.emplace_back(args_and_result.front()); } else { args_and_result.emplace_back(Free(on_device_props.body->checked_type())); } } else if (device_copy_props.body.defined()) { - // device_copy(expr, src_dev_type=, dst_dev_type=) + // device_copy(expr, src_se_scope=, dst_se_scope=) // device_copy: fn(): args_and_result.emplace_back( - ForDeviceType(device_copy_props.body->checked_type(), device_copy_props.src_dev_type)); + ForSEScope(device_copy_props.body->checked_type(), device_copy_props.src_se_scope)); args_and_result.emplace_back( - ForDeviceType(device_copy_props.body->checked_type(), device_copy_props.dst_dev_type)); + ForSEScope(device_copy_props.body->checked_type(), device_copy_props.dst_se_scope)); } else if (call->op == alloc_storage_op) { ICHECK_EQ(call->args.size(), 2U); - // alloc_storage(size, alignment, device_type=) + // alloc_storage(size, alignment, se_scope=) // alloc_storage: fn(, ): const auto* attrs = call->attrs.as(); - args_and_result.emplace_back(cpu_domain_); - args_and_result.emplace_back(cpu_domain_); - args_and_result.emplace_back( - ForDeviceType(call->checked_type(), static_cast(attrs->device_type))); + args_and_result.emplace_back(host_domain_); + args_and_result.emplace_back(host_domain_); + args_and_result.emplace_back(ForSEScope(call->checked_type(), attrs->se_scope)); } else if (call->op == alloc_tensor_op) { ICHECK_EQ(call->args.size(), 3U); // alloc_tensor(storage, offset, shape) // alloc_tensor: fn(?x?, , ):?x? auto free_domain = Free(call->checked_type()); args_and_result.emplace_back(free_domain); - args_and_result.emplace_back(cpu_domain_); - args_and_result.emplace_back(cpu_domain_); + args_and_result.emplace_back(host_domain_); + args_and_result.emplace_back(host_domain_); args_and_result.emplace_back(free_domain); } else if (call->op == shape_func_op) { ICHECK_EQ(call->args.size(), 3U); @@ -267,15 +269,15 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { args_and_result.emplace_back(Free(call->args[0]->checked_type())); // TODO(mbs): I think this should be on the cpu only when is_input = [false], but // what do we do when we have multiple arguments with different is_input values? - args_and_result.emplace_back(cpu_domain_); - args_and_result.emplace_back(cpu_domain_); - args_and_result.emplace_back(cpu_domain_); + args_and_result.emplace_back(host_domain_); + args_and_result.emplace_back(host_domain_); + args_and_result.emplace_back(host_domain_); } else if (call->op == shape_of_op) { ICHECK_EQ(call->args.size(), 1U); // shape_of(tensor) // shape_of: fn(?x?): args_and_result.emplace_back(Free(call->args[0]->checked_type())); - args_and_result.emplace_back(cpu_domain_); + args_and_result.emplace_back(host_domain_); } else if (call->op == invoke_tvm_op) { ICHECK_EQ(call->args.size(), 3U); // invoke_tvm_op(op, inputs, outputs) @@ -292,13 +294,13 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { // reshape_tensor: fn(?x?, ):?x? 
auto free_domain = Free(call->checked_type()); args_and_result.emplace_back(free_domain); - args_and_result.emplace_back(cpu_domain_); + args_and_result.emplace_back(host_domain_); args_and_result.emplace_back(free_domain); } else if (call->op->IsInstance()) { // (arg1, ..., argn) // : fn(?x?, ..., ?x?):?x? // (all args and result must be first-order). - auto free_domain = Free(arb_); + auto free_domain = MakeFirstOrderDomain(SEScope::FullyUnconstrained()); for (size_t i = 0; i < call->args.size(); ++i) { args_and_result.emplace_back(free_domain); } @@ -314,8 +316,9 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { ICHECK_EQ(func_type_node->arg_types.size(), call->args.size()); auto result_domain = Free(func_type_node->ret_type); // first-order for (const auto& arg_type : func_type_node->arg_types) { - auto param_domain = Free(arg_type); // possibly higher-order - UnifyCollapsed(result_domain, param_domain); // collapse if required + auto param_domain = Free(arg_type); // possibly higher-order + bool success = UnifyCollapsedOrFalse(result_domain, param_domain); // collapse if required + ICHECK(success); args_and_result.emplace_back(param_domain); } args_and_result.emplace_back(result_domain); @@ -323,7 +326,7 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { // Defer to normal case where op can be an arbitrary expression. return DomainFor(call->op); } - auto domain = MakeDomain(std::move(args_and_result)); + auto domain = MakeHigherOrderDomain(std::move(args_and_result)); call_to_callee_domain_.emplace(call.get(), domain); return domain; } @@ -331,111 +334,104 @@ DeviceDomainPtr DeviceDomains::DomainForCallee(const Call& call) { void DeviceDomains::UnifyExprExact(const Expr& lhs, const Expr& rhs) { auto lhs_domain = DomainFor(lhs); auto rhs_domain = DomainFor(rhs); - try { - Unify(lhs_domain, rhs_domain); - } catch (const Error& e) { + if (UnifyOrNull(lhs_domain, rhs_domain) == nullptr) { // TODO(mbs): Proper diagnostics. - LOG(FATAL) << "Incompatible devices for expressions:" << std::endl + LOG(FATAL) << "Incompatible SEScopes for expressions:" << std::endl << PrettyPrint(lhs) << std::endl - << "with device:" << std::endl + << "with scope:" << std::endl << ToString(lhs_domain) << "and:" << std::endl << PrettyPrint(rhs) << std::endl - << "with device:" << std::endl - << ToString(rhs_domain) << std::endl - << e.what(); + << "with scope:" << std::endl + << ToString(rhs_domain); } } void DeviceDomains::UnifyExprExact(const Expr& expr, const DeviceDomainPtr& expected_domain) { auto actual_domain = DomainFor(expr); - try { - Unify(actual_domain, expected_domain); - } catch (const Error& e) { + if (UnifyOrNull(actual_domain, expected_domain) == nullptr) { // TODO(mbs): Proper diagnostics. 
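     // (Reviewer note: the old Error-throwing Join/Unify has become the
     // null-returning JoinOrNull/UnifyOrNull, so each caller, as here, must
     // test for nullptr and emit its own diagnostic.)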
- LOG(FATAL) << "Incompatible devices for expression:" << std::endl + LOG(FATAL) << "Incompatible SEScopes for expression:" << std::endl << PrettyPrint(expr) << std::endl - << "with actual device:" << std::endl + << "with actual scope:" << std::endl << ToString(actual_domain) << std::endl - << "and expected device:" << std::endl - << ToString(expected_domain) << std::endl - << e.what(); + << "and expected scope:" << std::endl + << ToString(expected_domain); } } -void DeviceDomains::UnifyExprCollapsed(const Expr& expr, const DeviceDomainPtr& expected_domain) { - auto actual_domain = DomainFor(expr); - try { - UnifyCollapsed(actual_domain, expected_domain); - } catch (const Error& e) { +void DeviceDomains::UnifyExprCollapsed(const Expr& expr_first_order, + const DeviceDomainPtr& expected_domain_maybe_higher_order) { + auto actual_domain_first_order = DomainFor(expr_first_order); + if (!UnifyCollapsedOrFalse(actual_domain_first_order, expected_domain_maybe_higher_order)) { // TODO(mbs): Proper diagnostics. - LOG(FATAL) << "Incompatible devices for expression:" << std::endl - << PrettyPrint(expr) << std::endl - << "with actual device:" << std::endl - << ToString(actual_domain) << std::endl - << "and expected device:" << std::endl - << ToString(expected_domain) << std::endl - << e.what(); + LOG(FATAL) << "Incompatible SEScopes for expression:" << std::endl + << PrettyPrint(expr_first_order) << std::endl + << "with actual scope:" << std::endl + << ToString(actual_domain_first_order) << std::endl + << "and expected scope:" << std::endl + << ToString(expected_domain_maybe_higher_order); } } -bool DeviceDomains::AnyFree(DeviceDomainPtr domain) { +bool DeviceDomains::IsFullyConstrained(DeviceDomainPtr domain) { domain = Lookup(domain); - if (domain->is_free()) { - return true; - } - for (const auto& sub_domain : domain->args_and_result_) { - if (AnyFree(sub_domain)) { - return true; - } - } - return false; -} - -void DeviceDomains::Collapse(const DeviceDomainPtr& first_order_domain, - const DeviceDomainPtr& higher_order_domain) { - for (size_t i = 0; i < higher_order_domain->function_arity(); ++i) { - Unify(higher_order_domain->function_param(i), first_order_domain); + if (domain->args_and_result_.empty()) { + // First-order. + return domain->se_scope_->IsFullyConstrained(); + } else { + // Higher-order. + return std::all_of( + domain->args_and_result_.begin(), domain->args_and_result_.end(), + [this](const DeviceDomainPtr& sub_domain) { return IsFullyConstrained(sub_domain); }); } - Unify(higher_order_domain->function_result(), first_order_domain); } -void DeviceDomains::SetDefault(DeviceDomainPtr domain, DLDeviceType default_device_type) { - ICHECK_NE(default_device_type, kInvalidDeviceType); +void DeviceDomains::SetDefault(DeviceDomainPtr domain, const SEScope& default_se_scope) { + ICHECK(!default_se_scope->IsFullyUnconstrained()); domain = Lookup(domain); - if (domain->is_free()) { - // Will never throw since lhs is free. 
- Unify(domain, std::make_shared(default_device_type)); - } else if (!domain->args_and_result_.empty()) { + if (domain->args_and_result_.empty()) { + DeviceDomainPtr defaulted_domain_ptr = + UnifyOrNull(domain, MakeFirstOrderDomain(config_->CanonicalSEScope( + SEScope::Default(domain->se_scope_, default_se_scope)))); + ICHECK_NOTNULL(defaulted_domain_ptr); + } else { for (const auto& sub_domain : domain->args_and_result_) { - SetDefault(sub_domain, default_device_type); + SetDefault(sub_domain, default_se_scope); } } } -void DeviceDomains::SetResultDefaultThenParams(const DeviceDomainPtr& domain, - DLDeviceType default_device_type) { - if (!domain->is_higher_order()) { - SetDefault(domain, default_device_type); - return; +void DeviceDomains::SetResultDefaultThenParams(const DeviceDomainPtr& domain_maybe_higher_order, + const SEScope& default_se_scope) { + if (domain_maybe_higher_order->args_and_result_.empty()) { + SetDefault(domain_maybe_higher_order, default_se_scope); + } else { + // First set default for result domain. + SetDefault(ResultDomain(domain_maybe_higher_order), default_se_scope); + // Then use current result domain as default for everything else. + SetDefault(domain_maybe_higher_order, ResultSEScope(domain_maybe_higher_order)); } - DLDeviceType result_device_type = ResultDeviceType(domain); - if (result_device_type == kInvalidDeviceType) { - // If the function result device is still free use the given default. - result_device_type = default_device_type; +} + +DeviceDomainPtr DeviceDomains::ResultDomain(DeviceDomainPtr domain) { + domain = Lookup(domain); + while (!domain->args_and_result_.empty()) { + domain = Lookup(domain->args_and_result_.back()); } - // Default any remaining free parameters to the function result device. - SetDefault(domain, result_device_type); + return domain; } std::string DeviceDomains::ToString(DeviceDomainPtr domain) { domain = Lookup(domain); std::ostringstream os; - if (domain->is_free()) { - // first-order free - os << "?" << static_cast(reinterpret_cast(domain.get())) << "?"; - } else if (domain->args_and_result_.empty()) { - // first-order bound - os << "<" << domain->device_type_ << ">"; + if (domain->args_and_result_.empty()) { + // First-order. + if (!domain->se_scope_->IsFullyConstrained()) { + os << "?" << static_cast(reinterpret_cast(domain.get())) << "?"; + } + if (!domain->se_scope_->IsFullyUnconstrained()) { + os << domain->se_scope_; + } } else { // higher-order os << "fn("; @@ -469,14 +465,6 @@ std::string DeviceDomains::ToString() { return os.str(); } -DeviceDomainPtr DeviceDomains::ResultDomain(DeviceDomainPtr domain) { - domain = Lookup(domain); - while (!domain->args_and_result_.empty()) { - domain = Lookup(domain->args_and_result_.back()); - } - return domain; -} - } // namespace transform } // namespace relay } // namespace tvm diff --git a/src/relay/transforms/device_domains.h b/src/relay/transforms/device_domains.h index a29370a0e8077..f3f31e790983b 100644 --- a/src/relay/transforms/device_domains.h +++ b/src/relay/transforms/device_domains.h @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include @@ -42,13 +44,14 @@ namespace transform { class DeviceDomain; using DeviceDomainPtr = std::shared_ptr; +class DeviceDomains; /*! * \brief Represents the domain over which we collect equality constraints. * * \code * D ::= ?x? 
-- first order, free
- *       | <device_type>       -- first order, bound
+ *       | <se_scope>          -- first order, bound to specific device and memory scope
 *       | fn(D1, ..., Dn):Dr  -- higher order
 * \endcode
 *
@@ -56,44 +59,46 @@ using DeviceDomainPtr = std::shared_ptr<DeviceDomain>;
 * a notion of the 'result domain' of a domain:
 * \code
 *   result_domain(?x?) = ?x?
- *   result_domain(<device_type>) = <device_type>
+ *   result_domain(<se_scope>) = <se_scope>
 *   result_domain(fn(D1, ..., Dn):Dr) = result_domain(Dr)
 * \endcode
 */
class DeviceDomain {
 public:
  /*!
-  * \brief Constructs a first-order domain of \p device_type, which may be
-  * \p kInvalidDeviceType to indicate the domain is free.
+  * \brief Constructs a first-order domain for \p se_scope, which may be
+  * fully free (ie se_scope is unconstrained), partially free (ie se_scope has at least one
+  * of its target, device id or memory scope known), or fully fixed (ie se_scope has its target,
+  * device id and memory scope set).
+  *
+  * CAUTION: Use DeviceDomains::MakeFirstOrderDomain instead of this ctor.
   */
-  explicit DeviceDomain(DLDeviceType device_type) : device_type_(device_type) {}
+  explicit DeviceDomain(SEScope se_scope) : se_scope_(std::move(se_scope)) {}

  /*!
   * \brief Constructs a higher-order domain, where \p args_and_result contain the
   * function argument and result domains in order.
+  *
+  * CAUTION: Use DeviceDomains::MakeHigherOrderDomain instead of this ctor.
   */
  explicit DeviceDomain(std::vector<DeviceDomainPtr> args_and_result)
-      : device_type_(kInvalidDeviceType), args_and_result_(std::move(args_and_result)) {}
+      : se_scope_(SEScope::FullyUnconstrained()), args_and_result_(std::move(args_and_result)) {}

-  /*! \brief Returns true if domain is first-order and free. */
-  bool is_free() const { return device_type_ == kInvalidDeviceType && args_and_result_.empty(); }
-
-  /*! \brief Returns true if domain is higher-order. */
  bool is_higher_order() const { return !args_and_result_.empty(); }

-  DLDeviceType first_order_device_type() const {
-    ICHECK(args_and_result_.empty());
-    return device_type_;
+  SEScope first_order_se_scope() const {
+    ICHECK(args_and_result_.empty()) << "expecting domain to be first-order";
+    return se_scope_;
  }

  size_t function_arity() const {
-    ICHECK(!args_and_result_.empty());
+    ICHECK(!args_and_result_.empty()) << "expecting domain to be higher-order";
    return args_and_result_.size() - 1UL;
  }

  DeviceDomainPtr function_param(size_t i) const {
-    ICHECK(!args_and_result_.empty());
-    ICHECK_LT(i + 1, args_and_result_.size());
+    ICHECK(!args_and_result_.empty()) << "expecting domain to be higher-order";
+    ICHECK_LT(i + 1, args_and_result_.size()) << "parameter index is out of range";
    return args_and_result_[i];
  }

@@ -104,11 +109,12 @@ class DeviceDomain {

 private:
  /*!
-   * \brief If this is a function domain then always kInvalidDevice. Otherwise will be
-   * kInvalidDevice if the domain is still free, or the specific concrete device if the domain is
-   * bound.
+   * \brief If this is a function domain then always fully unconstrained. Otherwise will be
+   * fully unconstrained (the domain is still completely free), partially constrained
+   * (for example, the \p target and \p device_type are constrained but the \p virtual_device_id
+   * and \p memory_scope are still unconstrained), or fully constrained (everything is known).
   */
-  const DLDeviceType device_type_;
+  const SEScope se_scope_;

  /*!
 * \brief If this is a function domain then the sub-domains for each of the function's
@@ -116,81 +122,92 @@ class DeviceDomain {
   */
  const std::vector<DeviceDomainPtr> args_and_result_;

-  friend struct DeviceDomainHash;
-  friend struct DeviceDomainEqual;
  friend class DeviceDomains;
};

-// The following hash and equality helpers give each free first-order domain pointer its own
-// distinct identity.
-struct DeviceDomainHash {
-  size_t operator()(const DeviceDomainPtr& domain) const;
-};
-
-struct DeviceDomainEqual {
- public:
-  bool operator()(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs) const;
-};
-
/*!
 * \brief Tracks the device domains for a set of expressions w.r.t. an equivalence relation
- * built up by calls to \p Unify.
+ * built up by calls to \p UnifyOrNull.
 */
class DeviceDomains {
 public:
-  DeviceDomains() = default;
+  explicit DeviceDomains(CompilationConfig config);
+
+  const CompilationConfig& config() const { return config_; }

  /*!
-   * \brief Returns a domain appropriate for \p type who's result domain is bound
-   * to \p device_type. If \p device_type is \p kInvalidDeviceType then the entire domain
-   * will be free.
+   * \brief Returns the domain representing \p se_scope. If \p se_scope is fully constrained
+   * then the domain will be unique to that \p se_scope.
   */
-  static DeviceDomainPtr MakeDomain(const Type& type, DLDeviceType device_type);
+  DeviceDomainPtr MakeFirstOrderDomain(const SEScope& se_scope);

  /*!
   * \brief Returns a higher-order domain with \p args_and_results.
   */
-  static DeviceDomainPtr MakeDomain(std::vector<DeviceDomainPtr> arg_and_results) {
+  DeviceDomainPtr MakeHigherOrderDomain(std::vector<DeviceDomainPtr> arg_and_results) {
    return std::make_shared<DeviceDomain>(std::move(arg_and_results));
  }

-  /*! \brief Returns a domain with the given result device type appropriate \p device_type. */
-  static DeviceDomainPtr ForDeviceType(const Type& type, DLDeviceType device_type) {
-    ICHECK_NE(device_type, kInvalidDeviceType);
-    return MakeDomain(type, device_type);
-  }
+  /*!
+   * \brief Returns a domain appropriate for \p type whose result domain is bound to \p se_scope.
+   * If \p type is a function then all parameter domains will be completely free. It is valid for
+   * \p se_scope to be fully unconstrained.
+   */
+  DeviceDomainPtr MakeDomain(const Type& type, const SEScope& se_scope);
+
+  /*!
+   * \brief Returns a domain for \p type whose result is bound to \p non_canonical_se_scope,
+   * which cannot be fully unconstrained. We first canonicalize the scope to ensure it has
+   * a target and is unique.
+   */
+  DeviceDomainPtr ForSEScope(const Type& type, const SEScope& non_canonical_se_scope);

  /*! \brief Returns a free domain appropriate for \p type. */
-  static DeviceDomainPtr Free(const Type& type) { return MakeDomain(type, kInvalidDeviceType); }
+  DeviceDomainPtr Free(const Type& type) { return MakeDomain(type, SEScope::FullyUnconstrained()); }

  /*! \brief Returns the domain representing the equivalence class containing \p domain. */
  DeviceDomainPtr Lookup(DeviceDomainPtr domain);

  /*!
-   * \brief Returns the domain accounting for all bound devices in \p lhs and \p rhs.
-   *
-   * Throws \p Error on failure.
+   * \brief Returns the most constrained domain which agrees with both \p lhs and \p rhs. Returns
+   * null if no such domain exists, ie some first-order component of \p lhs is constrained
+   * differently than the corresponding component of \p rhs.
   */
-  DeviceDomainPtr Join(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs);
+  DeviceDomainPtr JoinOrNull(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs);

  /*!
- * \brief Unifies \p lhs and \p rhs, returning the most-bound of the two. Fails if \p lhs and \p - * rhs disagree on bound device type. - * - * Throws \p Error on failure. + * \brief Unifies \p lhs and \p rhs, returning the most-bound of the two. Returns null if + * \p lhs and \p rhs are not unifiable. */ // TODO(mbs): I don't think we need an occurs check since the program is well-typed, but // given we have refs to functions I'm prepared to be surprised. - DeviceDomainPtr Unify(DeviceDomainPtr lhs, DeviceDomainPtr rhs); + DeviceDomainPtr UnifyOrNull(DeviceDomainPtr lhs, DeviceDomainPtr rhs); + + /* + * \brief Force all domains in \p higher_order_domain to unify with \p first_order_domain. + * This can be used to handle functions within tuples, references and ADTs since we don't + * attempt to track anything beyond 'the device' for expressions of those first-order types. + * + * Returns false if any unification fails. + */ + bool CollapseOrFalse(const DeviceDomainPtr& first_order_domain, + const DeviceDomainPtr& higher_order_domain); /*! - * \brief Unifies \p lhs and \p rhs. If \p lhs is first-order and \p rhs is higher-order, - * require all arguments and result of \p rhs to unify with \p lhs. Otherwise same as - * \p Unify. + * \brief Unifies \p lhs_first_order and \p rhs_maybe_higher_order. If \p rhs_maybe_higher_order + * is indeed higher-order, require all of its arguments and result to unify with + * \p lhs_first_order. Otherwise same as \p Unify. Returns false if unification is not possible. * - * Throws \p Error on failure. + * In an expression such as: + * \code + * (fn(...) {...}, ...).0 + * \endcode + * we need to force all the devices of the inner function to be the same as the device for the + * overall tuple since the device domain does not understand tuples. Similarly for references + * and ADTs. */ - void UnifyCollapsed(const DeviceDomainPtr& lhs, const DeviceDomainPtr& rhs); + bool UnifyCollapsedOrFalse(const DeviceDomainPtr& lhs_first_order, + const DeviceDomainPtr& rhs_maybe_higher_order); /*! \brief Returns true if a domain is known for \p expr. */ bool contains(const Expr& expr) const { return expr_to_domain_.count(expr.get()); } @@ -204,7 +221,8 @@ class DeviceDomains { * DomainFor(call->op). * * This special handling is needed: - * - To handle the "on_device" and "device_copy" ops which constrain devices to the given devices. + * - To handle the "on_device" and "device_copy" ops which constrain devices to the given + * devices. * - To handle some special ops which constrain devices to the CPU. * - To allow the same primitive to be called on different devices at different call sites. * Since each call to the op can have a different domain we index the ops by the call expression @@ -212,11 +230,17 @@ class DeviceDomains { */ DeviceDomainPtr DomainForCallee(const Call& call); - /*! \brief Unifies the domains for expressions \p lhs and \p rhs. */ + /*! + * \brief Unifies the domains for expressions \p lhs and \p rhs. + * + * Aborts if unification fails. + */ void UnifyExprExact(const Expr& lhs, const Expr& rhs); /*! * \brief Unifies the domain for \p expr with \p expected_domain. + * + * Aborts if unification fails. */ void UnifyExprExact(const Expr& expr, const DeviceDomainPtr& expected_domain); @@ -224,37 +248,25 @@ class DeviceDomains { * \brief Unifies the domain for \p expr with \p expected_domain. * If \p expected_domain is higher-order but \p expr is first-order, require all arguments * and the result of \p expected_domain to have the same domain as for \p expr. 
-   */
-  void UnifyExprCollapsed(const Expr& expr, const DeviceDomainPtr& expected_domain);
-
-  /*! \brief Returns true if \p domain contains any free sub-domains. */
-  bool AnyFree(DeviceDomainPtr domain);
-
-  /*
-   * \brief Force all domains in \p higher_order_domain to unify with \p first_order_domain.
-   * This can be used to handle functions within tuples, references and ADTs since we don't
-   * attempt to track anything beyond 'the device' for expressions of those first-order types.
   *
-   * Throws \p Error on failure.
+   * Aborts if unification fails.
   */
-  void Collapse(const DeviceDomainPtr& first_order_domain,
-                const DeviceDomainPtr& higher_order_domain);
+  void UnifyExprCollapsed(const Expr& expr_first_order,
+                          const DeviceDomainPtr& expected_domain_maybe_higher_order);

-  /*! \brief Force all free domains in \p domain to default to \p default_device_type. */
-  void SetDefault(DeviceDomainPtr domain, DLDeviceType default_device_type);
+  /*! \brief Returns true if \p domain is fully constrained. */
+  bool IsFullyConstrained(DeviceDomainPtr domain);
+
+  /*! \brief Force all \p SEScopes in \p domain to default to \p default_se_scope. */
+  void SetDefault(DeviceDomainPtr domain, const SEScope& default_se_scope);

  /*!
-   * \brief If \p domain is higher-order and its result domain is free, force it to
-   * \p default_device_type. Then force any remaining free domains to the result domain
-   * (freshly defaulted or original). If \p domain is first-order same as \p SetDefault.
+   * \brief If \p domain is higher-order, default its result domain to \p default_se_scope.
+   * Then force all remaining \p SEScopes to the result domain (freshly defaulted or original).
+   * If \p domain is first-order same as \p SetDefault.
   */
-  void SetResultDefaultThenParams(const DeviceDomainPtr& domain, DLDeviceType default_device_type);
-
-  /*! \brief Returns one-line description of \p domain for debugging. */
-  std::string ToString(DeviceDomainPtr domain);
-
-  /*! \brief Returns description of entire system of constraints for debugging */
-  std::string ToString();
+  void SetResultDefaultThenParams(const DeviceDomainPtr& domain_maybe_higher_order,
+                                  const SEScope& default_se_scope);

  /*!
   * \brief Returns the result domain for \p domain (see defn in DeviceDomain comment).
@@ -262,13 +274,19 @@ class DeviceDomains {
  DeviceDomainPtr ResultDomain(DeviceDomainPtr domain);

  /*!
-   * \brief Returns the result (possibly free) device type for \p domain (see defn in DeviceDomain
-   * comment).
+   * \brief Returns the result \p SEScope (possibly unconstrained) for \p domain
+   * (see defn in DeviceDomain comment).
   */
-  DLDeviceType ResultDeviceType(const DeviceDomainPtr& domain) {
-    return ResultDomain(domain)->first_order_device_type();
+  SEScope ResultSEScope(const DeviceDomainPtr& domain) {
+    return ResultDomain(domain)->first_order_se_scope();
  }

+  /*! \brief Returns one-line description of \p domain for debugging. */
+  std::string ToString(DeviceDomainPtr domain);
+
+  /*! \brief Returns description of entire system of constraints for debugging */
+  std::string ToString();
+
 private:
  /*! \brief Intrinsics we need to handle specially. */
  const Op& alloc_storage_op = Op::Get("memory.alloc_storage");
@@ -277,12 +295,14 @@ class DeviceDomains {
  const Op& invoke_tvm_op = Op::Get("vm.invoke_tvm_op");
  const Op& shape_func_op = Op::Get("vm.shape_func");
  const Op& reshape_tensor_op = Op::Get("vm.reshape_tensor");
-  /*! \brief The CPU device type for special operators such as dynamic shape functions.
 */
-  const DLDeviceType cpu_device_type_ = kDLCPU;
-  /*! \brief Placeholder for any first-order type. */
-  Type arb_ = TupleType();
-  /*! \brief The domain for first-order expressions on the CPU. */
-  DeviceDomainPtr cpu_domain_ = ForDeviceType(arb_, cpu_device_type_);
+
+  CompilationConfig config_;
+
+  /*!
+   * \brief The domain for first-order expressions of non-tensor type, such as shapes and
+   * buffer dimensions. Generally this will be a CPU.
+   */
+  DeviceDomainPtr host_domain_;

  /*! \brief Maps expressions to their domains as determined during analysis. */
  std::unordered_map<const ExprNode*, DeviceDomainPtr> expr_to_domain_;
@@ -293,8 +313,19 @@ class DeviceDomains {
  std::unordered_map<const CallNode*, DeviceDomainPtr> call_to_callee_domain_;

  /*! \brief Maps device domains to their equivalent domains as determined during unification. */
-  std::unordered_map<DeviceDomainPtr, DeviceDomainPtr, DeviceDomainHash, DeviceDomainEqual>
-      domain_to_equiv_;
+  std::unordered_map<DeviceDomainPtr, DeviceDomainPtr> domain_to_equiv_;
+
+  /*!
+   * \brief Maps fully constrained \p SEScopes to their corresponding domains. By sharing those
+   * domains we can ensure:
+   *
+   * \code
+   * domain0 != domain1 && domain0 fully constrained && domain1 fully constrained
+   * ==> domain0 and domain1 are incompatible
+   * \endcode
+   */
+  std::unordered_map<SEScope, DeviceDomainPtr, StructuralHash, StructuralEqual>
+      fully_constrained_se_scope_to_domain_;
};

}  // namespace transform
diff --git a/src/relay/transforms/device_planner.cc b/src/relay/transforms/device_planner.cc
index 83429a9e616f0..a6298aa677b84 100644
--- a/src/relay/transforms/device_planner.cc
+++ b/src/relay/transforms/device_planner.cc
@@ -19,22 +19,22 @@
 /*!
  * \file src/relay/transforms/device_planner.cc
- * \brief Determines a unique device to hold the result of every Relay sub-expression.
+ * \brief Determines a unique \p SEScope to hold the result of every Relay sub-expression.
 *
 * We say a Relay expression E is 'on device D' if the result of executing E is stored on D.
- * Currently we only track the 'device_type' of D and not its 'device id'. We do not track the
- * specific target associated with D (this is recovered independently via a TargetMap), and we
- * do not track the storage scope within D (this is yet to be implemented).
+ * We represent D by an \p SEScope, which means we can track anywhere from an arbitrary device
+ * of some \p DLDeviceType to a specific memory scope on a specific (virtual) \p Device whose
+ * code is compiled with a specific \p Target.
 *
 * Note that 'stored on device D' is almost but not quite the same as 'executes on device D',
 * see below.
 *
 * This pass assumes the module already contains some "on_device" and/or "device_copy" CallNodes:
- * - "device_copy" CallNodes (with a \p DeviceCopyAttrs attribute) specify a 'src_dev_type' and
-     'dst_dev_type' device type, which constrain the argument and context of the call
+ * - "device_copy" CallNodes (with a \p DeviceCopyAttrs attribute) specify 'src_se_scope' and
+     'dst_se_scope' \p SEScopes, which constrain the argument and context of the call
 *   respectively. It is ok if source and destination devices are the same; such no-op copies
 *   will be removed after accounting for the device preference.
- * - "on_device" CallNodes (with a \p OnDeviceAttrs attribute) specify a 'device_type', which
+ * - "on_device" CallNodes (with an \p OnDeviceAttrs attribute) specify an 'se_scope', which
 *   constrains the argument of the call, but (usually, see below) leaves the context
 *   unconstrained. These are called 'annotations' in the rest of the code, have no operational
 *   significance by themselves, but may trigger the insertion of a new "device_copy".
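The constraint flow sketched above boils down to union-find unification over partially
constrained scopes. The following is a minimal, self-contained sketch of that mechanism
(plain C++17 with invented toy types, not the TVM API), for readers who want the core idea
without the surrounding infrastructure:

#include <memory>
#include <optional>
#include <stdexcept>
#include <string>

// A toy stand-in for SEScope: each component is either unconstrained (nullopt) or fixed.
struct ToyScope {
  std::optional<int> device_type;         // e.g. 1 = CPU, 2 = GPU
  std::optional<int> virtual_device_id;
  std::optional<std::string> memory_scope;
};

// Join one component: free joins with anything; two fixed values must agree.
template <typename T>
std::optional<T> JoinComponent(const std::optional<T>& lhs, const std::optional<T>& rhs) {
  if (!lhs.has_value()) return rhs;
  if (!rhs.has_value()) return lhs;
  if (*lhs != *rhs) throw std::runtime_error("incompatible scope constraints");
  return lhs;
}

// A first-order domain in a union-find forest.
struct ToyDomain {
  ToyScope scope;
  std::shared_ptr<ToyDomain> parent;  // null if this is the representative
};

using ToyDomainPtr = std::shared_ptr<ToyDomain>;

// Find the representative of a domain's equivalence class.
ToyDomainPtr Lookup(ToyDomainPtr domain) {
  while (domain->parent != nullptr) domain = domain->parent;  // path compression elided
  return domain;
}

// Unify two domains: the surviving representative holds the component-wise join of all
// constraints, so constraints "flow" between every expression sharing the class.
// (Error recovery on a failed join is elided for brevity.)
ToyDomainPtr Unify(ToyDomainPtr lhs, ToyDomainPtr rhs) {
  lhs = Lookup(lhs);
  rhs = Lookup(rhs);
  if (lhs == rhs) return lhs;
  lhs->scope.device_type = JoinComponent(lhs->scope.device_type, rhs->scope.device_type);
  lhs->scope.virtual_device_id =
      JoinComponent(lhs->scope.virtual_device_id, rhs->scope.virtual_device_id);
  lhs->scope.memory_scope = JoinComponent(lhs->scope.memory_scope, rhs->scope.memory_scope);
  rhs->parent = lhs;
  return lhs;
}

On this model an "on_device(e, se_scope=s)" annotation is just a Unify of e's domain with a
fresh fully constrained domain for s, and a "device_copy" unifies its argument with the source
scope and the call itself with the destination scope; a failed join is what ultimately surfaces
as the "Incompatible SEScopes for expression" diagnostic above.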
@@ -63,15 +63,16 @@
 * -------
 * We flow constraints from the "on_device" and "device_copy" calls (and some special ops, see
 * below) to all other Relay sub-expressions. (For idempotence we also respect any existing
- * "param_device_types" and "result_device_type" function attributes we introduce below.)
+ * "param_se_scopes" and "result_se_scope" function attributes we introduce below.)
 *
 * For a primitive such as \code add(e1, e2) \endcode all arguments and results must be on the
 * same device. However each call site can use a different device. In other words primitives are
- * 'device polymorphic' since we compile and execute them for each required device.
+ * 'device polymorphic' since we compile and execute them for each required device. ADT constructors
+ * are similarly polymorphic.
 *
 * For most Relay expressions the device for the overall expression is the same as the device
- * for it's sub-expressions. E.g. each field of a tuple must be on the same device as the tuple
- * itself, the condition and arms of an \p if must all be on the same device as the overall if,
+ * for its sub-expressions. E.g. each field of a tuple must be on the same device as the tuple
+ * itself, the condition and arms of an \p if must all be on the same device as the overall \p if,
 * and so on.
 *
 * Some special ops (or 'dialects') are handled:
@@ -91,18 +92,18 @@
 *
 * Phase 2
 * -------
- * After flowing constraints we apply some defaulting heuristics (using a global default device)
+ * After flowing constraints we apply some defaulting heuristics (using a global default \p SEScope)
 * to fix the device for any as-yet unconstrained sub-expressions.
 *  - Unconstrained function result devices default to the global default device.
 *  - Unconstrained function parameter devices default to the device for the function result.
 *  - Unconstrained let-bound expression devices default to the device for the overall let.
- * TODO(mbs): I may have over-innovated here and we simply want to bind all free domaints to
- * the global default device. Worth a design doc with motivating examples I think.
+ * TODO(mbs): These are very simple-minded heuristics, and ultimately we'd like to treat the
+ * assignment of the remaining unconstrained sub-expressions as an optimization problem in itself.
 *
 * Phase 3
 * -------
 * Finally, the result of this analysis is reified into the result as:
- *  - Additional "param_device_types" (an Array<Integer>) and "result_device_type" (Integer)
+ *  - Additional "param_se_scopes" (an \p Array<SEScope>) and "result_se_scope" (an \p SEScope)
 *    attributes for every function (both top-level and local). These describe the devices for
 *    the function's parameters and the result.
 *  - Additional "device_copy" CallNodes where a copy is required in order to respect the
@@ -124,14 +125,15 @@
 *    passes must preserve the lexical scoping of the "on_device" CallNodes. E.g. conversion
 *    to ANF must respect the lexical scoping convention:
 *    \code
- *    f(on_device(g(h(a, b), c), device_type=CPU))
+ *    f(on_device(g(h(a, b), c), se_scope=CPU))
 *    ==>
- *    let %x0 = on_device(h(a, b), device_type=CPU)
- *    let %x1 = on_device(g(%x0), device-type=CPU)
- *    f(on_device(%x1, device_type=CPU))
+ *    let %x0 = on_device(h(a, b), se_scope=CPU)
+ *    let %x1 = on_device(g(%x0), se_scope=CPU)
+ *    f(on_device(%x1, se_scope=CPU))
 *    \endcode
 *
 * This pass can be run before FuseOps so that it can use device-specific fusion rules.
+ * TODO(mbs): We also need to support running after FuseOps.
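 *
 * For example (an illustrative sketch using hypothetical scopes gpu and cpu, not a test case):
 * in
 * \code
 *   def @main(%x, %y) {
 *     let %a = add(%x, %y);
 *     on_device(negative(%a), se_scope=gpu, is_fixed=True)
 *   }
 * \endcode
 * phase 1 constrains %a (and hence %x and %y, since add's arguments and result share a device)
 * to gpu, and the fixed "on_device" constrains the function result to gpu, so phase 2 has
 * nothing left to default. Had the body carried no annotations at all, phase 2 would instead
 * default the function result to the global default \p SEScope and then the parameters and
 * let-binding to that result scope.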
 *
 * 'Stored on' vs 'Executes on'
 * ----------------------------
@@ -147,7 +149,7 @@
 * pass, but we'd like to fold that into device planning here to ensure everything is consistent.
 *
 * Obviously since tensors are passed-by-pointer it's quite possible to execute a Relay
- * expression (eg an if expression) on one device even though the tensor data resides on
+ * expression (eg an \p if expression) on one device even though the tensor data resides on
 * another. But for AOT that flexibility seems excessive. So we'd like to just take 'executes on'
 * to be 'stored on' exactly. In particular, for a Relay function, we'd like to be able to just
 * compile the function body for the function's result device.
@@ -157,7 +159,7 @@
 * minimize cross-device calls by moving device copies out of functions. E.g.:
 * \code
 *   def @f() {  // execute on CPU
- *     let x = on_device(...GPU computation..., device_type=GPU);
+ *     let x = on_device(...GPU computation..., se_scope=GPU);
 *     device_copy(...GPU computation..., src_dev_type=GPU, dst_dev_type=CPU)
 *   }
 *   def @main() {
@@ -189,7 +191,7 @@
 * \code
 *   let f = fn(x, y) { ... }
 *   let g = fn(f, z) { f(z, z) }
- *   g(f, on_device(..., device_type=CPU))
+ *   g(f, on_device(..., se_scope=CPU))
 * \endcode
 * the parameters \p x and \p y will be on the CPU.
 *
@@ -226,28 +228,16 @@
 *   `-- Mark's stamp of completeness :-)
 *
 * TODO(mbs):
- *  * Though on_device is the identity for all types we can't wrap it around functions/constructors
- *    taking type args (or at least not without changing type_infer.cc to see through them).
- *    This is not currently handled generally.
 *  * Proper diagnostics for unification failure using spans.
- *  * Make sure the pass is idempotent even after FuseOps etc.
- *  * Support application of constructors properly. Are they device polymorphic?
- *  * Replace DLDeviceType with TargetDevice, and unify 'target annotation' with 'device planning'.
 *  * Support running the pass post FuseOps (so need to understand primitive functions, both
- *    outlines and lined) and post the VM transforms (probably need to support more intrinsic
+ *    outlined and inlined) and post the VM transforms (probably need to support more intrinsic
 *    forms?).
 *  * Don't hardcode the 'CPU' device for shape funcs etc, and distinguish between the default
 *    device for primitives vs the default device for the rest of Relay.
- *  * We'll probably need some support for partial 'device polymorphism' for functions once we
- *    incorporate targets and memory scopes into the domain. For example it's ok for the function
- *    body to be executed on different device ids provided they have the same target and memory
- *    scope.
- *  * Might be simpler to just let every type have a device annotation rather than work in
- *    a separate domain?
+ *  * We may want some 'device polymorphism' for Relay functions. Eg it's ok for the function
+ *    to be called with params/result on different (virtual) device ids provided the target and
+ *    memory scopes are consistent.
 *  * Switch to expr.CopyWith(...) form once implemented to avoid unnecessary copies.
- *  * The original device_annotation.cc RewriteAnnotatedOps removed all "on_device" calls
- *    in tuples at the top level of function bodies or main expression, irrespective of the
- *    "on_device" body. What's up with that?
 */

#include 
@@ -267,6 +257,7 @@
#include 

#include "../op/annotation/annotation.h"
#include "../op/memory/device_copy.h"
+#include "../op/memory/on_device.h"
#include "./device_domains.h"

namespace tvm {
@@ -283,11 +274,11 @@ namespace {
 * \brief Rewrites "on_device" calls to handle some special cases.
* * \code - * let %x = on_device(e, device_type=d) - * ==> let %x = on_device(e, device_type=d, is_fixed=True) + * let %x = on_device(e, se_scope=d) + * ==> let %x = on_device(e, se_scope=d, is_fixed=True) * - * fn(%x) { on_device(e, device_type=d) } - * ==> fn(%x) { on_device(e, device_type=d, is_fixed=True) + * fn(%x) { on_device(e, se_scope=d) } + * ==> fn(%x) { on_device(e, se_scope=d, is_fixed=True) * * on_device(e).0 * ==> on_device(e.0) @@ -303,12 +294,12 @@ class RewriteOnDevices : public ExprMutator { // TODO(mbs): Avoid copy. Expr tuple_get_item = TupleGetItem(tuple, tuple_get_item_node->index, tuple_get_item_node->span); - auto props = GetOnDeviceProps(tuple); + OnDeviceProps props = GetOnDeviceProps(tuple); if (props.body.defined() && !props.is_fixed) { VLOG(1) << "wrapping tuple get item:" << std::endl << PrettyPrint(GetRef(tuple_get_item_node)) << std::endl - << "with \"on_device\" for device " << props.device_type; - return OnDevice(tuple_get_item, props.device_type, /*is_fixed=*/false); + << "with \"on_device\" for SEScope " << props.se_scope; + return OnDevice(tuple_get_item, props.se_scope, /*is_fixed=*/false); } else { return tuple_get_item; } @@ -320,12 +311,12 @@ class RewriteOnDevices : public ExprMutator { while (const auto* inner_let_node = expr.as()) { Expr inner_let = GetRef(inner_let_node); Expr value = VisitExpr(inner_let_node->value); - auto props = GetOnDeviceProps(value); + OnDeviceProps props = GetOnDeviceProps(value); if (props.body.defined() && !props.is_fixed) { VLOG(1) << "revising let-bound expression of let:" << std::endl << PrettyPrint(expr) << std::endl - << "to be fixed to device " << props.device_type; - value = OnDevice(props.body, props.device_type, /*is_fixed=*/true); + << "to be fixed to SEScope " << props.se_scope; + value = OnDevice(props.body, props.se_scope, /*is_fixed=*/true); } bindings.emplace_back(inner_let_node->var, value, inner_let_node->span); expr = inner_let_node->body; @@ -341,12 +332,12 @@ class RewriteOnDevices : public ExprMutator { Expr VisitExpr_(const FunctionNode* function_node) final { Expr body = VisitExpr(function_node->body); - auto props = GetOnDeviceProps(body); + OnDeviceProps props = GetOnDeviceProps(body); if (props.body.defined() && !props.is_fixed) { VLOG(1) << "revising body of function:" << std::endl << PrettyPrint(GetRef(function_node)) << std::endl - << "to be fixed to device " << props.device_type; - body = OnDevice(props.body, props.device_type, /*is_fixed=*/true); + << "to be fixed to SEScope " << props.se_scope; + body = OnDevice(props.body, props.se_scope, /*is_fixed=*/true); } // TODO(mbs): Avoid copy return Function(function_node->params, body, function_node->ret_type, @@ -363,12 +354,12 @@ class RewriteOnDevices : public ExprMutator { * It is possible some devices remain free and will need to be defaulted by \p DeviceDefaulter. * * Eg from \code add(%x, %y) \endcode we know \p %x and \p %y must be on the same device. Later, - * from \code on_device(%x, device_type=d) \endcode we know \p %x must be on device \p d, and thus + * from \code on_device(%x, se_scope=d) \endcode we know \p %x must be on device \p d, and thus * so must \p %y. * * Constraints can flow in interesting ways. E.g. 
in: * \code - * let %f = fn(%x, %y) { add(%x, on_device(%y, device_type=d)) } + * let %f = fn(%x, %y) { add(%x, on_device(%y, se_scope=d)) } * let %g = fn(%f, %x, %y) { %f(%x, %y) } * %g(%f, %a, %b) * \endcode @@ -376,8 +367,8 @@ class RewriteOnDevices : public ExprMutator { */ class DeviceAnalyzer : public ExprVisitor { public: - explicit DeviceAnalyzer(IRModule mod) - : mod_(std::move(mod)), domains_(std::make_unique()) {} + DeviceAnalyzer(IRModule mod, CompilationConfig config) + : mod_(std::move(mod)), domains_(std::make_unique(std::move(config))) {} /*! * \brief Returns the expression-to-device-domain map for all expressions in all the global @@ -413,7 +404,7 @@ class DeviceAnalyzer : public ExprVisitor { } args_and_result_domains.emplace_back(domains_->DomainFor(call)); auto implied_domain = - DeviceDomains::MakeDomain(std::move(args_and_result_domains)); // higher-order + domains_->MakeHigherOrderDomain(std::move(args_and_result_domains)); // higher-order VLOG(1) << "initial call function domain:" << std::endl << domains_->ToString(func_domain) << std::endl @@ -423,18 +414,15 @@ class DeviceAnalyzer : public ExprVisitor { << PrettyPrint(call); // The above must match. - try { - domains_->Unify(func_domain, implied_domain); // higher-order - } catch (const Error& e) { + if (domains_->UnifyOrNull(func_domain, implied_domain) == nullptr) { // higher-order // TODO(mbs): Proper diagnostics. - LOG(FATAL) << "Function parameters and result devices do not match those of call. Call:" + LOG(FATAL) << "Function parameters and result SEScopes do not match those of call. Call:" << std::endl << PrettyPrint(call) << std::endl - << "with function devices:" << std::endl + << "with function scopes:" << std::endl << domains_->ToString(func_domain) << std::endl - << "and implied call devices:" << std::endl - << domains_->ToString(implied_domain) << std::endl - << e.what(); + << "and implied call scopes:" << std::endl + << domains_->ToString(implied_domain); } VLOG(1) << "final call function domain:" << std::endl @@ -492,31 +480,27 @@ class DeviceAnalyzer : public ExprVisitor { VisitExpr(function_node->params[i]); } - // If the function already has device attributes then we can further constrain the + // If the function already has SEScope attributes then we can further constrain the // function's domain to match them. - if (GetFunctionResultDeviceType(function_node) != kInvalidDeviceType) { + if (!GetFunctionResultSEScope(function_node)->IsFullyUnconstrained()) { std::vector args_and_result; for (size_t i = 0; i < function_node->params.size(); ++i) { - args_and_result.emplace_back( - domains_->ForDeviceType(function_node->params[i]->checked_type(), - GetFunctionParamDeviceType(function_node, i))); + args_and_result.emplace_back(domains_->ForSEScope( + function_node->params[i]->checked_type(), GetFunctionParamSEScope(function_node, i))); } - args_and_result.emplace_back(domains_->ForDeviceType( - function_node->body->checked_type(), GetFunctionResultDeviceType(function_node))); - auto annotation_domain = domains_->MakeDomain(std::move(args_and_result)); - try { - domains_->Unify(func_domain, annotation_domain); // higher-order - } catch (const Error& e) { + args_and_result.emplace_back(domains_->ForSEScope(function_node->body->checked_type(), + GetFunctionResultSEScope(function_node))); + auto annotation_domain = domains_->MakeHigherOrderDomain(std::move(args_and_result)); + if (domains_->UnifyOrNull(func_domain, annotation_domain) == nullptr) { // higher-order // TODO(mbs): Proper diagnostics. 
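        // (UnifyOrNull returned null: some first-order component of the function's inferred
        // domain is constrained differently from the domain implied by its attributes.)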
LOG(FATAL) - << "Function devices are incompatible with its \"on_device\" annotation. Function:" + << "Function SEScopes are incompatible with its \"on_device\" annotation. Function:" << std::endl << PrettyPrint(function) << std::endl - << "with function devices:" << std::endl + << "with function scopes:" << std::endl << domains_->ToString(func_domain) << std::endl - << "and annotation devices:" << std::endl - << domains_->ToString(annotation_domain) << std::endl - << e.what(); + << "and annotation scopes:" << std::endl + << domains_->ToString(annotation_domain); } } @@ -652,7 +636,7 @@ class DeviceAnalyzer : public ExprVisitor { * \code * def @main(%x, %y, %z) { * let %a = add(%x, %y); - * multiply(%a, on_device(%z, device_type=d)) + * multiply(%a, on_device(%z, se_scope=d)) * \endcode * we know the parameter \p %z must be on device \p d, but the devices for \p %x and \p %y, * and the device for the function result, are still 'free'. The global 'default' device type @@ -664,15 +648,12 @@ class DeviceAnalyzer : public ExprVisitor { */ class DeviceDefaulter : public ExprVisitor { public: - DeviceDefaulter(IRModule mod, std::unique_ptr domains, - DLDeviceType default_device_type) - : mod_(std::move(mod)), - domains_(std::move(domains)), - default_device_type_(default_device_type) {} + DeviceDefaulter(IRModule mod, std::unique_ptr domains) + : mod_(std::move(mod)), domains_(std::move(domains)) {} std::unique_ptr Default() { VLOG_CONTEXT << "DeviceDefaulter"; - VLOG(0) << "using default device type " << default_device_type_; + VLOG(0) << "defaulting to SEScope " << domains_->config()->default_primitive_se_scope; for (const auto& pair : mod_->functions) { VLOG(1) << "defaulting devices for '" << PrettyPrint(pair.first) << "'"; VisitExpr(pair.second); @@ -689,9 +670,10 @@ class DeviceDefaulter : public ExprVisitor { auto function = GetRef(function_node); auto func_domain = domains_->DomainFor(function); // higher-order ICHECK_EQ(func_domain->function_arity(), function_node->params.size()); - if (domains_->AnyFree(func_domain)) { + if (!domains_->IsFullyConstrained(func_domain)) { VLOG(1) << "before defaulting function:" << std::endl << domains_->ToString(func_domain); - domains_->SetResultDefaultThenParams(func_domain, default_device_type_); + domains_->SetResultDefaultThenParams(func_domain, + domains_->config()->default_primitive_se_scope); VLOG(1) << "after defaulting function:" << std::endl << domains_->ToString(func_domain); } VisitExpr(function_node->body); @@ -701,12 +683,13 @@ class DeviceDefaulter : public ExprVisitor { auto call = GetRef(call_node); auto func_domain = domains_->DomainForCallee(call); // higher-order ICHECK_EQ(func_domain->function_arity(), call_node->args.size()); - if (domains_->AnyFree(func_domain)) { + if (!domains_->IsFullyConstrained(func_domain)) { // For calls to Relay functions this step is identical to that for VisitExpr_(FunctionNode*) // above. But for calls to primitives we may still need to force free domains to be // defaulted. 
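      // E.g. a call to a primitive whose arguments and result were never otherwise constrained
      // picks up the config's default_primitive_se_scope here.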
VLOG(1) << "before defaulting callee:" << std::endl << domains_->ToString(func_domain); - domains_->SetResultDefaultThenParams(func_domain, default_device_type_); + domains_->SetResultDefaultThenParams(func_domain, + domains_->config()->default_primitive_se_scope); VLOG(1) << "after defaulting callee:" << std::endl << domains_->ToString(func_domain); } return ExprVisitor::VisitExpr_(call_node); @@ -719,12 +702,12 @@ class DeviceDefaulter : public ExprVisitor { Let let = Downcast(expr); // If the let-var device is still free force it to match the overall let. auto let_domain = domains_->DomainFor(let); // may be higher-order - DLDeviceType let_device_type = domains_->ResultDeviceType(let_domain); - ICHECK_NE(let_device_type, kInvalidDeviceType); + SEScope let_se_scope = domains_->ResultSEScope(let_domain); + ICHECK(!let_se_scope->IsFullyUnconstrained()); auto let_var_domain = domains_->DomainFor(let->var); // may be higher-order - if (domains_->AnyFree(let_var_domain)) { + if (!domains_->IsFullyConstrained(let_var_domain)) { VLOG(1) << "before defaulting let-var:" << std::endl << domains_->ToString(let_var_domain); - domains_->SetDefault(let_var_domain, let_device_type); + domains_->SetDefault(let_var_domain, let_se_scope); VLOG(1) << "after defaulting let-var:" << std::endl << domains_->ToString(let_var_domain); } VisitExpr(let->var); @@ -738,8 +721,6 @@ class DeviceDefaulter : public ExprVisitor { IRModule mod_; /*! \brief The domains for all expressions. */ std::unique_ptr domains_; - /*! \brief The default device type. */ - DLDeviceType default_device_type_; }; /****** @@ -754,7 +735,7 @@ class DeviceDefaulter : public ExprVisitor { * - Discard any existing "on_device" CallNodes since their job is done. Similarly, discard * any existing "device_copy" CallNodes which are no-ops. * - * - Functions are given "param_device_types" and "result_device_type" attributes to capture + * - Functions are given "param_se_scopes" and "result_se_scope" attributes to capture * the device type for its parameters and result. * * - Additional "device_copy" CallNodes are inserted wherever there's a transition between @@ -773,10 +754,10 @@ class DeviceDefaulter : public ExprVisitor { * * For example, we'll end up with programs that look like: * \code - * def @main(%x, %y, param_device_types=[...], result_device_type=...) { - * let %a = on_device(..., device_type=..., is_fixed=True) - * @f(%a, device_copy(on_device(..., device_type=..., is_fixed=True), - * src_device_type=..., dst_device_type=...)) + * def @main(%x, %y, param_se_scopes=[...], result_se_scope=...) { + * let %a = on_device(..., se_scope=..., is_fixed=True) + * @f(%a, device_copy(on_device(..., se_scope=..., is_fixed=True), + * src_se_scope=..., dst_se_scope=...)) * } * \endcode */ @@ -823,32 +804,32 @@ class DeviceCapturer : public ExprMutator { // Gather the parameter and result device types for the function attributes. 
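    // (These become the function's "param_se_scopes" and "result_se_scope" attributes, attached
    // via FunctionOnDevice below.)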
ICHECK_EQ(func_domain->function_arity(), function_node->params.size()); - DLDeviceType result_device_type = domains_->ResultDeviceType(func_domain); - ICHECK_NE(result_device_type, kInvalidDeviceType); - Array param_device_types; - param_device_types.reserve(function_node->params.size()); + SEScope result_se_scope = domains_->ResultSEScope(func_domain); + ICHECK(!result_se_scope->IsFullyUnconstrained()); + Array param_se_scopes; + param_se_scopes.reserve(function_node->params.size()); for (size_t i = 0; i < function_node->params.size(); ++i) { - DLDeviceType param_device_type = domains_->ResultDeviceType(func_domain->function_param(i)); - ICHECK_NE(param_device_type, kInvalidDeviceType); - param_device_types.push_back(param_device_type); + SEScope param_se_scope = domains_->ResultSEScope(func_domain->function_param(i)); + ICHECK(!param_se_scope->IsFullyUnconstrained()); + param_se_scopes.push_back(param_se_scope); } // Rewrite the body. Note that the body may have begun with an "on_device" so // be prepared to insert a "device_copy". Expr body = VisitChild( - /*lexical_device_type=*/result_device_type, - /*expected_device_type=*/result_device_type, - /*child_device_type=*/GetDeviceType(function_node->body), function_node->body); + /*lexical_se_scope=*/result_se_scope, + /*expected_se_scope=*/result_se_scope, + /*child_se_scope=*/GetSEScope(function_node->body), function_node->body); // TODO(mbs): Avoid copy Function func = Function(function_node->params, body, function_node->ret_type, function_node->type_params, function_node->attrs, function_node->span); - return FunctionOnDevice(func, param_device_types, result_device_type); + return FunctionOnDevice(func, std::move(param_se_scopes), std::move(result_se_scope)); } Expr VisitExpr_(const CallNode* call_node) final { auto call = GetRef(call_node); - DLDeviceType call_device_type = GetDeviceType(call); + SEScope call_se_scope = GetSEScope(call); auto on_device_props = GetOnDeviceProps(call_node); if (on_device_props.body.defined()) { @@ -857,31 +838,36 @@ class DeviceCapturer : public ExprMutator { return VisitExpr(on_device_props.body); } - auto device_copy_props = GetDeviceCopyProps(call_node); + DeviceCopyProps device_copy_props = GetDeviceCopyProps(call_node); if (device_copy_props.body.defined()) { - DLDeviceType src_device_type = device_copy_props.src_dev_type; - ICHECK_EQ(call_device_type, device_copy_props.dst_dev_type); - if (call_device_type == src_device_type) { + SEScope src_se_scope = domains_->config()->CanonicalSEScope(device_copy_props.src_se_scope); + SEScope dst_se_scope = domains_->config()->CanonicalSEScope(device_copy_props.dst_se_scope); + ICHECK_EQ(call_se_scope, dst_se_scope); + if (src_se_scope == dst_se_scope) { // We can pinch out existing "device_copy" CallNodes if their source and destinations // match. return VisitExpr(device_copy_props.body); + } else { + return VisitChild(/*lexical_se_scope=*/dst_se_scope, + /*expected_se_scope=*/dst_se_scope, + /*child_se_scope=*/src_se_scope, device_copy_props.body); } - // else: handle as for any other call. } + // Generic call. 
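    // Neither an "on_device" annotation nor a non-trivial "device_copy": use the callee's
    // domain to decide the scope of the callee itself and of each argument.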
auto func_domain = domains_->DomainForCallee(call); // higher-order VLOG(1) << "considering call:" << std::endl << PrettyPrint(call) << std::endl - << "on device " << call_device_type << " with function domain:" << std::endl + << "in scope " << call_se_scope << " with function domain:" << std::endl << domains_->ToString(func_domain); - DLDeviceType result_device_type = domains_->ResultDeviceType(func_domain); - ICHECK_NE(result_device_type, kInvalidDeviceType); + SEScope result_se_scope = domains_->ResultSEScope(func_domain); + ICHECK(!result_se_scope->IsFullyUnconstrained()); // The callee is on the current device. Expr op = VisitChild( - /*lexical_device_type=*/call_device_type, - /*expected_device_type=*/call_device_type, - /*child_device_type=*/result_device_type, call_node->op); + /*lexical_se_scope=*/call_se_scope, + /*expected_se_scope=*/call_se_scope, + /*child_se_scope=*/result_se_scope, call_node->op); // Each argument can be on the device for the corresponding function parameter. However if // any of those differ from the overall call device then wrap them in an "on_device" to @@ -890,13 +876,13 @@ class DeviceCapturer : public ExprMutator { args.reserve(call_node->args.size()); ICHECK_EQ(func_domain->function_arity(), call->args.size()); for (size_t i = 0; i < call_node->args.size(); ++i) { - DLDeviceType param_device_type = domains_->ResultDeviceType(func_domain->function_param(i)); - ICHECK_NE(param_device_type, kInvalidDeviceType) + SEScope param_se_scope = domains_->ResultSEScope(func_domain->function_param(i)); + ICHECK(!param_se_scope->IsFullyUnconstrained()) << "for parameter " << i << " for call:" << std::endl << PrettyPrint(call); - args.push_back(VisitChild(/*lexical_device_type=*/call_device_type, - /*expected_device_type=*/param_device_type, - /*child_device_type=*/GetDeviceType(call_node->args[i]), + args.push_back(VisitChild(/*lexical_se_scope=*/call_se_scope, + /*expected_se_scope=*/param_se_scope, + /*child_se_scope=*/GetSEScope(call_node->args[i]), call_node->args[i])); } // TODO(mbs): Avoid copy @@ -907,27 +893,27 @@ class DeviceCapturer : public ExprMutator { Expr VisitExpr_(const LetNode* let_node) final { Expr expr = GetRef(let_node); // Iterate through chained lets, provided they all agree on their device type. - DLDeviceType let_device_type = GetDeviceType(expr); + SEScope let_se_scope = GetSEScope(expr); std::vector> bindings; while (const auto* inner_let_node = expr.as()) { Expr inner_let = GetRef(inner_let_node); - if (GetDeviceType(inner_let) != let_device_type) { + if (GetSEScope(inner_let) != let_se_scope) { // We have a device transition which needs to be handled. break; } // The let-bound value can be on a different device than the overall let. However if those // devices don't agree wrap the let-bound value in an "on_device" to help downstream // transforms track devices lexically. 
- Expr value = VisitChild(/*lexical_device_type=*/let_device_type, - /*expected_device_type=*/GetDeviceType(inner_let_node->var), - /*child_device_type=*/GetDeviceType(inner_let_node->value), - inner_let_node->value); + Expr value = + VisitChild(/*lexical_se_scope=*/let_se_scope, + /*expected_se_scope=*/GetSEScope(inner_let_node->var), + /*child_se_scope=*/GetSEScope(inner_let_node->value), inner_let_node->value); bindings.emplace_back(inner_let_node->var, value, inner_let_node->span); expr = inner_let_node->body; } - Expr body = VisitChild(/*lexical_device_type=*/let_device_type, - /*expected_device_type=*/let_device_type, - /*child_device_type=*/GetDeviceType(expr), expr); + Expr body = VisitChild(/*lexical_se_scope=*/let_se_scope, + /*expected_se_scope=*/let_se_scope, + /*child_se_scope=*/GetSEScope(expr), expr); for (auto itr = bindings.rbegin(); itr != bindings.rend(); ++itr) { body = Let(/*var=*/std::get<0>(*itr), /*value=*/std::get<1>(*itr), body, /*span=*/std::get<2>(*itr)); @@ -987,69 +973,69 @@ class DeviceCapturer : public ExprMutator { return Match(data, std::move(clauses), match_node->complete, match_node->span); } - DLDeviceType GetDeviceType(const Expr& expr) { + SEScope GetSEScope(const Expr& expr) { // Look through any "on_device" CallNodes, to mimic how we will be pinching them out. - auto props = GetOnDeviceProps(expr); + OnDeviceProps props = GetOnDeviceProps(expr); Expr true_expr = props.body.defined() ? props.body : expr; ICHECK(domains_->contains(true_expr)); - // If expr is higher order we'll return only the result domain's device type. - DLDeviceType device_type = domains_->ResultDeviceType(domains_->DomainFor(true_expr)); - ICHECK_NE(device_type, kInvalidDeviceType) - << "no device type was determined for expression:" << std::endl + // If expr is higher order we'll return only the result domain's SEScope. + SEScope se_scope = domains_->ResultSEScope(domains_->DomainFor(true_expr)); + ICHECK(!se_scope->IsFullyUnconstrained()) + << "no SEScope was determined for expression:" << std::endl << PrettyPrint(true_expr); - return device_type; + return std::move(se_scope); } /*! - * \brief Reconcile the \p child_device_type for \p child with both the \p expected_device_type - * (as required by the expression context the \p child is in) and the \p lexical_device_type + * \brief Reconcile the \p child_se_scope for \p child with both the \p expected_se_scope + * (as required by the expression context the \p child is in) and the \p lexical_se_scope * (as a downstream transform would infer based only on lexically enclosing "on_device" - * CallNodes and function attributes.) Generally \p lexical_device_type and \p - * expected_device_type are the same by definition, but may differ in arguments to functions + * CallNodes and function attributes.) Generally \p lexical_se_scope and \p + * expected_se_scope are the same by definition, but may differ in arguments to functions * and let-bound expressions. * - * If \p child_device_type differs from \p expected_device_type, wrap it as: + * If \p child_se_scope differs from \p expected_se_scope, wrap it as: * \code - * device_copy(on_device(child', device_type=child_device_type), - * src_dev_type=child_device_type, dst_dev_type=expected_device_type) + * device_copy(on_device(child', se_scope=child_se_scope), + * src_dev_type=child_se_scope, dst_dev_type=expected_se_scope) * \endcode * (where child is rewritten to child'). Note the pedantic spelling out of "on_device" on the * child. 
 *
-   * If \p expected_device_type differs from \p lexical_device_type, then (also) wrap
+   * If \p expected_se_scope differs from \p lexical_se_scope, then (also) wrap
   * the expression as:
   * \code
-   *   on_device(..., device_type=expected_device_type)
+   *   on_device(..., se_scope=expected_se_scope)
   * \endcode
   *
   * TODO(mbs): There's no attempt at sharing here. Each usage of the child node could end up
   * wrapped by a "device_copy", even though those copies will generally all be to the same
   * destination device.
   */
-  Expr VisitChild(DLDeviceType lexical_device_type, DLDeviceType expected_device_type,
-                  DLDeviceType child_device_type, const Expr& child) {
-    ICHECK_NE(lexical_device_type, kInvalidDeviceType);
-    ICHECK_NE(expected_device_type, kInvalidDeviceType);
-    if (child->IsInstance<OpNode>()) {
-      // Primitive operators don't need to be rewritten and can have a different domain for
-      // each call site.
+  Expr VisitChild(const SEScope& lexical_se_scope, const SEScope& expected_se_scope,
+                  const SEScope& child_se_scope, const Expr& child) {
+    ICHECK(!lexical_se_scope->IsFullyUnconstrained());
+    ICHECK(!expected_se_scope->IsFullyUnconstrained());
+    if (child->IsInstance<OpNode>() || child->IsInstance<ConstructorNode>()) {
+      // Primitive operators and constructors don't need to be rewritten and can have a
+      // different domain at each call site.
      return child;
    }
    Expr result = VisitExpr(child);
-    if (child_device_type != expected_device_type) {
-      VLOG(1) << "creating " << DeviceCopyOp()->name << " from device type " << child_device_type
-              << " to device type " << expected_device_type << " for:" << std::endl
+    if (child_se_scope != expected_se_scope) {
+      VLOG(1) << "creating " << DeviceCopyOp()->name << " from scope " << child_se_scope
+              << " to scope " << expected_se_scope << " for:" << std::endl
              << PrettyPrint(result);
      // Also wrap the child in an "on_device" so downstream transforms can track devices
      // lexically.
-      result = MaybeOnDevice(result, child_device_type, /*is_fixed=*/true);
-      result = DeviceCopy(result, child_device_type, expected_device_type);
+      result = MaybeOnDevice(result, child_se_scope, /*is_fixed=*/true);
+      result = DeviceCopy(result, child_se_scope, expected_se_scope);
    }
-    if (expected_device_type != lexical_device_type) {
-      VLOG(1) << "creating " << OnDeviceOp()->name << " for device type " << expected_device_type
+    if (expected_se_scope != lexical_se_scope) {
+      VLOG(1) << "creating " << OnDeviceOp()->name << " for scope " << expected_se_scope
              << " for:" << std::endl
              << PrettyPrint(result);
-      result = MaybeOnDevice(result, expected_device_type, /*is_fixed=*/true);
+      result = MaybeOnDevice(result, expected_se_scope, /*is_fixed=*/true);
    }
    return result;
  }
@@ -1059,9 +1045,9 @@ class DeviceCapturer : public ExprMutator {
   * is expected to be on the same device as the \p parent.
   */
  Expr VisitChild(const Expr& parent, const Expr& child) {
-    DLDeviceType expected_device_type = GetDeviceType(parent);
-    DLDeviceType child_device_type = GetDeviceType(child);
-    return VisitChild(expected_device_type, expected_device_type, child_device_type, child);
+    SEScope expected_se_scope = GetSEScope(parent);
+    SEScope child_se_scope = GetSEScope(child);
+    return VisitChild(expected_se_scope, expected_se_scope, child_se_scope, child);
  }

  /*! \brief Module we are rewriting, so we can lookup global variables. */
@@ -1079,21 +1065,22 @@ tvm::transform::Pass Rewrite() {
}

/*! \brief Run the remaining phases.
*/ -tvm::transform::Pass PlanDevicesCore(DLDeviceType default_device_type) { +tvm::transform::Pass PlanDevicesCore(CompilationConfig config) { return tvm::transform::CreateModulePass( - [=](IRModule mod, tvm::transform::PassContext pass_cnxt) -> IRModule { + [config = std::move(config)](IRModule mod, + tvm::transform::PassContext pass_cnxt) -> IRModule { // Collect the system of constraints for every sub-expression using existing "on_device" // and "device_copy" calls. - std::unique_ptr domains = DeviceAnalyzer(mod).Analyze(); + std::unique_ptr domains = DeviceAnalyzer(mod, config).Analyze(); VLOG(1) << "Domains after analysis:" << std::endl << domains->ToString(); // Choose sensible default devices for every sub-expression if otherwise unconstrained // by existing "on_device" or "device_copy" calls. - domains = DeviceDefaulter(mod, std::move(domains), default_device_type).Default(); + domains = DeviceDefaulter(mod, std::move(domains)).Default(); VLOG(1) << "Domains after defaulting: " << std::endl << domains->ToString(); // Insert "device_copy" and "on_device" CallNodes where needed to unambiguously capture - // the above map, and attach additional "param_device_types" and "result_device_type" + // the above map, and attach additional "param_se_scopes" and "result_se_scope" // attributes to all function definitions. return DeviceCapturer(mod, std::move(domains)).Capture(); }, @@ -1107,17 +1094,14 @@ tvm::transform::Pass PlanDevicesCore(DLDeviceType default_device_type) { *******/ // This function is declared in the public . -TVM_DLL tvm::transform::Pass PlanDevices(DLDeviceType default_device_type) { +tvm::transform::Pass PlanDevices(CompilationConfig config) { std::vector passes; passes.emplace_back(Rewrite()); - passes.emplace_back(PlanDevicesCore(default_device_type)); - return tvm::transform::Sequential(std::move(passes), "PlanDevices"); + passes.emplace_back(PlanDevicesCore(std::move(config))); + return tvm::transform::Sequential(passes, "PlanDevices"); } -TVM_REGISTER_GLOBAL("relay._transform.PlanDevices") - .set_body_typed([](const Device& default_device) { - return PlanDevices(default_device.device_type); - }); +TVM_REGISTER_GLOBAL("relay._transform.PlanDevices").set_body_typed(PlanDevices); } // namespace transform } // namespace relay diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index c48a9b30967c6..05ee9d5ad5921 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -31,8 +31,7 @@ #include #include -#include "../op/annotation/annotation.h" -#include "./device_aware_visitors.h" +#include "../op/memory/on_device.h" #include "./pattern_utils.h" namespace tvm { @@ -42,7 +41,7 @@ namespace transform { namespace { /*! * \brief Returns whether \p expr is a literal \p Constant, optionally wrapped by an "on_device" - * annotation CallNode (which serves only to associate a device to the constant and has no + * annotation CallNode (which serves only to associate an \p SEScope to the constant and has no * operational effect). */ bool IsSimpleConstant(const Expr& expr) { @@ -87,19 +86,19 @@ class ConstantFolder : public MixedModeMutator { // the variable. // // We need to retain any "on_device" annotation so that downstream 'device aware' - // passes can still retrieve the device for the constant in its new position(s). Eg: - // def @f(..., result_device_type=D) { - // let %x = on_device(... 
something we eval to a constant..., device_type=E)
+    // passes can still retrieve the \p SEScope for the constant in its new position(s). Eg:
+    //   def @f(..., result_se_scope=D) {
+    //     let %x = on_device(... something we eval to a constant..., se_scope=E)
     //     @f(..., %x, ...)
     //   }
-    // Here the default device is D, whereas the argument %x to @f is on E (and @f expects
+    // Here the default scope is D, whereas the argument %x to @f is on E (and @f expects
     // that). No on_device annotation is required in the call according to the convention used
     // by the device-aware visitors.
     //
     // However once we've inlined the constant we need to insert an on_device, again to
     // respect the convention used by the device-aware visitors.
-    //   def @f(..., result_device_type=D) {
-    //     @f(..., on_device(...the constant..., device_type=E), ...)
+    //   def @f(..., result_se_scope=D) {
+    //     @f(..., on_device(...the constant..., se_scope=E), ...)
     //   }
     VLOG(1) << "Replacing let-binding for " << op->var->name_hint()
             << " with constant:" << std::endl
@@ -215,8 +214,8 @@ class ConstantFolder : public MixedModeMutator {
     Expr result = tuple_node->fields[tuple_get_item_node->index];
     OnDeviceProps props = GetOnDeviceProps(post_tuple_get_item_node->tuple);
     if (props.body.defined()) {
-      // (on_device((x, y, z), device_type=D).1 ==> on_device(y, device_type=D)
-      return MaybeOnDevice(result, props.device_type, props.is_fixed);
+      // (on_device((x, y, z), se_scope=D).1 ==> on_device(y, se_scope=D)
+      return MaybeOnDevice(result, props.se_scope, props.is_fixed);
     } else {
       return result;
     }
@@ -248,19 +247,15 @@ class ConstantFolder : public MixedModeMutator {
     VLOG(1) << "Evaluating:" << std::endl << PrettyPrint(expr);

     // We'll invoke the interpreter using the generic CPU device and target. Technically there's
-    // no guarantee the results we bitwise equal what we'd get on the true device, however to
+    // no guarantee the results will be bitwise equal to what we'd get on the true device, however to
     // support cross-compilation we don't want to assume the true device is available.
-    Device dev;
-    dev.device_type = kDLCPU;
-    dev.device_id = 0;
-    Target target = Target("llvm");

     // Use a fresh build context in case we are already in a build context.
// needed for both execution and creation(due to JIT) With fresh_build_ctx(transform::PassContext::Create()); - Expr result = - ObjectToExpr(Eval(expr, module_->type_definitions, module_->Imports(), dev, target)); + Expr result = ObjectToExpr( + Eval(expr, module_->type_definitions, module_->Imports(), eval_cpu_dev_, eval_cpu_target_)); VLOG(1) << "Evaluated to constant:" << std::endl << PrettyPrint(result); return result; } @@ -288,17 +283,14 @@ class ConstantFolder : public MixedModeMutator { } // Get the constant shape - Device dev; - dev.device_type = kDLCPU; - dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); if (ishape.empty()) { - value = runtime::NDArray::Empty({}, cdtype, dev); + value = runtime::NDArray::Empty({}, cdtype, eval_cpu_dev_); } else { ICHECK_NE(ishape.size(), 0); std::vector cshape = {static_cast(ishape.size())}; - value = runtime::NDArray::Empty(cshape, cdtype, dev); + value = runtime::NDArray::Empty(cshape, cdtype, eval_cpu_dev_); auto* dims = static_cast(value->data); using ::tvm::tir::IntImmNode; for (size_t i = 0; i < ishape.size(); ++i) { @@ -313,7 +305,7 @@ class ConstantFolder : public MixedModeMutator { Constant shape = Downcast(ObjectToExpr(value)); if (shape->data.Shape().empty() && GetScalarFromConstant(shape) == 0) { - auto ndarray = runtime::NDArray::Empty({}, cdtype, dev); + auto ndarray = runtime::NDArray::Empty({}, cdtype, eval_cpu_dev_); shape = Constant(ndarray); } @@ -342,12 +334,9 @@ class ConstantFolder : public MixedModeMutator { } // Get the constant size - Device dev; - dev.device_type = kDLCPU; - dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); - value = runtime::NDArray::Empty({}, cdtype, dev); + value = runtime::NDArray::Empty({}, cdtype, eval_cpu_dev_); auto* data = static_cast(value->data); if (ishape.empty()) { *data = 0; @@ -390,6 +379,13 @@ class ConstantFolder : public MixedModeMutator { // Module IRModule module_; + // The kDLCPU device assumed to be available to the compiler. Used only when evaluating + // sub-expressions. + Device eval_cpu_dev_{kDLCPU, /*device_id=*/0}; + // The target for the above device assumed to be available to the compiler. Used only when + // evaluating sub-expressions. + Target eval_cpu_target_{"llvm"}; + // Cache the following ops for equivalence checking in this pass. const Op& device_copy_op_; const Op& shape_of_op_; diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 81d704e2be8ed..a651a063d4182 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -57,17 +57,6 @@ using namespace tvm::runtime; namespace tvm { namespace relay { -inline Constant MakeConstant(const std::vector& value) { - return MakeConstantTensor(DataType::Int(64), {static_cast(value.size())}, value); -} - -inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype, - Array assert_shape, DLDeviceType offset_device_type) { - auto offset = - OnDevice(MakeConstantScalar(DataType::Int(64), 0), offset_device_type, /*is_fixed=*/true); - return AllocTensor(storage, offset, shape, dtype, assert_shape); -} - // Check if the primitive function contains only reshape ops. 
bool IsReshapeOnly(const Expr& expr) { if (const FunctionNode* func = expr.as()) { @@ -87,11 +76,13 @@ bool IsReshapeOnly(const Expr& expr) { class DialectRewriter : public transform::DeviceAwareExprMutator { public: - DialectRewriter(IRModule mod, const Target& target_host) - : transform::DeviceAwareExprMutator(std::move(mod)), target_host_(target_host) {} + DialectRewriter(IRModule mod, SEScope host_se_scope) + : transform::DeviceAwareExprMutator(std::move(mod)), + host_se_scope_(std::move(host_se_scope)) {} Function Rewrite(const Function& expr) { return Downcast(Mutate(expr)); } + private: Expr VisitExpr_(const TupleNode* tn) final { LetList& scope = scopes_.back(); Array new_fields; @@ -130,7 +121,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { Expr DeviceAwareVisitExpr_(const CallNode* cn) final { Call call = GetRef(cn); - DLDeviceType device_type = GetInScopeDeviceType(call); + SEScope se_scope = GetSEScope(call); if (IsPrimitive(cn)) { // Because we are in ANF we do not need to visit the arguments. // TODO(mbs): But does so anyway... @@ -162,26 +153,21 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { } const DeviceCopyAttrs* copy_attr = attr.as(); CHECK(copy_attr); - return DeviceCopy(new_args[0], copy_attr->src_dev_type, copy_attr->dst_dev_type); + return DeviceCopy(new_args[0], copy_attr->src_se_scope, copy_attr->dst_se_scope); } else if (IsDynamic(ret_type)) { Function func = Downcast(cn->op); - // TODO(mbs): Device id is always zero. - Device device{device_type, /*device_id=*/0}; - return DynamicInvoke(&scope, func, ins, new_args, out_types, ret_type, device); + return DynamicInvoke(&scope, func, ins, new_args, out_types, ret_type, se_scope); } else { // Handle the static case Array outs; for (size_t i = 0; i < out_types.size(); ++i) { - DLDeviceType device_type = GetInScopeDeviceType(GetRef(cn)); - // TODO(mbs): Device id is always zero. - Device device{device_type, /*device_id=*/0}; - auto out = MakeStaticAllocation(&scope, out_types[i], device, std::to_string(i)); + auto out = MakeStaticAllocation(&scope, out_types[i], se_scope, std::to_string(i)); outs.push_back(out); } Tuple output(outs); // TODO(mbs): Capture device in attributes. Expr invoke = InvokeTVMOp(cn->op, ins, output); - scope.Push(OnDevice(invoke, device_type, /*is_fixed=*/true)); + scope.Push(OnDevice(invoke, se_scope, /*is_fixed=*/true)); return ToTupleType(ret_type, std::vector(output->fields.begin(), output->fields.end())); } @@ -190,11 +176,26 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { } } - private: + /*! Returns the Relay Constant representing the 1d tensor with \p value. + * + * CAUTION: Make sure the constant ends up on the correct device. + */ + inline Constant MakeConstant(const std::vector& value) { + return MakeConstantTensor(DataType::Int(64), {static_cast(value.size())}, value); + } + + /*! Returns an \p alloc_tensor call for a tensor of \p shape and \p dtype over \p storage. */ + inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype, + Array assert_shape) { + Expr offset = OnDevice(MakeConstantScalar(DataType::Int(64), 0), host_se_scope_, + /*is_fixed=*/true); + return tvm::relay::AllocTensor(storage, std::move(offset), std::move(shape), dtype, + assert_shape); + } + // Insert a device copy node. 
- Expr DeviceCopy(const Expr& inp, int src_dev, int dst_dev) { - return Mutate(relay::DeviceCopy(inp, static_cast(src_dev), - static_cast(dst_dev))); + Expr DeviceCopy(const Expr& inp, SEScope src_se_scope, SEScope dst_se_scope) { + return Mutate(relay::DeviceCopy(inp, std::move(src_se_scope), std::move(dst_se_scope))); } // Check if a call invokes a primitive function. @@ -249,28 +250,28 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { } // Allocate a tensor with a statically known shape. - Var MakeStaticAllocation(LetList* scope, const TensorType& type, Device dev, String name_hint) { + Var MakeStaticAllocation(LetList* scope, const TensorType& type, const SEScope& se_scope, + String name_hint) { std::vector int_shape; for (auto it : type->shape) { const auto* imm = it.as(); CHECK(imm) << "expect static int shape"; int_shape.push_back(imm->value); } - Expr shape = OnDevice(MakeConstant(int_shape), cpu_device_.device_type, /*is_fixed=*/true); - Expr size = OnDevice(ComputeStorage(type), cpu_device_.device_type, /*is_fixed=*/true); + Expr shape = OnDevice(MakeConstant(int_shape), host_se_scope_, /*is_fixed=*/true); + Expr size = OnDevice(ComputeStorage(type), host_se_scope_, /*is_fixed=*/true); // Alignment is directly captured in the instruction rather than calculated, so we // don't want to wrap it with an "on_device". Expr alignment = ComputeAlignment(type->dtype); // Run type inference later to get the correct type. Var var("storage_" + name_hint, Type(nullptr)); - Expr value = OnDevice(AllocStorage(size, alignment, dev, type->dtype), dev.device_type, + Expr value = OnDevice(AllocStorage(size, alignment, se_scope, type->dtype), se_scope, /*is_fixed=*/true); auto sto = scope->Push(var, value); // TODO(@jroesch): There is a bug with typing based on the constant shape. - auto tensor = OnDevice( - AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape, cpu_device_.device_type), - dev.device_type, /*is_fixed=*/true); + auto tensor = OnDevice(AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape), + se_scope, /*is_fixed=*/true); Var tensor_var("tensor_" + name_hint, Type(nullptr)); return scope->Push(tensor_var, tensor); } @@ -282,7 +283,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { tec::TECompiler compiler; - tec::CCacheKey key(func, target_host_); + tec::CCacheKey key(func, host_se_scope_->target); auto cfunc = compiler->LowerShapeFunc(key); auto input_states = cfunc->shape_func_param_states; @@ -310,10 +311,10 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { is_inputs.push_back(0); } else if (state == tec::kNeedInputData) { auto new_arg = Mutate(arg); // already accounts for device - DLDeviceType device_type = GetInScopeDeviceType(arg); - if (device_type != cpu_device_.device_type) { - new_arg = OnDevice(DeviceCopy(new_arg, device_type, cpu_device_.device_type), - cpu_device_.device_type, /*is_fixed=*/true); + SEScope arg_se_scope = GetSEScope(arg); + if (arg_se_scope != host_se_scope_) { + new_arg = OnDevice(DeviceCopy(new_arg, arg_se_scope, host_se_scope_), host_se_scope_, + /*is_fixed=*/true); } Var in_shape_var("in_shape_" + std::to_string(input_pos), Type(nullptr)); shape_func_ins.push_back(scope->Push(in_shape_var, new_arg)); @@ -331,14 +332,14 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { auto tt = TensorType(out->shape, out->dtype); // Put shape func on CPU. This also ensures that everything between // shape_of and shape_func are on CPU. 
- auto alloc = OnDevice(MakeStaticAllocation(scope, tt, cpu_device_, std::to_string(i)), - cpu_device_.device_type, /*is_fixed=*/true); + auto alloc = OnDevice(MakeStaticAllocation(scope, tt, host_se_scope_, std::to_string(i)), + host_se_scope_, /*is_fixed=*/true); Var shape_func_out_var("shape_func_out_" + std::to_string(i), Type(nullptr)); alloc = scope->Push(shape_func_out_var, alloc); out_shapes.push_back(alloc); } auto shape_call = OnDevice(ShapeFunc(func, Tuple(shape_func_ins), Tuple(out_shapes), is_inputs), - cpu_device_.device_type, /*is_fixed=*/true); + host_se_scope_, /*is_fixed=*/true); Var shape_func_var("shape_func", Type(nullptr)); scope->Push(shape_func_var, shape_call); return out_shapes; @@ -347,19 +348,19 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { // Generate the code for invoking a TVM op with a dynamic shape. Expr DynamicInvoke(LetList* scope, const Function& func, const Tuple& ins, const std::vector& new_args, const std::vector& out_types, - const Type& ret_type, Device dev) { + const Type& ret_type, const SEScope& se_scope) { auto out_shapes = EmitShapeFunc(scope, func, new_args); std::vector storages; CHECK_EQ(out_shapes.size(), out_types.size()); for (size_t i = 0; i < out_shapes.size(); ++i) { auto out_shape = out_shapes[i]; auto out_type = out_types[i]; - auto size = OnDevice(ComputeStorageInRelay(out_shape, out_type), cpu_device_.device_type, + auto size = OnDevice(ComputeStorageInRelay(out_shape, out_type), host_se_scope_, /*is_fixed=*/true); // Alignment is directly captured in the instruction so don't wrap in "on_device". auto alignment = ComputeAlignment(out_type->dtype); Var sto_var("storage_" + std::to_string(i), Type(nullptr)); - auto val = OnDevice(AllocStorage(size, alignment, dev, out_type->dtype), dev.device_type, + auto val = OnDevice(AllocStorage(size, alignment, se_scope, out_type->dtype), se_scope, /*is_fixed=*/true); storages.push_back(scope->Push(sto_var, val)); } @@ -369,15 +370,14 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { auto out_shape = out_shapes[i]; auto out_type = out_types[i]; auto storage = storages[i]; - auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape, - cpu_device_.device_type), - dev.device_type, /*is_fixed=*/true); + auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape), + se_scope, /*is_fixed=*/true); Var out_var("out_" + std::to_string(i), Type(nullptr)); outs.push_back(scope->Push(out_var, alloc)); } Tuple tuple_outs(outs); - auto invoke = OnDevice(InvokeTVMOp(func, ins, tuple_outs), dev.device_type, /*is_fixed=*/true); + auto invoke = OnDevice(InvokeTVMOp(func, ins, tuple_outs), se_scope, /*is_fixed=*/true); scope->Push(invoke); return ToTupleType(ret_type, std::vector(tuple_outs->fields.begin(), tuple_outs->fields.end())); @@ -397,27 +397,24 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { CHECK(imm) << "expect static int shape"; shape.push_back(imm->value); } - shape_expr = OnDevice(MakeConstant(shape), cpu_device_.device_type, /*is_fixed=*/true); + shape_expr = OnDevice(MakeConstant(shape), host_se_scope_, /*is_fixed=*/true); } return ReshapeTensor(new_args[0], shape_expr, ret_ty->shape); } private: const Op& device_copy_op_ = Op::Get("device_copy"); + runtime::DataType compute_dtype_ = runtime::DataType::Int(64); + SEScope host_se_scope_; - Target target_host_; std::vector scopes_; - - runtime::DataType compute_dtype_ = runtime::DataType::Int(64); - Device cpu_device_{kDLCPU, 0}; 
}; namespace transform { -Pass ManifestAlloc(Target target_host, Map targets) { - CheckAndUpdateHostConsistency(&targets, &target_host); +Pass ManifestAlloc(SEScope host_se_scope) { return tvm::transform::CreateModulePass( - [=](IRModule mod, const PassContext& pass_ctx) { + [host_se_scope](IRModule mod, const PassContext& pass_ctx) { // We need to mutate module, therefore making a copy of it. mod.CopyOnWrite(); mod->ImportFromStd("core.rly"); @@ -427,7 +424,7 @@ Pass ManifestAlloc(Target target_host, Map targets) { for (const auto& it : glob_funcs) { if (auto* func_node = it.second.as()) { auto func = GetRef(func_node); - auto rewriter = DialectRewriter(mod, target_host); + auto rewriter = DialectRewriter(mod, host_se_scope); auto updated_func = rewriter.Rewrite(func); mod->Update(it.first, updated_func); @@ -440,11 +437,7 @@ Pass ManifestAlloc(Target target_host, Map targets) { 0, "ManifestAlloc", {}); } -TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc") - .set_body_typed([](Target target_host, Map targets) { - CheckAndUpdateHostConsistency(&targets, &target_host); - return ManifestAlloc(target_host, targets); - }); +TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc").set_body_typed(ManifestAlloc); } // namespace transform diff --git a/src/relay/transforms/pass_utils.h b/src/relay/transforms/pass_utils.h index fd7f0a5594c2a..317ac17f83c86 100644 --- a/src/relay/transforms/pass_utils.h +++ b/src/relay/transforms/pass_utils.h @@ -37,6 +37,7 @@ #include "../analysis/dependency_graph.h" #include "../op/annotation/annotation.h" +#include "../op/memory/on_device.h" #include "./let_list.h" namespace tvm { diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc index c767770a8be8c..0814e73ab73d4 100644 --- a/src/relay/transforms/to_a_normal_form.cc +++ b/src/relay/transforms/to_a_normal_form.cc @@ -211,14 +211,14 @@ class Fill : ExprFunctor, private transform::Lexi } Expr Atomic(const Expr& e, const Var& v) { - Expr annotated_expr = MaybeOnDevice(e, GetInScopeDeviceType(e), /*is_fixed=*/true); + Expr annotated_expr = MaybeOnDevice(e, GetSEScope(e), /*is_fixed=*/true); return v.defined() ? GetScope(e)->let_list->Push(v, annotated_expr) : annotated_expr; } // Bind expression `now` to var `v` if the original expression is in the include set, or if // v is already defined (e.g. coming from a Let expression). Otherwise return `now` directly Expr Compound(const Expr& orig, const Expr& now, const Var& v) { - Expr annotated_expr = MaybeOnDevice(now, GetInScopeDeviceType(orig), /*is_fixed=*/true); + Expr annotated_expr = MaybeOnDevice(now, GetSEScope(orig), /*is_fixed=*/true); Var var = v.defined() ? v : Var(String("x"), Type()); bool not_included = include_set_ && include_set_->find(orig) == include_set_->end(); if (!v.defined() && not_included) { @@ -229,15 +229,15 @@ class Fill : ExprFunctor, private transform::Lexi } Expr VisitExpr_(const CallNode* c, const Var& v) final { - auto props = GetOnDeviceProps(c); + OnDeviceProps props = GetOnDeviceProps(c); if (props.body.defined() && props.is_fixed) { // Keep track of expression device type for lexically enclosing sub-expressions. - PushDeviceType(props.device_type); + PushSEScope(props.se_scope); Expr body = VisitExpr(props.body, v); // We are done with this sub-expression. - PopDeviceType(); + PopSEScope(); // Preserve the "on_device" annotations. 
- return OnDevice(body, props.device_type, props.is_fixed); + return OnDevice(body, props.se_scope, props.is_fixed); } Expr e = GetRef(c); @@ -292,9 +292,9 @@ class Fill : ExprFunctor, private transform::Lexi } else { // Keep track of expression and bound variable device types for lexically enclosing // sub-expressions. - PushDeviceType(GetFunctionResultDeviceType(f)); + PushSEScope(GetFunctionResultSEScope(f)); for (size_t i = 0; i < f->params.size(); ++i) { - PushBoundVar(f->params[i], GetFunctionParamDeviceType(f, i)); + PushBoundVar(f->params[i], GetFunctionParamSEScope(f, i)); } EnterFunctionBody(); ret = Function(f->params, GetSubScope(e, 0)->let_list->Get(VisitExpr(f->body)), f->ret_type, @@ -304,7 +304,7 @@ class Fill : ExprFunctor, private transform::Lexi for (size_t i = 0; i < f->params.size(); ++i) { PopBoundVar(f->params[i]); } - PopDeviceType(); + PopSEScope(); } if (function_nesting() == 0) { ICHECK(!v.defined()); @@ -319,7 +319,7 @@ class Fill : ExprFunctor, private transform::Lexi Expr VisitExpr_(const LetNode* l, const Var& v) final { Expr e = GetRef(l); // Keep track of bound variable device types for lexically enclosing sub-expressions. - PushBoundVar(l->var, GetInScopeDeviceType(l->value)); + PushBoundVar(l->var, GetSEScope(l->value)); VisitExpr(l->value, l->var); Expr ret = GetSubScope(e, 0)->let_list->Get(VisitExpr(l->body)); // We are done with these sub-expressions. diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc index 09b928fa1e392..f83e27d2c11d8 100644 --- a/src/runtime/vm/bytecode.cc +++ b/src/runtime/vm/bytecode.cc @@ -23,6 +23,7 @@ */ #include +#include #include #include @@ -119,13 +120,10 @@ Instruction::Instruction(const Instruction& instr) { this->shape_of.tensor = instr.shape_of.tensor; return; case Opcode::ReshapeTensor: - this->reshape_tensor.tensor = instr.reshape_tensor.tensor; - this->reshape_tensor.newshape = instr.reshape_tensor.newshape; + this->reshape_tensor = instr.reshape_tensor; return; case Opcode::DeviceCopy: - this->src = instr.src; - this->src_device_type = instr.src_device_type; - this->dst_device_type = instr.dst_device_type; + this->device_copy = instr.device_copy; return; default: std::ostringstream out; @@ -225,13 +223,10 @@ Instruction& Instruction::operator=(const Instruction& instr) { this->shape_of.tensor = instr.shape_of.tensor; return *this; case Opcode::ReshapeTensor: - this->reshape_tensor.tensor = instr.reshape_tensor.tensor; - this->reshape_tensor.newshape = instr.reshape_tensor.newshape; + this->reshape_tensor = instr.reshape_tensor; return *this; case Opcode::DeviceCopy: - this->src = instr.src; - this->src_device_type = instr.src_device_type; - this->dst_device_type = instr.dst_device_type; + this->device_copy = instr.device_copy; return *this; default: std::ostringstream out; @@ -338,14 +333,14 @@ Instruction Instruction::AllocTensorReg(RegName storage, RegName offset, RegName } Instruction Instruction::AllocStorage(RegName size, Index alignment, DLDataType dtype_hint, - Index device_type, RegName dst) { + Index device_index, RegName dst) { Instruction instr; instr.op = Opcode::AllocStorage; instr.dst = dst; instr.alloc_storage.allocation_size = size; instr.alloc_storage.alignment = alignment; instr.alloc_storage.dtype_hint = dtype_hint; - instr.alloc_storage.device_type = device_type; + instr.alloc_storage.device_index = device_index; return instr; } @@ -366,14 +361,14 @@ Instruction Instruction::ReshapeTensor(RegName tensor, RegName newshape, RegName return instr; } -Instruction 
Instruction::DeviceCopy(RegName src, Index src_device_type, Index dst_device_type, +Instruction Instruction::DeviceCopy(RegName src, Index src_device_index, Index dst_device_index, RegName dst) { Instruction instr; instr.op = Opcode::DeviceCopy; instr.dst = dst; - instr.src = src; - instr.src_device_type = src_device_type; - instr.dst_device_type = dst_device_type; + instr.device_copy.src = src; + instr.device_copy.src_device_index = src_device_index; + instr.device_copy.dst_device_index = dst_device_index; return instr; } @@ -609,7 +604,7 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { os << "alloc_storage $" << instr.dst << " $" << instr.alloc_storage.allocation_size << " " << instr.alloc_storage.alignment << " " << DLDataType2String(instr.alloc_storage.dtype_hint) << " " - << instr.alloc_storage.device_type; + << instr.alloc_storage.device_index; break; } case Opcode::ShapeOf: { @@ -622,8 +617,8 @@ void InstructionPrint(std::ostream& os, const Instruction& instr) { break; } case Opcode::DeviceCopy: { - os << "device_copy $" << instr.dst << " $" << instr.src << " " << instr.dst_device_type << " " - << instr.src_device_type; + os << "device_copy $" << instr.dst << " $" << instr.device_copy.src << " " + << instr.device_copy.dst_device_index << " " << instr.device_copy.src_device_index; break; } default: diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 4d7ee457e1e66..25b29cc2bcaf7 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -63,6 +63,8 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtrGetBytecode(); }); } else if (name == "get_constants") { return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetConstants(); }); + } else if (name == "get_virtual_devices") { + return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetVirtualDevices(); }); } else if (name == "get_stats") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->Stats(); }); } else if (name == "save") { @@ -165,13 +167,21 @@ String ShapeString(const ShapeTuple& shape_tuple, DLDataType dtype) { std::string Executable::GetConstants() const { std::ostringstream oss; - for (size_t i = 0; i < constants.size(); ++i) { const auto& constant = constants[i]; auto ndarray = Downcast(constant); - DLDeviceType device_type = static_cast(const_device_type[i]); oss << "VM Constant[" << i << "]: has shape " << ShapeString(ndarray.Shape(), ndarray->dtype) - << " on device of type " << device_type << std::endl; + << " on device index " << const_device_indexes[i] << std::endl; + } + return oss.str(); +} + +std::string Executable::GetVirtualDevices() const { + std::ostringstream oss; + for (size_t i = 0; i < virtual_devices.size(); ++i) { + const auto& device = virtual_devices[i]; + oss << "VM VirtualDevice[" << i << "]: device type " << device.device_type << " and id " + << device.device_id << std::endl; } return oss.str(); } @@ -245,6 +255,9 @@ TVMByteArray Executable::Save() { // Save header SaveHeader(&strm); + // Save virtual devices section. + SaveVirtualDevicesSection(&strm); + // Global section. 
SaveGlobalSection(&strm); @@ -263,6 +276,11 @@ TVMByteArray Executable::Save() { return arr; } +void Executable::SaveVirtualDevicesSection(dmlc::Stream* strm) { + strm->Write(virtual_devices); + strm->Write(host_device_index); +} + void Executable::SaveGlobalSection(dmlc::Stream* strm) { std::vector> globals(this->global_map.begin(), this->global_map.end()); @@ -289,8 +307,8 @@ void Executable::SaveConstantSection(dmlc::Stream* strm) { runtime::SaveDLTensor(strm, it); } - // Save the const to device mapping. - strm->Write(this->const_device_type); + // Save the const to device index mapping. + strm->Write(this->const_device_indexes); } void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) { @@ -407,7 +425,7 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { fields.push_back(dtype.code); fields.push_back(dtype.bits); fields.push_back(dtype.lanes); - fields.push_back(instr.alloc_storage.device_type); + fields.push_back(instr.alloc_storage.device_index); fields.push_back(instr.dst); break; } @@ -487,7 +505,8 @@ VMInstructionSerializer SerializeInstruction(const Instruction& instr) { } case Opcode::DeviceCopy: { // Number of fields = 4 - fields.assign({instr.src, instr.src_device_type, instr.dst_device_type, instr.dst}); + fields.assign({instr.device_copy.src, instr.device_copy.src_device_index, + instr.device_copy.dst_device_index, instr.dst}); break; } default: @@ -504,7 +523,7 @@ void Executable::SaveCodeSection(dmlc::Stream* strm) { for (const auto& func : this->functions) { // Save the function info. VMFunctionSerializer func_format(func.name, func.register_file_size, func.instructions.size(), - func.params, func.params_device_type); + func.params, func.param_device_indexes); func_format.Save(strm); // Serialize each instruction. @@ -564,6 +583,9 @@ runtime::Module Executable::Load(const std::string& code, const runtime::Module // Load header. LoadHeader(&strm); + // Virtual devices section + exec->LoadVirtualDevicesSection(&strm); + // Global section. exec->LoadGlobalSection(&strm); @@ -579,6 +601,12 @@ runtime::Module Executable::Load(const std::string& code, const runtime::Module return runtime::Module(exec); } +void Executable::LoadVirtualDevicesSection(dmlc::Stream* strm) { + STREAM_CHECK(strm->Read(&virtual_devices), "virtual_device"); + STREAM_CHECK(strm->Read(&host_device_index), "virtual_device"); + ICHECK(host_device_index >= 0 && host_device_index < static_cast(virtual_devices.size())); +} + void Executable::LoadGlobalSection(dmlc::Stream* strm) { std::vector globals; STREAM_CHECK(strm->Read(&globals), "global"); @@ -597,14 +625,15 @@ void Executable::LoadConstantSection(dmlc::Stream* strm) { for (size_t i = 0; i < size; i++) { runtime::NDArray constant; STREAM_CHECK(constant.Load(strm), "constant"); - this->constants.push_back(constant); + this->constants.emplace_back(std::move(constant)); } - // Load the const to device mapping. - std::vector const_device_type; - STREAM_CHECK(strm->Read(&const_device_type), "constant"); - ICHECK_EQ(size, const_device_type.size()); - this->const_device_type = const_device_type; + // Load the const to device index mapping. 
+ std::vector const_device_indexes; + const_device_indexes.reserve(size); + STREAM_CHECK(strm->Read(&const_device_indexes), "constant"); + ICHECK_EQ(size, const_device_indexes.size()); + this->const_device_indexes = std::move(const_device_indexes); } void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) { @@ -846,8 +875,9 @@ void Executable::LoadCodeSection(dmlc::Stream* strm) { } // Create the VM function. - VMFunction vm_func = VMFunction(loaded_func.name, loaded_func.params, instructions, - loaded_func.register_file_size, loaded_func.params_device_type); + VMFunction vm_func = + VMFunction(loaded_func.name, loaded_func.params, instructions, + loaded_func.register_file_size, loaded_func.param_device_indexes); auto it = this->global_map.find(loaded_func.name); ICHECK(it != this->global_map.end()); ICHECK_LE(it->second, this->global_map.size()); diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index cd2d1332580b4..e5afb0e4b1fcc 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -101,12 +101,10 @@ void VirtualMachineDebug::LoadExecutable(const Executable* exec) { void VirtualMachineDebug::OpStartHook(Instruction instr) { if (prof_ && prof_.operator*().IsRunning()) { if (instr.op == Opcode::LoadConst) { - Device dev = GetDevice(exec_->const_device_type[instr.const_index]); + Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]); prof_.operator*().StartCall("VM::LoadConst", dev, {}); } else if (instr.op == Opcode::DeviceCopy) { - Device dst_dev; - dst_dev.device_type = static_cast(instr.dst_device_type); - dst_dev.device_id = 0; + Device dst_dev = GetDevice(instr.device_copy.dst_device_index); prof_.operator*().StartCall("VM::DeviceCopy", dst_dev, {}); } else if (instr.op == Opcode::ReshapeTensor) { prof_.operator*().StartCall("VM::ReshapeTensor", devices_[1], {}); @@ -124,7 +122,7 @@ void VirtualMachineDebug::OpStartHook(Instruction instr) { } else if (instr.op == Opcode::AllocTensorReg) { auto storage_obj = ReadRegister(instr.alloc_tensor_reg.storage); auto storage = Downcast(storage_obj); - Device cpu_dev = GetDevice(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(exec_->host_device_index); auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); NDArray shape_tensor = Downcast(shape_obj).CopyTo(cpu_dev); prof_.operator*().StartCall( @@ -135,8 +133,8 @@ void VirtualMachineDebug::OpStartHook(Instruction instr) { auto size = LoadScalarInt(instr.alloc_storage.allocation_size); std::ostringstream shape; shape << DLDataType2String(instr.alloc_storage.dtype_hint) << "[" << size << "]"; - prof_.operator*().StartCall("VM::AllocStorage", - {static_cast(instr.alloc_storage.device_type), 0}, + Device dev = GetDevice(instr.alloc_storage.device_index); + prof_.operator*().StartCall("VM::AllocStorage", dev, {{"VM::Argument Shapes", String(shape.str())}}); } else { prof_.operator*().StartCall("VM::UnknownOp", devices_[1], {}); diff --git a/src/runtime/vm/serialize_utils.h b/src/runtime/vm/serialize_utils.h index b4a10806caaf5..04a79c9b0210d 100644 --- a/src/runtime/vm/serialize_utils.h +++ b/src/runtime/vm/serialize_utils.h @@ -58,19 +58,19 @@ struct VMFunctionSerializer { size_t num_instructions; /*! \brief The parameters of the VMFunction. */ std::vector params; - /*! \brief The device type of each parameter of the VMFunction. */ - std::vector params_device_type; + /*! \brief The index for the devices holding each parameter of the VMFunction. 
*/ + std::vector param_device_indexes; VMFunctionSerializer() = default; VMFunctionSerializer(const std::string& name, Index register_file_size, size_t num_instructions, const std::vector& params, - const std::vector& params_device_type) + const std::vector& param_device_indexes) : name(name), register_file_size(register_file_size), num_instructions(num_instructions), params(params), - params_device_type(params_device_type) {} + param_device_indexes(param_device_indexes) {} /*! * \brief Load the serialized function header. @@ -87,7 +87,7 @@ struct VMFunctionSerializer { // Get the number of instructions. num_instructions = static_cast(std::stoll(func_info[2])); if (!strm->Read(¶ms)) return false; - if (!strm->Read(¶ms_device_type)) return false; + if (!strm->Read(¶m_device_indexes)) return false; return true; } @@ -102,7 +102,7 @@ struct VMFunctionSerializer { func_info.push_back(std::to_string(num_instructions)); strm->Write(func_info); strm->Write(params); - strm->Write(params_device_type); + strm->Write(param_device_indexes); } }; diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index b903f793d799f..05adf1d69e8d6 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -232,12 +232,11 @@ void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) { const auto& param_names = vm_func.params; ICHECK_EQ(args.size() - offset, param_names.size()) << "The number of provided parameters doesn't match the number of arguments"; - ICHECK_EQ(param_names.size(), vm_func.params_device_type.size()) + ICHECK_EQ(param_names.size(), vm_func.param_device_indexes.size()) << "The number of provided parameters doesn't match the number of assigned devices"; std::vector func_args(param_names.size()); for (int i = offset; i < args.size(); ++i) { - Index device_type = vm_func.params_device_type[i - offset]; - Device dev = GetDevice(device_type); + Device dev = GetDevice(vm_func.param_device_indexes[i - offset]); if (args[i].type_code() == kTVMDLTensorHandle) { // Automatically convert input DLTensors to NDArray @@ -258,13 +257,14 @@ void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) { inputs_.emplace(func_name, func_args); } -inline Device VirtualMachine::GetDevice(Index device_type) const { - ICHECK_GE(devices_.size(), device_type) << "devices_ doesn't contain device:" << device_type; +inline Device VirtualMachine::GetDevice(Index device_index) const { + ICHECK_GE(devices_.size(), device_index) << "invalid device index: " << device_index; + return devices_[device_index]; +} - auto dev = devices_[device_type]; - ICHECK_EQ(static_cast(dev.device_type), device_type) - << "device type " << device_type << " has not been initialized in the device list."; - return dev; +inline Allocator* VirtualMachine::GetAllocator(Index device_index) const { + ICHECK_GE(allocators_.size(), device_index) << "invalid device index: " << device_index; + return allocators_[device_index]; } void VirtualMachine::PushFrame(Index arg_count, Index ret_pc, const VMFunction& vm_func) { @@ -297,7 +297,12 @@ void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { - VLOG(2) << "Executing Function: " << std::endl << func; + DLOG(INFO) << "Executing Function: " << std::endl << func; + for (int i = 0; i < static_cast(devices_.size()); ++i) { + DLOG(INFO) << "Device " << i << " has device type " << devices_[i].device_type + << " and device id " << devices_[i].device_id + << (i == exec_->host_device_index ? 
" (using as host device)" : ""); + } InvokeGlobal(func, args); RunLoop(); @@ -383,19 +388,31 @@ void VirtualMachine::LoadExecutable(const Executable* exec) { } } -void VirtualMachine::Init(const std::vector& devs, +void VirtualMachine::Init(const std::vector& physical_devices, const std::vector& alloc_types) { - ICHECK_EQ(devs.size(), alloc_types.size()); - // Cache the device - for (size_t i = 0; i < devs.size(); i++) { - auto dev_type = static_cast(devs[i].device_type); - auto alloc = MemoryManager::GetOrCreateAllocator(devs[i], alloc_types[i]); - if (devices_.size() <= dev_type) { - devices_.resize(dev_type + 1); - allocators_.resize(dev_type + 1); - } - devices_[dev_type] = devs[i]; - allocators_[dev_type] = alloc; + ICHECK_EQ(physical_devices.size(), alloc_types.size()); + + // Find a physical device to represent each virtual device the VM code requires. + // (Recall the VM instructions refer to devices by "device index" into this vector of + // virtual devices.) + const size_t num_virtual_devices = exec_->virtual_devices.size(); + devices_.reserve(num_virtual_devices); + allocators_.reserve(num_virtual_devices); + + for (size_t device_index = 0; device_index < num_virtual_devices; ++device_index) { + // We'll retain the legacy behaviour and just match by device type. + // TODO(mbs): Generalize. + DLDeviceType virtual_device_type = exec_->virtual_devices[device_index].device_type; + auto itr = std::find_if(physical_devices.begin(), physical_devices.end(), + [virtual_device_type](const Device& physical_device) { + return physical_device.device_type == virtual_device_type; + }); + CHECK(itr != physical_devices.end()) + << "Unable to find a physical device (from among the " << physical_devices.size() + << " given) to match the virtual device with device type " << virtual_device_type; + const size_t i = std::distance(physical_devices.begin(), itr); + devices_.push_back(*itr); + allocators_.push_back(MemoryManager::GetOrCreateAllocator(*itr, alloc_types[i])); } } @@ -408,7 +425,7 @@ ObjectRef VirtualMachine::ReadRegister(Index r) const { return frames_.back().re int64_t VirtualMachine::LoadScalarInt(Index r) const { int64_t result = 0; const auto& obj = ReadRegister(r); - NDArray array = Downcast(CopyTo(obj, {kDLCPU, 0})); + NDArray array = Downcast(CopyTo(obj, GetDevice(exec_->host_device_index))); switch (array->dtype.bits) { case 1: { @@ -473,7 +490,7 @@ void VirtualMachine::RunLoop() { } if (!const_pool_[instr.const_index].defined()) { - Device dev = GetDevice(exec_->const_device_type[instr.const_index]); + Device dev = GetDevice(exec_->const_device_indexes[instr.const_index]); const_pool_[instr.const_index] = CopyTo(constant_obj, dev); } WriteRegister(instr.dst, const_pool_[instr.const_index]); @@ -484,7 +501,7 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::LoadConsti: { - auto tensor = NDArray::Empty({1}, {kDLInt, 64, 1}, {kDLCPU, 0}); + auto tensor = NDArray::Empty({1}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index)); reinterpret_cast(tensor->data)[0] = instr.load_consti.val; WriteRegister(instr.dst, tensor); pc_++; @@ -544,7 +561,7 @@ void VirtualMachine::RunLoop() { auto object = ReadRegister(instr.get_tag.object); const auto& adt = Downcast(object); auto tag = adt.tag(); - auto tag_tensor = NDArray::Empty({1}, {kDLInt, 32, 1}, {kDLCPU, 0}); + auto tag_tensor = NDArray::Empty({1}, {kDLInt, 32, 1}, GetDevice(exec_->host_device_index)); reinterpret_cast(tag_tensor->data)[0] = tag; WriteRegister(instr.dst, tag_tensor); pc_++; @@ -600,7 +617,7 @@ void 
VirtualMachine::RunLoop() { } case Opcode::AllocTensorReg: { OpStartHook(instr); - Device cpu_dev = GetDevice(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(exec_->host_device_index); auto shape_obj = ReadRegister(instr.alloc_tensor_reg.shape_register); NDArray shape_tensor = Downcast(CopyTo(shape_obj, cpu_dev)); auto shape = ToShape(shape_tensor); @@ -637,16 +654,15 @@ void VirtualMachine::RunLoop() { OpStartHook(instr); auto size = LoadScalarInt(instr.alloc_storage.allocation_size); auto alignment = instr.alloc_storage.alignment; + auto storage_obj = SimpleObjAllocator().make_object(); - auto dev_type = instr.alloc_storage.device_type; - ICHECK_LT(static_cast(dev_type), allocators_.size()) - << "Memory allocator for device " << dev_type << " has not been initialized"; - auto* alloc = allocators_[dev_type]; - ICHECK(alloc) << "Did you forget to init the VirtualMachine with devices?"; + Allocator* allocator = GetAllocator(instr.alloc_storage.device_index); + ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?"; VLOG(2) << "AllocStorage: allocation_size=" << size << ", alignment=" << alignment << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) - << ", device_type=" << instr.alloc_storage.device_type; - storage_obj->buffer = alloc->Alloc(size, alignment, instr.alloc_storage.dtype_hint); + << ", device_index=" << instr.alloc_storage.device_index; + + storage_obj->buffer = allocator->Alloc(size, alignment, instr.alloc_storage.dtype_hint); Storage storage(storage_obj); WriteRegister(instr.dst, storage); OpStopHook(); @@ -657,7 +673,8 @@ void VirtualMachine::RunLoop() { auto input = ReadRegister(instr.shape_of.tensor); NDArray input_array = Downcast(input); int ndim = input_array->ndim; - auto out_tensor = NDArray::Empty({ndim}, {kDLInt, 64, 1}, {kDLCPU, 0}); + auto out_tensor = + NDArray::Empty({ndim}, {kDLInt, 64, 1}, GetDevice(exec_->host_device_index)); for (int i = 0; i < ndim; ++i) { reinterpret_cast(out_tensor->data)[i] = input_array->shape[i]; } @@ -682,7 +699,7 @@ void VirtualMachine::RunLoop() { } case Opcode::ReshapeTensor: { OpStartHook(instr); - Device cpu_dev = GetDevice(static_cast(kDLCPU)); + Device cpu_dev = GetDevice(exec_->host_device_index); auto tensor_obj = ReadRegister(instr.reshape_tensor.tensor); NDArray tensor_arr = Downcast(tensor_obj); // Read the shape from shape tensor @@ -703,14 +720,13 @@ void VirtualMachine::RunLoop() { } case Opcode::DeviceCopy: { OpStartHook(instr); - auto tensor_src = ReadRegister(instr.src); + auto tensor_src = ReadRegister(instr.device_copy.src); NDArray src_data = Downcast(tensor_src); - Device src_dev = src_data->device; - ICHECK_EQ(static_cast(src_dev.device_type), instr.src_device_type); - - Device dst_dev; - dst_dev.device_type = static_cast(instr.dst_device_type); - dst_dev.device_id = 0; + Device actual_src_dev = src_data->device; + Device inst_src_dev = GetDevice(instr.device_copy.src_device_index); + ICHECK_EQ(actual_src_dev.device_type, inst_src_dev.device_type); + ICHECK_EQ(actual_src_dev.device_id, inst_src_dev.device_id); + Device dst_dev = GetDevice(instr.device_copy.dst_device_index); NDArray dst_data = src_data.CopyTo(dst_dev); WriteRegister(instr.dst, dst_data); diff --git a/tests/cpp/relay/transforms/device_domains_test.cc b/tests/cpp/relay/transforms/device_domains_test.cc index 8f263c3b3273b..5df7984d003a9 100644 --- a/tests/cpp/relay/transforms/device_domains_test.cc +++ b/tests/cpp/relay/transforms/device_domains_test.cc @@ -45,24 +45,32 @@ IRModule TestModule() { } 
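For orientation, the VM-side changes above can be summarized from the user's perspective with a short Python sketch. The sketch is illustrative, not part of this patch, and assumes a CUDA-enabled build: the physical devices supplied when the executor is created are matched to the executable's virtual devices by device type in VirtualMachine::Init, and the recorded host_device_index replaces the old hard-coded kDLCPU lookups for shape computations.

import numpy as np
import tvm
from tvm import relay

# Hypothetical toy module; after device planning and memory manifestation the
# compiled executable carries a virtual device table instead of raw device types.
x = relay.var("x", shape=(5, 7))
mod = tvm.IRModule.from_expr(relay.Function([x], relay.add(x, x)))

host = tvm.target.Target("llvm")
target = tvm.target.Target("cuda").with_host(host)

with tvm.transform.PassContext(opt_level=3):
    # The device passed here is a physical device; the VM resolves the
    # executable's virtual devices (and host_device_index) against the
    # supplied devices by matching device types, as in VirtualMachine::Init.
    compiled = relay.create_executor(
        "vm", mod=mod, device=tvm.device("cuda"), target=target
    ).evaluate()

print(compiled(np.random.rand(5, 7).astype("float32")))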
TEST(DeviceDomains, SmokeTest) { - DeviceDomains domains; + SEScope cpu = SEScope::ForDeviceType(kDLCPU); + SEScope cuda = SEScope::ForDeviceType(kDLCUDA); + TargetMap target_map; + target_map.Set(Integer(static_cast(kDLCPU)), Target("llvm")); + target_map.Set(Integer(static_cast(kDLCUDA)), Target("cuda")); + transform::PassContext ctxt = transform::PassContext::Create(); + CompilationConfig config(ctxt, target_map, /*optional_host_target=*/{}); + DeviceDomains domains(config); IRModule mod = TestModule(); Function f = Downcast(mod->Lookup("f")); DeviceDomainPtr actual_add_domain = domains.DomainForCallee(Downcast(f->body)); DeviceDomainPtr x_domain = domains.DomainFor(f->params[0]); DeviceDomainPtr y_domain = domains.DomainFor(f->params[1]); - DeviceDomainPtr result_domain = DeviceDomains::Free(f->ret_type); + DeviceDomainPtr result_domain = domains.Free(f->ret_type); std::vector arg_and_results; arg_and_results.push_back(x_domain); arg_and_results.push_back(y_domain); arg_and_results.push_back(result_domain); - DeviceDomainPtr implied_add_domain = DeviceDomains::MakeDomain(std::move(arg_and_results)); - domains.Unify(actual_add_domain, implied_add_domain); - domains.Unify(x_domain, DeviceDomains::ForDeviceType(f->params[0]->checked_type(), kDLCUDA)); + DeviceDomainPtr implied_add_domain = domains.MakeHigherOrderDomain(std::move(arg_and_results)); + EXPECT_FALSE(domains.UnifyOrNull(actual_add_domain, implied_add_domain) == nullptr); + EXPECT_FALSE(domains.UnifyOrNull( + x_domain, domains.ForSEScope(f->params[0]->checked_type(), cuda)) == nullptr); - EXPECT_EQ(domains.ResultDeviceType(y_domain), kDLCUDA); - EXPECT_EQ(domains.ResultDeviceType(result_domain), kDLCUDA); + EXPECT_EQ(domains.ResultSEScope(y_domain), config->CanonicalSEScope(cuda)); + EXPECT_EQ(domains.ResultSEScope(result_domain), config->CanonicalSEScope(cuda)); } } // namespace diff --git a/tests/python/relay/op/annotation/test_annotation.py b/tests/python/relay/op/annotation/test_annotation.py index 58e559eb96809..8ba91976523a1 100644 --- a/tests/python/relay/op/annotation/test_annotation.py +++ b/tests/python/relay/op/annotation/test_annotation.py @@ -26,14 +26,17 @@ def test_on_device_via_string(): assert isinstance(call, relay.Call) assert len(call.args) == 1 assert call.args[0] == x - assert call.attrs.device_type == 2 # ie kDLCUDA + assert call.attrs.se_scope.device_type_int == 2 # ie kDLCUDA + assert call.attrs.se_scope.virtual_device_id == 0 + assert call.attrs.se_scope.target is None + assert call.attrs.se_scope.memory_scope == "" assert not call.attrs.is_fixed def test_on_device_via_device(): x = relay.Var("x") - call = relay.annotation.on_device(x, tvm.device("llvm")) - assert call.attrs.device_type == 1 # ie kDLCPU + call = relay.annotation.on_device(x, tvm.device("cpu")) + assert call.attrs.se_scope.device_type_int == 1 # ie kDLCPU def test_on_device_invalid_device(): @@ -44,7 +47,7 @@ def test_on_device_invalid_device(): def test_on_device_is_fixed(): x = relay.Var("x") call = relay.annotation.on_device(x, "cuda", True) - assert call.attrs.device_type == 2 + assert call.attrs.se_scope.device_type_int == 2 # ie kDLCUDA assert call.attrs.is_fixed @@ -54,15 +57,13 @@ def test_function_on_device(): f = relay.Function([x, y], relay.add(x, y)) func = relay.annotation.function_on_device(f, ["cpu", "cuda"], "cuda") assert isinstance(func, relay.Function) - assert len(func.attrs["param_device_types"]) == 2 - assert func.attrs["param_device_types"][0] == 1 # ie kDLCPU - assert func.attrs["param_device_types"][1] == 2 # ie 
kDLCUDA - assert func.attrs["result_device_type"] == 2 # ie KDLCUDA + assert len(func.attrs["param_se_scopes"]) == 2 + assert func.attrs["param_se_scopes"][0].device_type_int == 1 # ie kDLCPU + assert func.attrs["param_se_scopes"][1].device_type_int == 2 # ie kDLCUDA + assert func.attrs["result_se_scope"].device_type_int == 2 # ie kDLCUDA if __name__ == "__main__": - test_on_device_via_string() - test_on_device_via_device() - test_on_device_invalid_device() - test_on_device_is_fixed() - test_function_on_device() + import sys + + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/op/test_tensor.py b/tests/python/relay/op/test_tensor.py new file mode 100644 index 0000000000000..4d2c1766972ab --- /dev/null +++ b/tests/python/relay/op/test_tensor.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Unit tests for tensor helpers.""" +import tvm +from tvm import relay +import pytest + + +def test_device_copy_via_string(): + x = relay.var("x") + call = relay.op.device_copy(x, "cuda", "cpu") + assert isinstance(call, relay.Call) + assert len(call.args) == 1 + assert call.args[0] == x + assert call.attrs.src_se_scope.device_type_int == 2 # ie kDLCUDA + assert call.attrs.src_se_scope.virtual_device_id == 0 + assert call.attrs.src_se_scope.target is None + assert call.attrs.src_se_scope.memory_scope == "" + assert call.attrs.dst_se_scope.device_type_int == 1 # ie kDLCPU + assert call.attrs.dst_se_scope.virtual_device_id == 0 + assert call.attrs.dst_se_scope.target is None + assert call.attrs.dst_se_scope.memory_scope == "" + + +def test_device_copy_via_device(): + x = relay.var("x") + call = relay.op.device_copy(x, tvm.device("cuda"), tvm.device("cpu")) + assert isinstance(call, relay.Call) + assert len(call.args) == 1 + assert call.args[0] == x + assert call.attrs.src_se_scope.device_type_int == 2 # ie kDLCUDA + assert call.attrs.dst_se_scope.device_type_int == 1 # ie kDLCPU + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/test_pass_plan_devices.py b/tests/python/relay/test_pass_plan_devices.py index e3218ab1a8299..37eb1a2d6456f 100644 --- a/tests/python/relay/test_pass_plan_devices.py +++ b/tests/python/relay/test_pass_plan_devices.py @@ -26,18 +26,36 @@ import tvm.testing import numpy as np -CPU = tvm.device("cpu") # device_type=1 -GPU = tvm.device("cuda") # device_type=2 +HOST_DEVICE = tvm.device("cpu") +HOST_TARGET = tvm.target.Target("llvm") + +CPU_DEVICE = tvm.device("cpu") +CPU_TARGET = tvm.target.Target("llvm").with_host(HOST_TARGET) + +GPU_DEVICE = tvm.device("cuda") +GPU_TARGET = tvm.target.Target("cuda").with_host(HOST_TARGET) + +TARGETS = { + tvm.tir.IntImm("int32", CPU_DEVICE.device_type): CPU_TARGET, + tvm.tir.IntImm("int32",
GPU_DEVICE.device_type): GPU_TARGET, +} + +HOST = tvm.target.make_se_scope(HOST_DEVICE, HOST_TARGET) # device_type=1 +CPU = tvm.target.make_se_scope(CPU_DEVICE, CPU_TARGET) # device_type=1 +GPU = tvm.target.make_se_scope(GPU_DEVICE, GPU_TARGET) # device_type=2 DEFAULT = GPU +CTXT = tvm.transform.PassContext(config={"relay.fallback_device_type": DEFAULT.device_type_int}) + core = tvm.IRModule() core.import_from_std("core.rly") def rewrite_and_assert(in_mod, expected_mod): """Manually run the pass and assert it's structurally equals to the expected.""" + config = tvm.target.make_compilation_config(CTXT, TARGETS, HOST_TARGET) actual_mod = relay.transform.InferType()(in_mod) - actual_mod = relay.transform.PlanDevices(DEFAULT)(actual_mod) + actual_mod = relay.transform.PlanDevices(config)(actual_mod) actual_mod = relay.transform.InferType()(actual_mod) expected_mod = relay.transform.InferType()(expected_mod) if not tvm.ir.structural_equal(actual_mod, expected_mod, True): @@ -59,7 +77,9 @@ def eval_and_assert(in_mod: tvm.IRModule, reference_func, args): print("Not evaluating since GPU is not available") return with tvm.transform.PassContext(opt_level=3): - compiled = relay.create_executor("vm", mod=in_mod, device=GPU, target="cuda").evaluate() + compiled = relay.create_executor( + "vm", mod=in_mod, device=GPU_DEVICE, target=GPU_TARGET + ).evaluate() actual = compiled(*args).numpy() expected = reference_func(*args) tvm.testing.assert_allclose(actual, expected) @@ -85,9 +105,11 @@ def exercise(in_mod: tvm.IRModule, expected_mod: tvm.IRModule, reference_func, a def test_plain(): + metatable = {"SEScope": [CPU, GPU]} + # Everything defaults to GPU def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], @@ -96,21 +118,28 @@ def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %1 = add(%c, %d); subtract(%0, %1) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[2, 2, 2, 2], result_device_type=2) { + param_se_scopes=[meta[SEScope][1], meta[SEScope][1], meta[SEScope][1], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) { %0 = add(%a, %b); %1 = add(%c, %d); subtract(%0, %1) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -120,35 +149,44 @@ def ref(a, b, c, d): def test_left_add_on_cpu(): + metatable = {"SEScope": [CPU, GPU]} + # Force some args to be on CPU, rest default to GPU. 
def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { %0 = add(%a, %b); - %1 = on_device(%0, device_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][0]); %2 = add(%c, %d); subtract(%1, %2) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 2, 2], result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][1], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) { %0 = add(%a, %b); - %1 = on_device(%0, device_type=1, is_fixed=True); - %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %2 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %3 = add(%c, %d); subtract(%2, %3) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -158,35 +196,44 @@ def ref(a, b, c, d): def test_left_add_on_cpu_via_copy(): + metatable = {"SEScope": [CPU, GPU]} + # As for test_left_add_on_cpu, but with an explicit device_copy. def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { %0 = add(%a, %b); - %1 = device_copy(%0, src_dev_type=1, dst_dev_type=2); + %1 = device_copy(%0, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %2 = add(%c, %d); subtract(%1, %2) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 2, 2], result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][1], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) { %0 = add(%a, %b); - %1 = on_device(%0, device_type=1, is_fixed=True); - %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %2 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %3 = add(%c, %d); subtract(%2, %3) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -196,37 +243,46 @@ def ref(a, b, c, d): def test_both_adds_on_cpu(): + metatable = {"SEScope": [CPU, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { %0 = add(%a, %b); %1 = add(%c, %d); - %2 = on_device(%0, device_type=1); - %3 = on_device(%1, device_type=1); + %2 = on_device(%0, se_scope=meta[SEScope][0]); + %3 = on_device(%1, se_scope=meta[SEScope][0]); subtract(%2, %3) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 1, 1], 
result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][0], meta[SEScope][0]], + result_se_scope=meta[SEScope][1]) { %0 = add(%a, %b); - %1 = on_device(%0, device_type=1, is_fixed=True); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); %2 = add(%c, %d); - %3 = on_device(%2, device_type=1, is_fixed=True); - %4 = device_copy(%1, src_dev_type=1, dst_dev_type=2); - %5 = device_copy(%3, src_dev_type=1, dst_dev_type=2); + %3 = on_device(%2, se_scope=meta[SEScope][0], is_fixed=True); + %4 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); + %5 = device_copy(%3, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); subtract(%4, %5) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -236,34 +292,42 @@ def ref(a, b, c, d): def test_sharing(): + metatable = {"SEScope": [CPU, GPU]} + # The same add sub-expression is annotated twice. def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) { %0 = add(%a, %b); - %1 = on_device(%0, device_type=1); - %2 = on_device(%0, device_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][0]); + %2 = on_device(%0, se_scope=meta[SEScope][0]); subtract(%1, %2) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], - param_device_types=[1, 1], result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { %0 = add(%a, %b); - %1 = on_device(%0, device_type=1, is_fixed=True); - %2 = on_device(%0, device_type=1, is_fixed=True); - %3 = device_copy(%1, src_dev_type=1, dst_dev_type=2); - %4 = device_copy(%2, src_dev_type=1, dst_dev_type=2); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %2 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %3 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); + %4 = device_copy(%2, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); subtract(%3, %4) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b): @@ -274,35 +338,44 @@ def ref(a, b): def test_let_on_cpu(): + metatable = {"SEScope": [CPU, GPU]} + # The device for a let-bound expression can flow from uses of the let-bound var. 
def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { let %l = add(%a, %b); let %r = add(%c, %d); - %0 = on_device(%l, device_type=1); + %0 = on_device(%l, se_scope=meta[SEScope][0]); subtract(%0, %r) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 2, 2], result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][1], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) { %0 = add(%a, %b); - let %l = on_device(%0, device_type=1, is_fixed=True); + let %l = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); let %r = add(%c, %d); - %1 = device_copy(%l, src_dev_type=1, dst_dev_type=2); + %1 = device_copy(%l, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); subtract(%1, %r) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -312,39 +385,49 @@ def ref(a, b, c, d): def test_func_param_on_cpu(): + metatable = {"SEScope": [CPU, GPU]} + # Devices for function parameters flow to call sites. def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { let %f = fn (%x, %y) { %0 = add(%x, %y); - on_device(%0, device_type=1) + on_device(%0, se_scope=meta[SEScope][0]) }; %1 = %f(%a, %b); %2 = add(%c, %d); subtract(%1, %2) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 1, 1], result_device_type=1) { - let %f = fn (%x, %y, param_device_types=[1, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][0], meta[SEScope][0]], + result_se_scope=meta[SEScope][0]) { + let %f = fn (%x, %y, + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { add(%x, %y) }; %0 = %f(%a, %b); %1 = add(%c, %d); subtract(%0, %1) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -354,9 +437,11 @@ def ref(a, b, c, d): def test_func_result_on_cpu(): + metatable = {"SEScope": [CPU, GPU]} + # Devices for call sites flow to function results. 
def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], @@ -365,30 +450,38 @@ def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], add(%x, %y) }; %0 = %f(%a, %b); - %1 = on_device(%0, device_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][0]); %2 = add(%c, %d); subtract(%1, %2) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 2, 2], result_device_type=2) { - let %f = fn (%x, %y, param_device_types=[1, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][1], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) { + let %f = fn (%x, %y, + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { add(%x, %y) }; %1 = %f(%a, %b); - %2 = on_device(%1, device_type=1, is_fixed=True); - %3 = device_copy(%2, src_dev_type=1, dst_dev_type=2); + %2 = on_device(%1, se_scope=meta[SEScope][0], is_fixed=True); + %3 = device_copy(%2, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %4 = add(%c, %d); subtract(%3, %4) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -398,15 +491,17 @@ def ref(a, b, c, d): def test_higher_order(): + metatable = {"SEScope": [CPU, GPU]} + # The constraint on %a flows back to %y via %f and %h def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { let %f = fn (%g) { fn (%a) { - %0 = on_device(%a, device_type=1); + %0 = on_device(%a, se_scope=meta[SEScope][0]); %1 = %g(%0); add(%1, %x) } @@ -418,30 +513,36 @@ def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %3 = %2(%y); subtract(%x, %3) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], - param_device_types=[2, 1], result_device_type=2) { - let %f = fn (%g, param_device_types=[2], result_device_type=2) { - fn (%a, param_device_types=[1], result_device_type=2) { - %0 = device_copy(%a, src_dev_type=1, dst_dev_type=2); + param_se_scopes=[meta[SEScope][1], meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { + let %f = fn (%g, param_se_scopes=[meta[SEScope][1]], result_se_scope=meta[SEScope][1]) { + fn (%a, param_se_scopes=[meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { + %0 = device_copy(%a, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %1 = %g(%0); add(%1, %x) } }; - let %h = fn (%b, param_device_types=[2], result_device_type=2) { + let %h = fn (%b, param_se_scopes=[meta[SEScope][1]], result_se_scope=meta[SEScope][1]) { negative(%b) }; %2 = %f(%h); %3 = %2(%y); subtract(%x, %3) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y): @@ -457,14 +558,16 @@ def h(b): def test_function_in_tuple(): + metatable = {"SEScope": [CPU, GPU]} + # Since %f ends up in a tuple its argument and result is forced to be on the CPU def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { let %f = fn (%a: Tensor[(5, 
7), float32], %b: Tensor[(5, 7), float32]) { - %0 = on_device(%b, device_type=1); + %0 = on_device(%b, se_scope=meta[SEScope][0]); add(%a, %0) }; let %t = (%f, %x); @@ -472,17 +575,20 @@ def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %2 = %t.0; %2(%1, %y) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], - param_device_types=[1, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { let %f = fn (%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], - param_device_types=[1, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { add(%a, %b) }; let %t = (%f, %x); @@ -490,7 +596,10 @@ def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %1 = %t.0; %1(%0, %y) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y): @@ -501,14 +610,14 @@ def ref(x, y): def test_device_copy(): const = rand((5, 7)) - metatable = {"relay.Constant": [relay.const(const)]} + metatable = {"SEScope": [CPU, GPU], "relay.Constant": [relay.const(const)]} def input(): return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32]) { - %0 = device_copy(%x, src_dev_type=1, dst_dev_type=2); + %0 = device_copy(%x, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); add(%0, meta[relay.Constant][0]) } """, @@ -521,8 +630,9 @@ def expected(): return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%x: Tensor[(5, 7), float32], param_device_types=[1], result_device_type=2) { - %0 = device_copy(%x, src_dev_type=1, dst_dev_type=2); + def @main(%x: Tensor[(5, 7), float32], + param_se_scopes=[meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { + %0 = device_copy(%x, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); add(%0, meta[relay.Constant][0]) } """, @@ -538,31 +648,37 @@ def ref(x): def test_shape_func(): + metatable = {"SEScope": [HOST, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(?), float32], %s: Tensor[(1), int64]) { %0 = fn (%y: Tensor[(?), float32]) { nn.relu(%y) }; - let %p = on_device(%0, device_type=2, is_fixed=True); - %1 = on_device(%x, device_type=2, is_fixed=True); + let %p = on_device(%0, se_scope=meta[SEScope][1], is_fixed=True); + %1 = on_device(%x, se_scope=meta[SEScope][1], is_fixed=True); %2 = vm.shape_of(%1, dtype="int64"); %3 = (%2,); %4 = (%s,); vm.shape_func(%p, %3, %4, is_input=[False]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(?), float32], %s: Tensor[(1), int64], - param_device_types=[2, 1], result_device_type=1) { - let %p = fn (%y: Tensor[(?), float32], param_device_types=[2], result_device_type=2) { + param_se_scopes=[meta[SEScope][1], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { + let %p = fn (%y: Tensor[(?), float32], + param_se_scopes=[meta[SEScope][1]], result_se_scope=meta[SEScope][1]) { nn.relu(%y) }; %1 = vm.shape_of(%x, dtype="int64"); @@ -570,7 +686,10 @@ def @main(%x: Tensor[(?), float32], %s: Tensor[(1), int64], %3 = (%s,); vm.shape_func(%p, %2, %3, is_input=[False]) } - """ + """, + "from_string", + None, + metatable, ) # Don't try to execute, too fiddly to setup. 
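Each input()/expected() pair in these device-planning tests is exercised the same way: parse both modules against the SEScope metatable, run the PlanDevices pass over the input, and compare the result structurally with the expected module. A minimal sketch of that check, assuming PlanDevices is invoked with the default SEScope as in this patch (the helper name is hypothetical; the real harness lives in the test file's preamble, outside these hunks):

    import tvm
    from tvm import relay

    def check_planning(input_mod, expected_mod, default_se_scope):
        # Hypothetical helper: run device planning on the parsed input module,
        # then re-infer types so it is comparable with the expected module.
        with tvm.transform.PassContext(opt_level=3):
            actual_mod = relay.transform.PlanDevices(default_se_scope)(input_mod)
            actual_mod = relay.transform.InferType()(actual_mod)
        tvm.ir.assert_structural_equal(actual_mod, expected_mod, map_free_vars=True)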
@@ -578,28 +697,37 @@ def @main(%x: Tensor[(?), float32], %s: Tensor[(1), int64], def test_shape_of(): + metatable = {"SEScope": [HOST, GPU]} + # We need to use is_fixed=True in the on_device call so that the tensor will be on the GPU. Otherwise the # result defaults to the result device for @main which is the CPU, thus forcing a copy. # TODO(mbs): Perhaps the defaulting heuristics are being too clever? def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(?, ?), float32]) { - %0 = on_device(%x, device_type=2, is_fixed=True); + %0 = on_device(%x, se_scope=meta[SEScope][1], is_fixed=True); vm.shape_of(%0, dtype="int64") } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%x: Tensor[(?, ?), float32], param_device_types=[2], result_device_type=1) { + def @main(%x: Tensor[(?, ?), float32], + param_se_scopes=[meta[SEScope][1]], result_se_scope=meta[SEScope][0]) { vm.shape_of(%x, dtype="int64") } - """ + """, + "from_string", + None, + metatable, ) def ref(x): @@ -609,28 +737,33 @@ def ref(x): def test_alloc_storage(): + metatable = {"SEScope": [HOST, GPU]} + def input(): return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%size: int64, %alignment: int64) { - memory.alloc_storage(%size, %alignment, device_id=0, device_type=2) + memory.alloc_storage(%size, %alignment, se_scope=meta[SEScope][1]) } """, "from_string", core, + metatable, ) def expected(): return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%size: int64, %alignment: int64, param_device_types=[1, 1], result_device_type=2) { - memory.alloc_storage(%size, %alignment, device_id=0, device_type=2) + def @main(%size: int64, %alignment: int64, + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { + memory.alloc_storage(%size, %alignment, se_scope=meta[SEScope][1]) } """, "from_string", core, + metatable, ) # Don't try to execute, too fiddly to setup. 
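The CPU, GPU, and HOST values bound into each metatable are SEScope objects constructed once near the top of the test file (outside the hunks shown here). A sketch of plausible definitions, assuming an SEScope can be built from a device/target pair via a make_se_scope helper (the helper name and exact targets are assumptions, shown only to make the meta[SEScope][...] references concrete):

    import tvm

    HOST_TARGET = tvm.target.Target("llvm")
    CPU_TARGET = tvm.target.Target("llvm", host="llvm")
    GPU_TARGET = tvm.target.Target("cuda", host="llvm")

    # An SEScope pairs a (virtual) device with the target used to compile for it.
    CPU = tvm.target.make_se_scope(tvm.cpu(), CPU_TARGET)    # device_type=1
    GPU = tvm.target.make_se_scope(tvm.cuda(), GPU_TARGET)   # device_type=2
    HOST = tvm.target.make_se_scope(tvm.cpu(), HOST_TARGET)  # holds shapes/sizes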
@@ -639,7 +772,7 @@ def @main(%size: int64, %alignment: int64, param_device_types=[1, 1], result_dev def test_alloc_tensor(): shape = np.array([3, 2]) - metatable = {"relay.Constant": [relay.const(shape, dtype="int64")]} + metatable = {"SEScope": [HOST, GPU], "relay.Constant": [relay.const(shape, dtype="int64")]} def input(): return tvm.parser.parse( @@ -659,9 +792,9 @@ def expected(): return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%sto: Storage[], param_device_types=[2], result_device_type=2) { - %0 = on_device(0, device_type=1, is_fixed=True); - %1 = on_device(meta[relay.Constant][0], device_type=1, is_fixed=True); + def @main(%sto: Storage[], param_se_scopes=[meta[SEScope][1]], result_se_scope=meta[SEScope][1]) { + %0 = on_device(0, se_scope=meta[SEScope][0], is_fixed=True); + %1 = on_device(meta[relay.Constant][0], se_scope=meta[SEScope][0], is_fixed=True); memory.alloc_tensor(%sto, %0, %1, const_shape=meta[relay.Constant][0], assert_shape=[]) } """, @@ -676,7 +809,7 @@ def @main(%sto: Storage[], param_device_types=[2], result_device_type=2) { def test_reshape_tensor(): newshape = [2, 4, 2] - metatable = {"relay.Constant": [relay.const(newshape, dtype="int64")]} + metatable = {"SEScope": [HOST, GPU], "relay.Constant": [relay.const(newshape, dtype="int64")]} def input(): return tvm.parser.parse( @@ -695,8 +828,9 @@ def expected(): return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%x: Tensor[(2, 8), float32], param_device_types=[2], result_device_type=2) { - %0 = on_device(meta[relay.Constant][0], device_type=1, is_fixed=True); + def @main(%x: Tensor[(2, 8), float32], + param_se_scopes=[meta[SEScope][1]], result_se_scope=meta[SEScope][1]) { + %0 = on_device(meta[relay.Constant][0], se_scope=meta[SEScope][0], is_fixed=True); vm.reshape_tensor(%x, %0, newshape=[2, 4, 2]) } """, @@ -712,26 +846,34 @@ def ref(x): def test_dynamic_input(): + metatable = {"SEScope": [GPU]} + # There's nothing special about inferring devices for partially unknown types. 
def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x0: Tensor[(?, ?), float32], %x1: Tensor[(?, ?), float32]) { add(%x0, %x1) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x0: Tensor[(?, ?), float32], %x1: Tensor[(?, ?), float32], - param_device_types=[2, 2], result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { add(%x0, %x1) } - """ + """, + "from_string", + None, + metatable, ) def ref(x0, x1): @@ -741,35 +883,44 @@ def ref(x0, x1): def test_redundant_annotation(): + metatable = {"SEScope": [CPU, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][0]); %2 = subtract(%1, %z); - %3 = on_device(%0, device_type=1); + %3 = on_device(%0, se_scope=meta[SEScope][0]); add(%2, %3) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32], - param_device_types=[1, 1, 2], result_device_type=2) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=1, is_fixed=True); - %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2); - %3 = on_device(%0, device_type=1, is_fixed=True); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %2 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); + %3 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); %4 = subtract(%2, %z); - %5 = device_copy(%3, src_dev_type=1, dst_dev_type=2); + %5 = device_copy(%3, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); add(%4, %5) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y, z): @@ -780,31 +931,40 @@ def ref(x, y, z): def test_annotate_expr(): + metatable = {"SEScope": [CPU, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=2); + %1 = on_device(%0, se_scope=meta[SEScope][1]); %2 = subtract(%1, %z); - on_device(%2, device_type=1) + on_device(%2, se_scope=meta[SEScope][0]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32], - param_device_types=[2, 2, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][1], meta[SEScope][1], meta[SEScope][0]], + result_se_scope=meta[SEScope][0]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=2, is_fixed=True); - %2 = device_copy(%1, src_dev_type=2, dst_dev_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][1], is_fixed=True); + %2 = device_copy(%1, src_se_scope=meta[SEScope][1], dst_se_scope=meta[SEScope][0]); subtract(%2, %z) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y, z): @@ -814,17 +974,22 @@ def ref(x, y, z): def 
test_annotate_all(): + metatable = {"SEScope": [CPU, GPU]} + def input(): return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][0]); %2 = subtract(%1, %z); - on_device(%2, device_type=1) + on_device(%2, se_scope=meta[SEScope][0]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): @@ -832,11 +997,15 @@ def expected(): """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32], - param_device_types=[1, 1, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][0]], + result_se_scope=meta[SEScope][0]) { %0 = add(%x, %y); subtract(%0, %z) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y, z): @@ -858,43 +1027,52 @@ def test_conv_network(): | <--- CPU """ + metatable = {"SEScope": [CPU, GPU]} def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%data1: Tensor[(1, 64, 56, 56), float32], %data2: Tensor[(1, 64, 56, 56), float32], %weight: Tensor[(64, 64, 3, 3), float32]) { %0 = nn.conv2d(%data1, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); %1 = nn.conv2d(%data2, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); - %2 = on_device(%0, device_type=1); - %3 = on_device(%1, device_type=1); + %2 = on_device(%0, se_scope=meta[SEScope][0]); + %3 = on_device(%1, se_scope=meta[SEScope][0]); %4 = add(%2, %3); - %5 = on_device(%4, device_type=2); + %5 = on_device(%4, se_scope=meta[SEScope][1]); %6 = nn.conv2d(%5, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); - on_device(%6, device_type=1) + on_device(%6, se_scope=meta[SEScope][0]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%data1: Tensor[(1, 64, 56, 56), float32], %data2: Tensor[(1, 64, 56, 56), float32], - %weight: Tensor[(64, 64, 3, 3), float32], param_device_types=[1, 1, 1], result_device_type=1) { + %weight: Tensor[(64, 64, 3, 3), float32], + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][0]], + result_se_scope=meta[SEScope][0]) { %0 = nn.conv2d(%data1, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); - %1 = on_device(%0, device_type=1, is_fixed=True); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); %2 = nn.conv2d(%data2, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]); - %3 = on_device(%2, device_type=1, is_fixed=True); - %4 = device_copy(%1, src_dev_type=1, dst_dev_type=2); - %5 = device_copy(%3, src_dev_type=1, dst_dev_type=2); + %3 = on_device(%2, se_scope=meta[SEScope][0], is_fixed=True); + %4 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); + %5 = device_copy(%3, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %6 = add(%4, %5); - %7 = on_device(%6, device_type=2, is_fixed=True); - %8 = device_copy(%7, src_dev_type=2, dst_dev_type=1); + %7 = on_device(%6, se_scope=meta[SEScope][1], is_fixed=True); + %8 = device_copy(%7, src_se_scope=meta[SEScope][1], dst_se_scope=meta[SEScope][0]); nn.conv2d(%8, %weight, padding=[1, 1, 1, 1], channels=64, kernel_size=[3, 3]) } - """ + """, + "from_string", + None, + metatable, ) # Don't try to execute, we don't have a reference conv2d @@ -902,40 +1080,49 
@@ def @main(%data1: Tensor[(1, 64, 56, 56), float32], %data2: Tensor[(1, 64, 56, 5 def test_tuple_get_item(): + metatable = {"SEScope": [CPU, GPU]} + # Note that the device copy should be placed after projection rather than before. This is handled by # a heuristic in the pass. def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(3, 3, 4), float32]) { let %t = split(%x, indices_or_sections=3); - %0 = on_device(%t, device_type=1); - %1 = on_device(%t, device_type=1); + %0 = on_device(%t, se_scope=meta[SEScope][0]); + %1 = on_device(%t, se_scope=meta[SEScope][0]); %2 = %0.0; %3 = %1.1; %4 = subtract(%2, %3); - on_device(%4, device_type=2) + on_device(%4, se_scope=meta[SEScope][1]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%x: Tensor[(3, 3, 4), float32], param_device_types=[1], result_device_type=2) { + def @main(%x: Tensor[(3, 3, 4), float32], + param_se_scopes=[meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { %0 = split(%x, indices_or_sections=3); - let %t = on_device(%0, device_type=1, is_fixed=True); + let %t = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); %1 = %t.0; - %2 = on_device(%1, device_type=1, is_fixed=True); + %2 = on_device(%1, se_scope=meta[SEScope][0], is_fixed=True); %3 = %t.1; - %4 = on_device(%3, device_type=1, is_fixed=True); - %5 = device_copy(%2, src_dev_type=1, dst_dev_type=2); - %6 = device_copy(%4, src_dev_type=1, dst_dev_type=2); + %4 = on_device(%3, se_scope=meta[SEScope][0], is_fixed=True); + %5 = device_copy(%2, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); + %6 = device_copy(%4, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); subtract(%5, %6) } - """ + """, + "from_string", + None, + metatable, ) def ref(x): @@ -959,45 +1146,53 @@ def test_propogation(): | <--- CPU """ + metatable = {"SEScope": [CPU, GPU]} def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32]) { %0 = negative(%x); - %1 = on_device(%0, device_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][0]); %2 = negative(%1); - %3 = on_device(%0, device_type=1); + %3 = on_device(%0, se_scope=meta[SEScope][0]); %4 = negative(%3); - %5 = on_device(%2, device_type=2); - %6 = on_device(%4, device_type=2); + %5 = on_device(%2, se_scope=meta[SEScope][1]); + %6 = on_device(%4, se_scope=meta[SEScope][1]); %7 = add(%5, %6); - %8 = on_device(%7, device_type=2); + %8 = on_device(%7, se_scope=meta[SEScope][1]); %9 = negative(%8); - on_device(%9, device_type=1) + on_device(%9, se_scope=meta[SEScope][0]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%x: Tensor[(5, 7), float32], param_device_types=[1], result_device_type=1) { + def @main(%x: Tensor[(5, 7), float32], + param_se_scopes=[meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { %0 = negative(%x); - %1 = on_device(%0, device_type=1, is_fixed=True); - %2 = device_copy(%1, src_dev_type=1, dst_dev_type=2); - %3 = on_device(%0, device_type=1, is_fixed=True); - %4 = device_copy(%3, src_dev_type=1, dst_dev_type=2); + %1 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %2 = device_copy(%1, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); + %3 = on_device(%0, se_scope=meta[SEScope][0], is_fixed=True); + %4 = 
device_copy(%3, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %5 = negative(%2); %6 = negative(%4); %7 = add(%5, %6); - %8 = on_device(%7, device_type=2, is_fixed=True); - %9 = device_copy(%8, src_dev_type=2, dst_dev_type=1); + %8 = on_device(%7, se_scope=meta[SEScope][1], is_fixed=True); + %9 = device_copy(%8, src_se_scope=meta[SEScope][1], dst_se_scope=meta[SEScope][0]); negative(%9) } - """ + """, + "from_string", + None, + metatable, ) def ref(x): @@ -1023,43 +1218,51 @@ def test_fusible_network(): | <--- CPU """ + metatable = {"SEScope": [CPU, GPU]} def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=2); + %1 = on_device(%0, se_scope=meta[SEScope][1]); %2 = negative(%1); - %3 = on_device(%2, device_type=1); + %3 = on_device(%2, se_scope=meta[SEScope][0]); %4 = negative(%0); %5 = add(%3, %4); - %6 = on_device(%5, device_type=2); + %6 = on_device(%5, se_scope=meta[SEScope][1]); %7 = negative(%6); - on_device(%7, device_type=1) + on_device(%7, se_scope=meta[SEScope][0]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] - def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], param_device_types=[2, 2], result_device_type=1) { + def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], + param_se_scopes=[meta[SEScope][1], meta[SEScope][1]], result_se_scope=meta[SEScope][0]) { %0 = add(%x, %y); - %1 = on_device(%0, device_type=2, is_fixed=True); - %2 = device_copy(%1, src_dev_type=2, dst_dev_type=1); + %1 = on_device(%0, se_scope=meta[SEScope][1], is_fixed=True); + %2 = device_copy(%1, src_se_scope=meta[SEScope][1], dst_se_scope=meta[SEScope][0]); %3 = negative(%2); - %4 = on_device(%3, device_type=1, is_fixed=True); - %5 = device_copy(%4, src_dev_type=1, dst_dev_type=2); + %4 = on_device(%3, se_scope=meta[SEScope][0], is_fixed=True); + %5 = device_copy(%4, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); %6 = negative(%0); %7 = add(%5, %6); - %8 = on_device(%7, device_type=2, is_fixed=True); - %9 = device_copy(%8, src_dev_type=2, dst_dev_type=1); + %8 = on_device(%7, se_scope=meta[SEScope][1], is_fixed=True); + %9 = device_copy(%8, src_se_scope=meta[SEScope][1], dst_se_scope=meta[SEScope][0]); negative(%9) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y): @@ -1083,37 +1286,45 @@ def test_unpropagatable_graph(): | <--- CPU """ + metatable = {"SEScope": [CPU, GPU]} def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32]) { %0 = add(%a, %b); %1 = multiply(%c, %d); - %2 = on_device(%0, device_type=1); - %3 = on_device(%1, device_type=2); + %2 = on_device(%0, se_scope=meta[SEScope][0]); + %3 = on_device(%1, se_scope=meta[SEScope][1]); %4 = subtract(%2, %3); - on_device(%4, device_type=1) + on_device(%4, se_scope=meta[SEScope][0]) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], %c: Tensor[(5, 7), float32], %d: Tensor[(5, 7), float32], - param_device_types=[1, 1, 2, 2], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], 
meta[SEScope][0], meta[SEScope][1], meta[SEScope][1]], + result_se_scope=meta[SEScope][0]) { %0 = multiply(%c, %d); - %1 = on_device(%0, device_type=2, is_fixed=True); + %1 = on_device(%0, se_scope=meta[SEScope][1], is_fixed=True); %2 = add(%a, %b); - %3 = device_copy(%1, src_dev_type=2, dst_dev_type=1); + %3 = device_copy(%1, src_se_scope=meta[SEScope][1], dst_se_scope=meta[SEScope][0]); subtract(%2, %3) } - """ + """, + "from_string", + None, + metatable, ) def ref(a, b, c, d): @@ -1123,14 +1334,16 @@ def ref(a, b, c, d): def test_conditional(): + metatable = {"SEScope": [CPU, GPU]} + # The conditional is over a function type, thus exercising the first-order/higher-order domain handling. def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) { let %f = fn (%a) { - %0 = on_device(%y, device_type=1, is_fixed=True); + %0 = on_device(%y, se_scope=meta[SEScope][0], is_fixed=True); add(%a, %0) }; let %g = fn (%a1) { @@ -1143,19 +1356,23 @@ def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32]) { }; %h(%z) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32], - param_device_types=[1, 1, 1], result_device_type=1) { - let %f = fn (%a, param_device_types=[1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0], meta[SEScope][0]], + result_se_scope=meta[SEScope][0]) { + let %f = fn (%a, param_se_scopes=[meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { add(%a, %y) }; - let %g = fn (%a1, param_device_types=[1], result_device_type=1) { + let %g = fn (%a1, param_se_scopes=[meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { subtract(%a1, %y) }; let %h = if (%x) { @@ -1165,7 +1382,10 @@ def @main(%x: bool, %y: Tensor[(5, 7), float32], %z: Tensor[(5, 7), float32], }; %h(%z) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y, z): @@ -1182,36 +1402,46 @@ def g(a): def test_global(): + metatable = {"SEScope": [CPU, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @f(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { - %0 = on_device(%b, device_type=1); + %0 = on_device(%b, se_scope=meta[SEScope][0]); add(%a, %0) } def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) -> Tensor[(5, 7), float32] { @f(%y, %x) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @f(%a: Tensor[(5, 7), float32], %b: Tensor[(5, 7), float32], - param_device_types=[2, 1], result_device_type=2) -> Tensor[(5, 7), float32] { - %0 = device_copy(%b, src_dev_type=1, dst_dev_type=2); + param_se_scopes=[meta[SEScope][1], meta[SEScope][0]], + result_se_scope=meta[SEScope][1]) -> Tensor[(5, 7), float32] { + %0 = device_copy(%b, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); add(%a, %0) } def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], - param_device_types=[1, 2], result_device_type=2) -> Tensor[(5, 7), float32] { + param_se_scopes=[meta[SEScope][0], meta[SEScope][1]], + result_se_scope=meta[SEScope][1]) -> Tensor[(5, 7), float32] { @f(%y, %x) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y): @@ -1224,33 +1454,41 @@ 
def f(a, b): def test_ref(): + metatable = {"SEScope": [CPU, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32]) { let %r = ref(%x); - %0 = on_device(%y, device_type=1); + %0 = on_device(%y, se_scope=meta[SEScope][0]); ref_write(%r, %0); %1 = ref_read(%r); add(%x, %1) } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] def @main(%x: Tensor[(5, 7), float32], %y: Tensor[(5, 7), float32], - param_device_types=[2, 1], result_device_type=2) { + param_se_scopes=[meta[SEScope][1], meta[SEScope][0]], result_se_scope=meta[SEScope][1]) { let %r = ref(%x); - %0 = device_copy(%y, src_dev_type=1, dst_dev_type=2); + %0 = device_copy(%y, src_se_scope=meta[SEScope][0], dst_se_scope=meta[SEScope][1]); ref_write(%r, %0); %1 = ref_read(%r); add(%x, %1) } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y): @@ -1263,8 +1501,10 @@ def ref(x, y): def test_adt(): + metatable = {"SEScope": [CPU, GPU]} + def input(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] type List[A] { @@ -1272,7 +1512,7 @@ def input(): Nil, } def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32]) { - %0 = on_device(%y, device_type=1, is_fixed=True); + %0 = on_device(%y, se_scope=meta[SEScope][0], is_fixed=True); %1 = Nil; %2 = Cons(%0, %1); let %l = Cons(%x, %2); @@ -1280,11 +1520,14 @@ def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32]) { Cons(%z, _) => %z } } - """ + """, + "from_string", + None, + metatable, ) def expected(): - return tvm.parser.fromtext( + return tvm.parser.parse( """ #[version = "0.0.5"] type List[A] { @@ -1292,7 +1535,7 @@ def expected(): Nil, } def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], - param_device_types=[1, 1], result_device_type=1) { + param_se_scopes=[meta[SEScope][0], meta[SEScope][0]], result_se_scope=meta[SEScope][0]) { %0 = Nil; %1 = Cons(%y, %0); let %l = Cons(%x, %1); @@ -1300,7 +1543,10 @@ def @main(%x : Tensor[(5, 7), float32], %y : Tensor[(5, 7), float32], Cons(%z, _) => %z } } - """ + """, + "from_string", + None, + metatable, ) def ref(x, y): diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 79979747dfd8a..24e3863a64e30 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -853,11 +853,11 @@ def check_remote(server): # Get a handle to remote Executable. rexec = remote.load_module("vm_library.so") - ctx = remote.cpu() + device = remote.cpu() # Build a VM out of the executable and context. - vm_factory = runtime.vm.VirtualMachine(rexec, ctx) + vm_factory = runtime.vm.VirtualMachine(rexec, device) np_input = np.random.uniform(size=(10, 1)).astype("float32") - input_tensor = tvm.nd.array(np_input, ctx) + input_tensor = tvm.nd.array(np_input, device) # Invoke its "main" function. out = vm_factory.invoke("main", input_tensor) # Check the result. @@ -1003,6 +1003,7 @@ def test_shape_func_nested_function(): def test_storage_size_and_offset_on_cpu(): """Tests allocations place sizes and offsets on the CPU host even if the rest of the computation is on a different device type.""" + # TODO(mbs): Better would be to test ManifestAlloc independently. 
     # CPU = device type 1
@@ -1035,6 +1036,7 @@ def test_reshape_shape_on_cpu():
     """Tests the argument to a reshape places the shape on the CPU host even if the rest
     of the computation is on a different device type."""
+    # TODO(mbs): Better would be to test ManifestAlloc independently.

     # CPU = device type 1
@@ -1060,7 +1062,37 @@ def @main(%x: Tensor[(2, 8), float32],
     assert "on device of type 1" in exe.constants


+@tvm.testing.requires_cuda
+def test_multi_targets():
+    # Build an IRModule.
+    n = 10
+    x = relay.var("x", shape=(n,))
+    y = relay.var("y", shape=(n,))
+    z = relay.var("z", shape=(n,))
+    f = relay.Function([x, y, z], x + relay.op.annotation.on_device(y + z, tvm.cpu()))
+    mod = IRModule.from_expr(f)
+
+    # Compile to a VMExecutable, with one target per device kind.
+    with tvm.transform.PassContext(
+        opt_level=3, config={"relay.fallback_device_type": tvm.cuda().device_type}
+    ):
+        exe = relay.vm.compile(
+            mod, target={"cpu": tvm.target.Target("llvm"), "cuda": tvm.target.Target("cuda")}
+        )
+
+    # Run on a VM configured with both devices.
+    vm = runtime.vm.VirtualMachine(exe, [tvm.cuda(), tvm.cpu()])
+    x_data = np.random.rand(n).astype("float32")
+    y_data = np.random.rand(n).astype("float32")
+    z_data = np.random.rand(n).astype("float32")
+    actual_result = vm.invoke("main", x_data, y_data, z_data)
+
+    # Test
+    expected_result = x_data + y_data + z_data
+    tvm.testing.assert_allclose(actual_result.numpy(), expected_result)
+
+
 if __name__ == "__main__":
     import sys

     sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py
index 9eae3dd336727..04879573bd6a5 100644
--- a/tests/python/unittest/test_auto_scheduler_measure.py
+++ b/tests/python/unittest/test_auto_scheduler_measure.py
@@ -424,16 +424,7 @@ def foo():
 if __name__ == "__main__":
-    test_record_split_reorder_fuse_annotation()
-    test_record_compute_at_root_inline_cache_read_write()
-    test_record_follow_split_follow_fused_split()
-    test_record_pragma_storage_align_rfactor()
-    test_recover_measure_input()
-    test_workload_dis_factor()
-    test_measure_local_builder_runner()
-    test_dag_measure_local_builder_runner()
-    test_workload_serialization()
-    test_measure_local_builder_rpc_runner()
-    test_measure_target_host()
-    test_measure_special_inputs_map_by_name_local_runner()
-    test_measure_special_inputs_map_by_name_rpc_runner()
+    import sys
+    import pytest
+
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
index 92c1174e728c2..e2ce442e0d885 100644
--- a/tests/python/unittest/test_micro_model_library_format.py
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -18,7 +18,6 @@
 import datetime
 import json
 import os
-import sys
 import tarfile

 import numpy
@@ -411,4 +410,6 @@ def test_export_byoc_c_module():
 if __name__ == "__main__":
-    sys.exit(pytest.main([__file__] + sys.argv[1:]))
+    import sys
+
+    sys.exit(pytest.main([__file__] + sys.argv[1:]))
diff --git a/tests/python/unittest/test_runtime_profiling.py b/tests/python/unittest/test_runtime_profiling.py
index b67142b423588..4e777435429b0 100644
--- a/tests/python/unittest/test_runtime_profiling.py
+++ b/tests/python/unittest/test_runtime_profiling.py
@@ -196,4 +196,7 @@ def test_report_serialization(): if __name__ == "__main__": - test_papi("llvm", tvm.cpu()) + import sys + import pytest + + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_runtime_vm_profiler.py b/tests/python/unittest/test_runtime_vm_profiler.py index 75b61d281840f..0499a3e6c65a3 100644 --- a/tests/python/unittest/test_runtime_vm_profiler.py +++ b/tests/python/unittest/test_runtime_vm_profiler.py @@ -39,4 +39,7 @@ def test_basic(dev, target): if __name__ == "__main__": - test_basic(tvm.cpu(), tvm.target.Target("llvm")) + import sys + import pytest + + sys.exit(pytest.main([__file__] + sys.argv[1:]))
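With every __main__ block now delegating to pytest, an individual case can be selected from the command line instead of by editing the file, since sys.argv[1:] is forwarded to pytest unchanged. For example:

    python tests/python/unittest/test_runtime_profiling.py -k test_report_serialization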