[Retiarii] Bugfix: wrong device placement and invalid CUDA ordinal when using CGO engine #4086
Changes from 12 commits
@@ -2,7 +2,10 @@
 # Licensed under the MIT license.

 import logging
-from typing import List, Tuple, Any
+from typing import Dict, List, Tuple, Any

+from nni.retiarii.operation_def.torch_op_def import ToDevice
+from nni.common.device import Device, GPUDevice

 from ..graph import IllegalGraphError, Edge, Graph, Node, Model

@@ -70,7 +73,7 @@ def _format_inputs(node: Node) -> Tuple[List[str], List[Any]]:
             # when the input comes from a single-output operator
             inputs.append('{}'.format(edge.head.name))
             if edge.head.operation.type in ('prim::Constant', 'prim::GetAttr') and \
-                    'value' in edge.head.operation.parameters:
+                    'value' in edge.head.operation.parameters:
                 inputs_value.append(edge.head.operation.parameters['value'])
             else:
                 inputs_value.append(None)

@@ -98,15 +101,39 @@ def _remove_prefix(names, graph_name):
     return names[len(graph_name):] if names.startswith(graph_name) else names


+def generate_cuda_mapping(placement: Dict[Node, Device]) -> Dict[Device, int]:
+    '''
+    Since CUDA_VISIBLE_DEVICES will be set to the list of real GPU ID,
+    we need to remap the GPU ID when generating code to match them correctly.
+    For example, when CUDA_VISIBLE_DEVICES="0,3", we need to use "cuda:0", "cuda:1" in the generated code.
+    '''
+    unique_devices = sorted(list(set([e for e in placement.values() if isinstance(e, GPUDevice)])))
+    node_gpu_cnt = {}
+    cuda_remapped_id = {}
+    for d in unique_devices:
+        if d.node_id not in node_gpu_cnt:
+            node_gpu_cnt[d.node_id] = 0
+        node_gpu_cnt[d.node_id] += 1
+        cuda_remapped_id[d] = node_gpu_cnt[d.node_id] - 1
+
+    return cuda_remapped_id
+
+
 def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str:
     nodes = graph.topo_sort()

     # handle module node and function node differently
     # only need to generate code for module here
     import_pkgs = set()
     node_codes = []
+    cuda_remapped_id = None
+    if placement:
+        cuda_remapped_id = generate_cuda_mapping(placement)
     for node in nodes:
         if node.operation:
+            if placement and isinstance(node.operation, ToDevice):
+                node.operation.override_device_repr("cuda:%d" % cuda_remapped_id[node.operation.device])
+
             if node.operation.type == 'shared':
                 continue
             pkg_name = node.operation.get_import_pkg()

Review comment (on the generate_cuda_mapping docstring): Can't get the point, why …

Reply: nni_manager sets … When … Thus, when generating code that explicitly places operations (e.g., …), …
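To make the remapping concrete, here is a minimal, self-contained sketch of the same idea. The `Gpu` dataclass below is a stand-in for `nni.common.device.GPUDevice` (whose actual fields and constructor may differ), and `remap` mirrors the logic of `generate_cuda_mapping`: if a trial is assigned physical GPUs 0 and 3 on one node, `CUDA_VISIBLE_DEVICES` becomes "0,3" and the generated code must refer to them as `cuda:0` and `cuda:1`.

```python
from dataclasses import dataclass
from typing import Dict, List

@dataclass(frozen=True, order=True)
class Gpu:  # stand-in for nni.common.device.GPUDevice
    node_id: str
    gpu_id: int

def remap(devices: List[Gpu]) -> Dict[Gpu, int]:
    """Give each physical GPU on a node a dense index 0..k-1 (mirrors generate_cuda_mapping)."""
    mapping: Dict[Gpu, int] = {}
    per_node_count: Dict[str, int] = {}
    for d in sorted(set(devices)):
        per_node_count.setdefault(d.node_id, 0)
        mapping[d] = per_node_count[d.node_id]
        per_node_count[d.node_id] += 1
    return mapping

# Trial sees CUDA_VISIBLE_DEVICES="0,3": physical GPU 0 -> cuda:0, physical GPU 3 -> cuda:1
print(remap([Gpu("worker0", 0), Gpu("worker0", 3)]))
# {Gpu(node_id='worker0', gpu_id=0): 0, Gpu(node_id='worker0', gpu_id=3): 1}
```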
@@ -115,7 +142,11 @@ def graph_to_pytorch_model(graph_name: str, graph: Graph, placement=None) -> str:
             node_code = node.operation.to_init_code(_remove_prefix(node.name, graph_name))
             if node_code is not None:
                 if placement and node in placement and len(node_code) > 0:
-                    node_codes.append(f"{node_code}.to('{placement[node].device_repr()}')")
+                    if isinstance(placement[node], GPUDevice):
+                        device_repr = "cuda:%d" % cuda_remapped_id[placement[node]]
+                    else:
+                        device_repr = placement[node].device_repr()
+                    node_codes.append(f"{node_code}.to('{device_repr}')")
                 else:
                     node_codes.append(node_code)

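For a sense of what this changes in practice, the snippet below sketches the kind of `__init__` code the generator would emit for a placed model; the module names and shapes are made up for illustration, not actual codegen output, and running it requires two visible GPUs (e.g. `CUDA_VISIBLE_DEVICES="0,3"`).

```python
import torch.nn as nn

class _model(nn.Module):
    def __init__(self):
        super().__init__()
        # Each placed module is moved to its *remapped* device ordinal,
        # not to the physical GPU ID known to nni_manager.
        self.conv = nn.Conv2d(3, 16, kernel_size=3).to('cuda:0')  # physical GPU 0
        self.fc = nn.Linear(16, 10).to('cuda:1')                  # physical GPU 3
```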
@@ -2,20 +2,23 @@
 # Licensed under the MIT license.

 import copy
-from typing import Dict, Tuple, Any, Union
+from typing import Dict, Tuple, Any

 from nni.retiarii.utils import uid
-from nni.common.device import GPUDevice
+from nni.common.device import Device

 from ...graph import Cell, Edge, Graph, Model, Node
 from ...operation import Operation, _IOPseudoOperation


-class CPUDevice:
+class CPUDevice(Device):
     def __init__(self, node_id):
         self.node_id = node_id
         self.device = 'cpu'

+    def __repr__(self) -> str:
+        return "{CPU Device, NodeID %s, Status %s}" % (self.node_id, self.status)
+
     def device_repr(self):
         return "cpu"

Review comment (on class CPUDevice): it is a little strange that why not put …

Reply: Good comment. I have moved …
@@ -25,7 +28,7 @@ def __init__(self, graph, node_id, name, operation, _internal=False):
         super().__init__(graph, node_id, name, operation, _internal=_internal)
         self.related_models = []

-    def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) -> Tuple[Node, GPUDevice]:
+    def assemble(self, multi_model_placement: Dict[Model, Device]) -> Tuple[Node, Device]:
         raise NotImplementedError

     def _fork_to(self, graph: Graph):

@@ -92,7 +95,7 @@ def __init__(self, logical_graph: LogicalGraph,
         self.original_graph = original_graph
         self.original_node = original_node

-    def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) -> Tuple[Node, GPUDevice]:
+    def assemble(self, multi_model_placement: Dict[Model, Device]) -> Tuple[Node, Device]:
         model_id = self.original_node.graph.model.model_id
         new_node = Node(self.original_node.graph, self.original_node.id,
                         f"M_{model_id}_" +

@@ -138,8 +141,8 @@ def _merge_graph(self, from_graph):
             new_tail = id_to_new_node[edge.tail.id]
             Edge((new_head, edge.head_slot), (new_tail, edge.tail_slot), _internal=True)._register()

-    def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) \
-            -> Tuple[Model, Dict[Node, Union[GPUDevice, CPUDevice]]]:
+    def assemble(self, multi_model_placement: Dict[Model, Device]) \
+            -> Tuple[Model, Dict[Node, Device]]:
         phy_model = Model(_internal=True)
         phy_graph = self.lp_model.root_graph._fork_to(phy_model)
         phy_graph._rename_graph(phy_graph.name, "_model")

Review comment (on assemble): better to add docstring for this function.

Reply: Done. I have added the docstring for …
@@ -224,7 +227,7 @@ def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) \
         # If two nodes are placed on different devices, use ToDevice op to copy the node
         existing_edges = phy_graph.edges.copy()
         # Avoid a node is copied multiple times on the same device
-        copied_op: Dict[Tuple(Node, Union[GPUDevice, CPUDevice]), Node] = {}
+        copied_op: Dict[Tuple(Node, Device), Node] = {}
         for edge in existing_edges:
             head_placement = node_placements[edge.head]
             tail_placement = node_placements[edge.tail]

@@ -238,11 +241,12 @@ def assemble(self, multi_model_placement: Dict[Model, GPUDevice]) \
                 dst_name = edge.head.name + "_to_" + edge.tail.name
                 to_operation = Operation.new(
                     'ToDevice', {
-                        "device": tail_placement.device_repr(), "src": (
+                        "device": tail_placement, "src": (
                             edge.head.name, edge.head_slot), "dst": dst_name})
                 to_node = Node(phy_graph, uid(), dst_name, to_operation)._register()
                 Edge((edge.head, edge.head_slot), (to_node, None), _internal=True)._register()
                 copied_op[(edge.head, tail_placement)] = to_node
+                node_placements[to_node] = head_placement
                 edge.head = to_node
                 edge.head_slot = None

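At runtime the inserted ToDevice node amounts to an explicit tensor copy between the head node's device and the tail node's device. A rough sketch of that behavior (assuming two visible GPUs; shapes are arbitrary):

```python
import torch

head_out = torch.randn(4, 8, device='cuda:0')    # produced by the head node
moved = head_out.to('cuda:1')                    # what the inserted ToDevice op does
tail_weight = torch.randn(8, 2, device='cuda:1')
tail_out = moved @ tail_weight                   # consumed by the tail node, no cross-device error
```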
Review comment: is this typical usage of @dataclass?

Reply: I'm not sure if it is typical. Since the dataclass GPUDevice inherits the dataclass Device, the order of arguments in __init__ would by default be node_id, status, gpu_id, which does not look natural to a human reader. So I explicitly declare __init__ here with the expected order.
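A small sketch of the pattern under discussion (field names and defaults are illustrative, not necessarily those of `nni.common.device`): with plain `@dataclass` inheritance the generated `__init__` takes arguments in field-declaration order, `(node_id, status, gpu_id)`, so an explicit `__init__` restores the more natural `(node_id, gpu_id, status)` order while keeping the generated `__repr__` and `__eq__`.

```python
from dataclasses import dataclass

@dataclass
class Device:
    node_id: str
    status: str = 'idle'

@dataclass(init=False)
class GPUDevice(Device):
    gpu_id: int = -1

    # Explicit __init__: callers write GPUDevice(node_id, gpu_id)
    # instead of the auto-generated (node_id, status, gpu_id) order.
    def __init__(self, node_id: str, gpu_id: int, status: str = 'idle'):
        self.node_id = node_id
        self.gpu_id = gpu_id
        self.status = status

print(GPUDevice('worker0', 3))
# GPUDevice(node_id='worker0', status='idle', gpu_id=3)
```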