Skip to content

Commit fca932b

Browse files
mzient authored and cyyever committed
Make output buffers for argument inputs to GPU operators pinned. (NVIDIA#3728)
* Make output buffers for argument inputs to GPU operators pinned. * Pin GPU operators' CPU inputs and all mixed operators' inputs (except decoders) Signed-off-by: Michał Zientkiewicz <mzient@gmail.com>
1 parent e6a499f commit fca932b

File tree

1 file changed

+20
-4
lines changed

1 file changed

+20
-4
lines changed

dali/pipeline/executor/executor.h

+20-4
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
1+
// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
22
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
@@ -640,13 +640,29 @@ std::vector<int> Executor<WorkspacePolicy, QueuePolicy>::GetTensorQueueSizes(con
640640
template <typename WorkspacePolicy, typename QueuePolicy>
641641
void Executor<WorkspacePolicy, QueuePolicy>::PrepinData(
642642
std::vector<tensor_data_store_queue_t> &tensor_to_store_queue, const OpGraph &graph) {
643-
// We only pin what we need
643+
// We only pin what we need:
644+
// The inputs of mixed ops are potentially used for H2D copies...
644645
for (int i = 0; i < graph.NumOp(OpType::MIXED); i++) {
645646
auto &node = graph.Node(OpType::MIXED, i);
646-
for (int j = 0; j < node.spec.NumRegularInput(); ++j) {
647+
if (node.spec.name().find("decoders__") == 0)
648+
continue; // don't pin inputs to decoders
649+
for (int j = 0; j < node.spec.NumInput(); ++j) {
647650
auto tid = node.parent_tensors[j];
648651
// Use pinned memory only when it is useful
649-
if (node.spec.name() == "MakeContiguous" && node.spec.NumOutput() == 1) {
652+
auto &parent_tensor_queue =
653+
get_queue<OpType::CPU, StorageDevice::CPU>(tensor_to_store_queue_[tid]);
654+
for (auto &tensor : parent_tensor_queue) {
655+
tensor->set_pinned(node.spec.OutputDevice(0) == "gpu" && !RestrictPinnedMemUsage());
656+
}
657+
}
658+
}
659+
660+
// ...as are CPU inputs of GPU ops (e.g. argument inputs)
661+
for (int i = 0; i < graph.NumOp(OpType::GPU); i++) {
662+
auto &node = graph.Node(OpType::GPU, i);
663+
for (int j = 0; j < node.spec.NumInput(); ++j) {
664+
auto tid = node.parent_tensors[j];
665+
if (graph.Tensor(tid).producer.storage_device == StorageDevice::CPU) {
650666
auto &parent_tensor_queue =
651667
get_queue<OpType::CPU, StorageDevice::CPU>(tensor_to_store_queue_[tid]);
652668
for (auto &tensor : parent_tensor_queue) {

0 commit comments

Comments
 (0)