@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright (c) 2017-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -640,13 +640,29 @@ std::vector<int> Executor<WorkspacePolicy, QueuePolicy>::GetTensorQueueSizes(con
 template <typename WorkspacePolicy, typename QueuePolicy>
 void Executor<WorkspacePolicy, QueuePolicy>::PrepinData(
     std::vector<tensor_data_store_queue_t> &tensor_to_store_queue, const OpGraph &graph) {
-  // We only pin what we need
+  // We only pin what we need:
+  // The inputs of mixed ops are potentially used for H2D copies...
   for (int i = 0; i < graph.NumOp(OpType::MIXED); i++) {
     auto &node = graph.Node(OpType::MIXED, i);
-    for (int j = 0; j < node.spec.NumRegularInput(); ++j) {
+    if (node.spec.name().find("decoders__") == 0)
+      continue;  // don't pin inputs to decoders
+    for (int j = 0; j < node.spec.NumInput(); ++j) {
       auto tid = node.parent_tensors[j];
       // Use pinned memory only when it is useful
-      if (node.spec.name() == "MakeContiguous" && node.spec.NumOutput() == 1) {
+      auto &parent_tensor_queue =
+          get_queue<OpType::CPU, StorageDevice::CPU>(tensor_to_store_queue_[tid]);
+      for (auto &tensor : parent_tensor_queue) {
+        tensor->set_pinned(node.spec.OutputDevice(0) == "gpu" && !RestrictPinnedMemUsage());
+      }
+    }
+  }
+
+  // ...as are CPU inputs of GPU ops (e.g. argument inputs)
+  for (int i = 0; i < graph.NumOp(OpType::GPU); i++) {
+    auto &node = graph.Node(OpType::GPU, i);
+    for (int j = 0; j < node.spec.NumInput(); ++j) {
+      auto tid = node.parent_tensors[j];
+      if (graph.Tensor(tid).producer.storage_device == StorageDevice::CPU) {
         auto &parent_tensor_queue =
             get_queue<OpType::CPU, StorageDevice::CPU>(tensor_to_store_queue_[tid]);
         for (auto &tensor : parent_tensor_queue) {
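
Background on why the executor pins these buffers (not part of the commit): an H2D transfer issued with cudaMemcpyAsync can only be truly asynchronous when the host source is page-locked (pinned) memory; with ordinary pageable memory the driver stages the copy and the call effectively serializes with the calling thread. The standalone sketch below illustrates that difference with plain CUDA runtime calls; it is independent of DALI's tensor-queue machinery, and the buffer size and variable names are illustrative only.

// Assumed standalone example (not DALI code): pinned vs. pageable host memory
// as the source of an H2D copy on a stream.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

int main() {
  constexpr size_t kBytes = 64 << 20;  // 64 MiB payload, size chosen arbitrarily

  cudaStream_t stream;
  cudaStreamCreate(&stream);

  void *device_buf = nullptr;
  cudaMalloc(&device_buf, kBytes);

  // Pageable host memory: cudaMemcpyAsync falls back to a staged transfer and
  // does not return until the data has left the host buffer.
  void *pageable_src = std::malloc(kBytes);
  cudaMemcpyAsync(device_buf, pageable_src, kBytes, cudaMemcpyHostToDevice, stream);

  // Pinned (page-locked) host memory: the DMA engine can read it directly, so
  // the copy is merely enqueued and can overlap with other work.
  void *pinned_src = nullptr;
  cudaMallocHost(&pinned_src, kBytes);
  cudaMemcpyAsync(device_buf, pinned_src, kBytes, cudaMemcpyHostToDevice, stream);

  cudaStreamSynchronize(stream);

  cudaFreeHost(pinned_src);
  std::free(pageable_src);
  cudaFree(device_buf);
  cudaStreamDestroy(stream);
  std::printf("copies issued\n");
  return 0;
}

In the diff above, set_pinned(...) requests the same page-locked allocation for CPU buffers that feed mixed and GPU operators, guarded by RestrictPinnedMemUsage(), which presumably lets pinning be disabled where page-locked memory is scarce.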