 #include <chrono>
 #include <future>

+#include "dali/core/tensor_shape.h"
+#include "dali/pipeline/data/backend.h"
 #include "dali/test/dali_test_decoder.h"
 #include "dali/pipeline/executor/executor.h"
 #include "dali/pipeline/executor/pipelined_executor.h"
@@ -603,4 +605,106 @@ TYPED_TEST(ExecutorSyncTest, TestPrefetchedExecution) {
   test::CheckResults(ws, batch_size, 1, tl);
 }

+
+TYPED_TEST(ExecutorTest, TestPinning) {
+  auto exe = this->GetExecutor(this->batch_size_, this->num_threads_, 0, 1);
+  exe->Init();
+
+  // Build a basic cpu->gpu graph
+  OpGraph graph;
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("ExternalSource")
+          .AddArg("device", "cpu")
+          .AddArg("device_id", 0)
+          .AddOutput("data_0", "cpu")), "ExternalSource_0");
+
+  // First set of Copy + Copy and Pass Through
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("Copy")
+          .AddArg("device", "cpu")
+          .AddInput("data_0", "cpu")
+          .AddOutput("copy_0", "cpu")), "Copy_0");
+
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("Copy")
+          .AddArg("device", "cpu")
+          .AddInput("data_0", "cpu")
+          .AddOutput("copy_1", "cpu")), "Copy_1");
+
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("Reshape")
+          .AddArg("device", "cpu")
+          .AddArg("layout", "")
+          .AddInput("copy_0", "cpu")
+          .AddOutput("pass_through_0", "cpu")), "PassThrough_0");
+
+  // Trigger pinning of the first set when it moves from CPU to GPU
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("MakeContiguous")
+          .AddArg("device", "mixed")
+          .AddInput("pass_through_0", "cpu")
+          .AddOutput("out_0", "gpu")), "MakeContiguous_0");
+
+  // ...but not of Copy_1, which stays unpinned for comparison
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("MakeContiguous")
+          .AddArg("device", "mixed")
+          .AddInput("copy_1", "cpu")
+          .AddOutput("out_1", "cpu")), "MakeContiguous_1");
+
+
+  // Second set of Copy and Pass Through
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("Copy")
+          .AddArg("device", "cpu")
+          .AddInput("data_0", "cpu")
+          .AddOutput("copy_2", "cpu")), "Copy_2");
+
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("Reshape")
+          .AddArg("device", "cpu")
+          .AddArg("layout", "")
+          .AddInput("copy_2", "cpu")
+          .AddOutput("pass_through_1", "cpu")), "PassThrough_1");
+
+  // Check pinning of argument inputs to operators in the GPU stage
+  graph.AddOp(this->PrepareSpec(
+      OpSpec("random__CoinFlip")
+          .AddArg("device", "gpu")
+          .AddArgumentInput("probability", "pass_through_1")
+          .AddOutput("out_2", "gpu")), "CoinFlip");
+
+
+  graph.SaveToDotFile("cheating.dot", true, true, true);
+  vector<string> outputs = {"copy_0_cpu", "copy_1_cpu", "pass_through_0_cpu", "copy_2_cpu",
+                            "pass_through_1_cpu", "out_0_gpu", "out_1_cpu", "out_2_gpu"};
+
+  exe->Build(&graph, outputs);
+
+  // Set the data for the external source
+  auto *src_op = dynamic_cast<ExternalSource<CPUBackend> *>(graph.Node(OpType::CPU, 0).op.get());
+  TensorList<CPUBackend> tl;
+  tl.Resize(uniform_list_shape(this->batch_size_, TensorShape<>{}), DALI_FLOAT);
+  src_op->SetDataSource(tl);
+
+  exe->RunCPU();
+  exe->RunMixed();
+  exe->RunGPU();
+
+  DeviceWorkspace ws;
+  exe->Outputs(&ws);
+
+  // The outputs are shared from the executor, so we can check whether they are pinned as expected.
+  // Currently we expect to pin any CPU buffer that is an argument input to a GPU operator, and any
+  // CPU input to a Mixed operator that produces GPU data (that is, a CPU -> GPU copy not done via
+  // a decoder). The whole pass-through group of such a buffer should be pinned as well.
+
+  EXPECT_TRUE(ws.Output<CPUBackend>(0).is_pinned());   // copy_0_cpu
+  EXPECT_FALSE(ws.Output<CPUBackend>(1).is_pinned());  // copy_1_cpu
+  EXPECT_TRUE(ws.Output<CPUBackend>(2).is_pinned());   // pass_through_0_cpu
+  EXPECT_TRUE(ws.Output<CPUBackend>(3).is_pinned());   // copy_2_cpu
+  EXPECT_TRUE(ws.Output<CPUBackend>(4).is_pinned());   // pass_through_1_cpu
+}
+
 }  // namespace dali
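
For context, here is a minimal sketch (not part of the patch) of the buffer-pinning flag that the new test reads on the executor outputs. It assumes the TensorList API as used in the diff above (Resize with a shape and DALI_FLOAT, is_pinned()) plus set_pinned(), and the helper name PinnedBufferSketch is hypothetical; in the test itself the executor, not the user, is expected to set the flag for buffers that feed the GPU stage.

#include <cassert>

#include "dali/core/tensor_shape.h"
#include "dali/pipeline/data/backend.h"
#include "dali/pipeline/data/tensor_list.h"

void PinnedBufferSketch() {
  dali::TensorList<dali::CPUBackend> tl;
  // Request page-locked (pinned) host memory for subsequent allocations.
  tl.set_pinned(true);
  tl.Resize(dali::uniform_list_shape(4, dali::TensorShape<>{16}), dali::DALI_FLOAT);
  // The same flag the EXPECT_* checks in TestPinning read on the workspace outputs.
  assert(tl.is_pinned());
}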