From fb1d481c1c7306977d976bca2fe89f36b0319793 Mon Sep 17 00:00:00 2001
From: drivanov <32910461+drivanov@users.noreply.github.com>
Date: Fri, 17 Aug 2018 14:06:40 -0700
Subject: [PATCH] Test refactor (#91)

* Division by 0 in CropMirrorNormalizeTest was fixed.
Elimination of duplicated code from WriteImage functions.

Signed-off-by: Andrei <andreii@nvidia.com>

* Fix of a problem in the resnet50 benchmark

Signed-off-by: Andrei <andreii@nvidia.com>

* CropMirrorNormalizePermuteTest refactored

Signed-off-by: Andrei <andreii@nvidia.com>

* Lint problem fixed

Signed-off-by: Andrei <andreii@nvidia.com>

* JpegDecodeTest re-implemented

Signed-off-by: Andrei <andreii@nvidia.com>

* Nondefault bound for average deviation implemented for 2 decode tests

Signed-off-by: Andrei <andreii@nvidia.com>

* Executor test refactored.
The JPEG/PNG files are loaded and decoded only when this is really
needed to perform the current test.

Signed-off-by: Andrei <andreii@nvidia.com>

* Lint problem fixed
Output of the images in benchmark eliminated

Signed-off-by: Andrei <andreii@nvidia.com>

* Eps for ExecutorTest and some changes, suggested by Cliff

Signed-off-by: Andrei <andreii@nvidia.com>

* Changes requested by Cliff.
- Destructor for ImageSetDescr
- Conditional compilation of WriteHWCBatch for resnet50 benchmark

Signed-off-by: Andrei <andreii@nvidia.com>

* Color augmentations on CPU

Signed-off-by: Andrei <andreii@nvidia.com>

* GenericMatchingTest class implemented
This class was used for
 - new tests for Displacement Operators.
 - refactoring of the tests for Color Operators

Signed-off-by: Andrei <andreii@nvidia.com>

* Average deviation of pixel colors for unit tests rescaled
(normalized to color range and converted to percentages)

Signed-off-by: Andrei <andreii@nvidia.com>

* Changes requested by Przemek

Signed-off-by: Andrei <andreii@nvidia.com>
---
 dali/benchmark/dali_bench.h                   |  19 +-
 dali/benchmark/resnet50_nvjpeg_bench.cc       |   4 +-
 dali/image/jpeg_test.cc                       | 120 +-----
 dali/image/transform.cc                       |  44 +++
 dali/image/transform.h                        |   9 +-
 dali/pipeline/data/allocator.h                |   4 +-
 dali/pipeline/executor/executor_test.cc       |  97 +----
 dali/pipeline/operators/color/color_test.cc   |  42 ++
 dali/pipeline/operators/color/color_twist.cc  |  40 +-
 .../operators/decoder/host_decoder_test.cc    |  11 +-
 .../operators/decoder/nvjpeg_decoder_test.cc  |   2 +-
 .../displacement/displacement_test.cc         |  55 +++
 .../fused/crop_mirror_normalize_test.cc       | 200 +++-------
 .../operators/fused/resize_crop_mirror.h      |  12 +-
 .../fused/resize_crop_mirror_test.cc          |   4 +-
 dali/pipeline/operators/operator.h            |   2 +-
 dali/pipeline/operators/operator_factory.h    |  19 +-
 .../operators/operator_factory_test.cc        |   4 +-
 dali/pipeline/operators/resize/new_resize.h   |   2 +-
 .../operators/resize/new_resize_test.cc       |   4 +-
 dali/pipeline/operators/resize/resize_test.cc |   8 +-
 dali/pipeline/pipeline_test.cc                | 372 +++++++-----------
 dali/test/dali_test.h                         |  66 ++--
 dali/test/dali_test_decoder.h                 |  61 +--
 dali/test/dali_test_matching.h                |  75 ++++
 dali/test/dali_test_resize.h                  |  66 +---
 dali/test/dali_test_single_op.h               | 317 ++++++++++++---
 dali/util/image.cc                            |  65 +--
 dali/util/image.h                             | 143 +++----
 29 files changed, 927 insertions(+), 940 deletions(-)
 create mode 100644 dali/pipeline/operators/color/color_test.cc
 create mode 100644 dali/pipeline/operators/displacement/displacement_test.cc
 create mode 100644 dali/test/dali_test_matching.h

diff --git a/dali/benchmark/dali_bench.h b/dali/benchmark/dali_bench.h
index fcc6f4e0fbe..30174c2606d 100644
--- a/dali/benchmark/dali_bench.h
+++ b/dali/benchmark/dali_bench.h
@@ -33,14 +33,10 @@ class DALIBenchmark : public benchmark::Fixture {
  public:
   DALIBenchmark() {
     rand_gen_.seed(time(nullptr));
-    LoadJPEGS(image_folder, &jpeg_names_, &jpegs_, &jpeg_sizes_);
+    LoadJPEGS(image_folder, &jpeg_names_, &jpegs_);
   }
 
-  virtual ~DALIBenchmark() {
-    for (auto &ptr : jpegs_) {
-      delete[] ptr;
-    }
-  }
+  virtual ~DALIBenchmark() = default;
 
   int RandInt(int a, int b) {
     return std::uniform_int_distribution<>(a, b)(rand_gen_);
@@ -52,10 +48,11 @@ class DALIBenchmark : public benchmark::Fixture {
   }
 
   inline void MakeJPEGBatch(TensorList<CPUBackend> *tl, int n) {
-    DALI_ENFORCE(jpegs_.size() > 0, "jpegs must be loaded to create batches");
+    const auto nImgs = jpegs_.nImages();
+    DALI_ENFORCE(nImgs > 0, "jpegs must be loaded to create batches");
     vector<Dims> shape(n);
     for (int i = 0; i < n; ++i) {
-      shape[i] = {jpeg_sizes_[i % jpegs_.size()]};
+      shape[i] = {jpegs_.sizes_[i % nImgs]};
     }
 
     tl->template mutable_data<uint8>();
@@ -63,16 +60,14 @@ class DALIBenchmark : public benchmark::Fixture {
 
     for (int i = 0; i < n; ++i) {
       std::memcpy(tl->template mutable_tensor<uint8>(i),
-          jpegs_[i % jpegs_.size()],
-          jpeg_sizes_[i % jpegs_.size()]);
+          jpegs_.data_[i % nImgs], jpegs_.sizes_[i % nImgs]);
     }
   }
 
  protected:
   std::mt19937 rand_gen_;
   vector<string> jpeg_names_;
-  vector<uint8*> jpegs_;
-  vector<int> jpeg_sizes_;
+  ImgSetDescr jpegs_;
 };
 
 }  // namespace dali
diff --git a/dali/benchmark/resnet50_nvjpeg_bench.cc b/dali/benchmark/resnet50_nvjpeg_bench.cc
index 9a4897343b0..42941953541 100644
--- a/dali/benchmark/resnet50_nvjpeg_bench.cc
+++ b/dali/benchmark/resnet50_nvjpeg_bench.cc
@@ -112,7 +112,9 @@ BENCHMARK_DEFINE_F(RealRN50, nvjpegPipe)(benchmark::State& st) { // NOLINT
     }
   }
 
-  WriteHWCBatch<uint8_t>(*ws.Output<GPUBackend>(0), 0, 1, "img");
+#if DALI_DEBUG
+  WriteHWCBatch(*ws.Output<GPUBackend>(0), "img");
+#endif
   int num_batches = st.iterations() + static_cast<int>(pipelined);
   st.counters["FPS"] = benchmark::Counter(batch_size*num_batches,
       benchmark::Counter::kIsRate);
diff --git a/dali/image/jpeg_test.cc b/dali/image/jpeg_test.cc
index 87089cf3359..85cbc5c959a 100644
--- a/dali/image/jpeg_test.cc
+++ b/dali/image/jpeg_test.cc
@@ -12,117 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <gtest/gtest.h>
-#include <opencv2/opencv.hpp>
-
-#include <cmath>
-#include <fstream>
-#include <stdexcept>
-#include <vector>
-#include <string>
-
-#include "dali/common.h"
-#include "dali/test/dali_test.h"
-#include "dali/image/jpeg.h"
+#include "dali/test/dali_test_decoder.h"
 
 namespace dali {
 
-namespace {
-// Our turbo jpeg decoder cannot handle CMYK images
-// or 410 images
-const vector<string> tjpg_test_images = {
-  image_folder + "/420.jpg",
-  image_folder + "/422.jpg",
-  image_folder + "/440.jpg",
-  image_folder + "/444.jpg",
-  image_folder + "/gray.jpg",
-  image_folder + "/411.jpg",
-  image_folder + "/411-non-multiple-4-width.jpg",
-  image_folder + "/420-odd-height.jpg",
-  image_folder + "/420-odd-width.jpg",
-  image_folder + "/420-odd-both.jpg",
-  image_folder + "/422-odd-width.jpg"
-};
-}  // namespace
-
 // Fixture for jpeg decode testing. Templated
 // to make googletest run our tests grayscale & rgb
 template <typename ImgType>
-class JpegDecodeTest : public DALITest {
- public:
-  void SetUp() {
-    if (IsColor(img_type_)) {
-      c_ = 3;
-    } else if (img_type_ == DALI_GRAY) {
-      c_ = 1;
-    }
-    rand_gen_.seed(time(nullptr));
-    LoadJPEGS(tjpg_test_images, &jpegs_, &jpeg_sizes_);
-  }
-
-  void TearDown() {
-    DALITest::TearDown();
-  }
-
-  void VerifyDecode(const uint8 *img, int h, int w, int img_id) {
-    // Compare w/ opencv result
-    cv::Mat ver;
-    cv::Mat jpeg = cv::Mat(1, jpeg_sizes_[img_id], CV_8UC1, jpegs_[img_id]);
-
-    ASSERT_TRUE(CheckIsJPEG(jpegs_[img_id], jpeg_sizes_[img_id]));
-    int flag = IsColor(img_type_) ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE;
-    cv::imdecode(jpeg, flag, &ver);
-
-    cv::Mat ver_img(h, w, IsColor(img_type_) ? CV_8UC3 : CV_8UC2);
-    if (img_type_ == DALI_RGB) {
-      // Convert from BGR to RGB for verification
-      cv::cvtColor(ver, ver_img, CV_BGR2RGB);
-    } else {
-      ver_img = ver;
-    }
-
-    ASSERT_EQ(h, ver_img.rows);
-    ASSERT_EQ(w, ver_img.cols);
-    vector<int> diff(h*w*c_, 0);
-    for (int i = 0; i < h*w*c_; ++i) {
-      diff[i] = abs(static_cast<int>(ver_img.ptr()[i] - img[i]));
-    }
-
-    // calculate the MSE
-    float mean, std;
-    MeanStdDev(diff, &mean, &std);
-
-#ifndef NDEBUG
-    cout << "num: " << diff.size() << endl;
-    cout << "mean: " << mean << endl;
-    cout << "std: " << std << endl;
-#endif
-
-    // Note: We allow a slight deviation from the ground truth.
-    // This value was picked fairly arbitrarily to let the test
-    // pass for libjpeg turbo
-    ASSERT_LT(mean, 2.f);
-    ASSERT_LT(std, 3.f);
-  }
-
-  void MeanStdDev(const vector<int> &diff, float *mean, float *std) {
-    // Avoid division by zero
-    ASSERT_NE(diff.size(), 0);
-
-    double sum = 0, var_sum = 0;
-    for (auto &val : diff) {
-      sum += val;
-    }
-    *mean = sum / diff.size();
-    for (auto &val : diff) {
-      var_sum += (val - *mean)*(val - *mean);
-    }
-    *std = sqrt(var_sum / diff.size());
-  }
-
- protected:
-  const DALIImageType img_type_ = ImgType::type;
-  int c_;
+class JpegDecodeTest : public GenericDecoderTest<ImgType> {
 };
 
 // Run RGB & grayscale tests
@@ -130,18 +27,7 @@ typedef ::testing::Types<RGB, BGR, Gray> Types;
 TYPED_TEST_CASE(JpegDecodeTest, Types);
 
 TYPED_TEST(JpegDecodeTest, DecodeJPEGHost) {
-  vector<uint8> image;
-  for (size_t img = 0; img < this->jpegs_.size(); ++img) {
-    Tensor<CPUBackend> t;
-    DALI_CALL(DecodeJPEGHost(this->jpegs_[img],
-            this->jpeg_sizes_[img],
-            this->img_type_, &t));
-#ifndef NDEBUG
-    cout << img << " " << tjpg_test_images[img] << " " << this->jpeg_sizes_[img] << endl;
-    cout << "dims: " << t.dim(1) << "x" << t.dim(0) << endl;
-#endif
-    this->VerifyDecode(t.data<uint8_t>(), t.dim(0), t.dim(1), img);
-  }
+  this->RunTestDecode(this->jpegs_, 1.5);
 }
 
 }  // namespace dali
diff --git a/dali/image/transform.cc b/dali/image/transform.cc
index fd584e284f5..99b87069efc 100644
--- a/dali/image/transform.cc
+++ b/dali/image/transform.cc
@@ -152,4 +152,48 @@ DALIError_t FastResizeCropMirrorHost(const uint8 *img, int H, int W, int C,
   return DALISuccess;
 }
 
+void CheckParam(const Tensor<CPUBackend> &input, const std::string &opName) {
+  DALI_ENFORCE(input.ndim() == 3);
+  DALI_ENFORCE(IsType<uint8>(input.type()),
+               opName + " expects input data in uint8.");
+  DALI_ENFORCE(input.dim(2) == 1 || input.dim(2) == 3,
+               opName + " supports hwc rgb & grayscale inputs.");
+}
+
+typedef cv::Vec<uchar, 1> Vec1b;
+
+DALIError_t MakeColorTransformation(const uint8 *img, int H, int W, int C,
+                                    const float *matr, uint8 *out_img) {
+  const int channel_flag = C == 3 ? CV_8UC3 : CV_8UC1;
+
+  const cv::Mat cv_imgIn = CreateMatFromPtr(H, W, channel_flag, img);
+  cv::Mat cv_imgOut = CreateMatFromPtr(H, W, channel_flag, out_img);
+
+  if (C == 1) {
+    for (int y = 0; y < H; ++y) {
+      for (int x = 0; x < W; ++x) {
+        cv_imgOut.at<Vec1b>(y, x)[0] =
+            cv::saturate_cast<uint8>((matr[0] * cv_imgIn.at<Vec1b>(y, x)[0]) + matr[1]);
+      }
+    }
+  } else {
+    for (int y = 0; y < H; ++y) {
+      for (int x = 0; x < W; ++x) {
+        // Using a direct calculation because it is 25% faster
+        // than the two loops which could be used here
+        const auto &inpPix = cv_imgIn.at<cv::Vec3b>(y, x);
+        auto &outPix = cv_imgOut.at<cv::Vec3b>(y, x);
+        outPix[0] = cv::saturate_cast<uint8>
+          (inpPix[0] * matr[0] + inpPix[1] * matr[1] + inpPix[2] * matr[2] + matr[3]);
+        outPix[1] = cv::saturate_cast<uint8>
+          (inpPix[0] * matr[4] + inpPix[1] * matr[5] + inpPix[2] * matr[6] + matr[7]);
+        outPix[2] = cv::saturate_cast<uint8>
+          (inpPix[0] * matr[8] + inpPix[1] * matr[9] + inpPix[2] * matr[10] + matr[11]);
+      }
+    }
+  }
+
+  return DALISuccess;
+}
+
 }  // namespace dali
diff --git a/dali/image/transform.h b/dali/image/transform.h
index bb700090718..2a6bc2eb00c 100644
--- a/dali/image/transform.h
+++ b/dali/image/transform.h
@@ -15,8 +15,10 @@
 #ifndef DALI_IMAGE_TRANSFORM_H_
 #define DALI_IMAGE_TRANSFORM_H_
 
+#include <string>
 #include "dali/common.h"
 #include "dali/error_handling.h"
+#include "dali/pipeline/data/tensor.h"
 
 namespace dali {
 
@@ -31,7 +33,7 @@ namespace dali {
  * this temporary workspace pointer to avoid extra memory allocation. The size
  * of the memory pointed to by 'workspace' should be rsz_h*rsz_w*C bytes
  *
- * Note: We leave the calculate of the resize dimesions & the decision of whether
+ * Note: We leave the calculate of the resize dimensions & the decision of whether
  * to mirror the image or not external to the function. With the GPU version of
  * this function, these params will need to have been calculated before-hand
  * and, in the case of a batched call, copied to the device. Separating these
@@ -65,6 +67,11 @@ DALIError_t FastResizeCropMirrorHost(const uint8 *img, int H, int W, int C,
     int mirror, uint8 *out_img, DALIInterpType type = DALI_INTERP_LINEAR,
     uint8 *workspace = nullptr);
 
+void CheckParam(const Tensor<CPUBackend> &input,  const std::string &pOperator);
+
+DALIError_t MakeColorTransformation(const uint8 *img, int H, int W, int C,
+                                    const float *matrix, uint8 *out_img);
+
 }  // namespace dali
 
 #endif  // DALI_IMAGE_TRANSFORM_H_
diff --git a/dali/pipeline/data/allocator.h b/dali/pipeline/data/allocator.h
index 64030be8104..d62aae923a6 100644
--- a/dali/pipeline/data/allocator.h
+++ b/dali/pipeline/data/allocator.h
@@ -75,7 +75,7 @@ DALI_DECLARE_OPTYPE_REGISTRY(GPUAllocator, GPUAllocator);
 
 #define DALI_REGISTER_GPU_ALLOCATOR(OpName, OpType) \
   DALI_DEFINE_OPTYPE_REGISTERER(OpName, OpType,     \
-      dali::GPUAllocator, dali::GPUAllocator)
+      dali::GPUAllocator, dali::GPUAllocator, "GPU_Allocator")
 
 
 /**
@@ -99,7 +99,7 @@ DALI_DECLARE_OPTYPE_REGISTRY(CPUAllocator, CPUAllocator);
 
 #define DALI_REGISTER_CPU_ALLOCATOR(OpName, OpType) \
   DALI_DEFINE_OPTYPE_REGISTERER(OpName, OpType,     \
-      dali::CPUAllocator, dali::CPUAllocator)
+      dali::CPUAllocator, dali::CPUAllocator, "CPU_Allocator")
 
 /**
  * @brief Pinned memory CPU allocator
diff --git a/dali/pipeline/executor/executor_test.cc b/dali/pipeline/executor/executor_test.cc
index 990497e1986..4470a7e9141 100644
--- a/dali/pipeline/executor/executor_test.cc
+++ b/dali/pipeline/executor/executor_test.cc
@@ -12,118 +12,58 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "dali/pipeline/executor/executor.h"
-
-#include <opencv2/opencv.hpp>
-
-#include "dali/pipeline/operators/util/external_source.h"
-#include "dali/test/dali_test.h"
+#include "dali/test/dali_test_decoder.h"
 
 namespace dali {
 
-namespace {
-// Our turbo jpeg decoder cannot handle CMYK images
-// or 410 images
-const vector<string> tjpg_test_images = {
-  image_folder + "/420.jpg",
-  image_folder + "/422.jpg",
-  image_folder + "/440.jpg",
-  image_folder + "/444.jpg",
-  image_folder + "/gray.jpg",
-  image_folder + "/411.jpg",
-  image_folder + "/411-non-multiple-4-width.jpg",
-  image_folder + "/420-odd-height.jpg",
-  image_folder + "/420-odd-width.jpg",
-  image_folder + "/420-odd-both.jpg",
-  image_folder + "/422-odd-width.jpg"
-};
-}  // namespace
+class ExecutorTest : public GenericDecoderTest<RGB> {
+ protected:
+  uint32_t GetImageLoadingFlags() const override {
+    return t_loadJPEGs + t_decodeJPEGs;
+  }
 
-class ExecutorTest : public DALITest {
- public:
   void SetUp() override {
-    rand_gen_.seed(time(nullptr));
-    LoadJPEGS(tjpg_test_images, &jpegs_, &jpeg_sizes_);
-    batch_size_ = jpegs_.size();
-    DecodeJPEGS(DALI_RGB);
+    DALISingleOpTest::SetUp();
+    set_batch_size(jpegs_.nImages());
   }
 
   inline void set_batch_size(int size) { batch_size_ = size; }
 
-  inline OpSpec PrepareSpec(OpSpec spec) {
+  inline OpSpec PrepareSpec(OpSpec spec) const {
     spec.AddArg("batch_size", batch_size_)
       .AddArg("num_threads", num_threads_);
     return spec;
   }
 
-  inline void PruneGraph(Executor *exe) {
+  inline void PruneGraph(Executor *exe) const {
     exe->PruneUnusedGraphNodes();
   }
 
-  vector<HostWorkspace> CPUData(Executor *exe, int idx) {
+  vector<HostWorkspace> CPUData(Executor *exe, int idx) const {
     return exe->wss_[idx].cpu_op_data;
   }
 
-  vector<MixedWorkspace> MixedData(Executor *exe, int idx) {
+  vector<MixedWorkspace> MixedData(Executor *exe, int idx) const {
     return exe->wss_[idx].mixed_op_data;
   }
 
-  vector<DeviceWorkspace> GPUData(Executor *exe, int idx) {
+  vector<DeviceWorkspace> GPUData(Executor *exe, int idx) const {
     return exe->wss_[idx].gpu_op_data;
   }
 
-  void VerifyDecode(const uint8 *img, int h, int w, int img_id) {
+  void VerifyDecode(const uint8 *img, int h, int w, int img_id) const {
     // Load the image to host
     uint8 *host_img = new uint8[h*w*c_];
     CUDA_CALL(cudaMemcpy(host_img, img, h*w*c_, cudaMemcpyDefault));
 
-    // Compare w/ opencv result
-    cv::Mat ver;
-    cv::Mat jpeg = cv::Mat(1, jpeg_sizes_[img_id], CV_8UC1, jpegs_[img_id]);
-
-    ASSERT_TRUE(CheckIsJPEG(jpegs_[img_id], jpeg_sizes_[img_id]));
-    int flag = IsColor(img_type_) ? CV_LOAD_IMAGE_COLOR : CV_LOAD_IMAGE_GRAYSCALE;
-    cv::imdecode(jpeg, flag, &ver);
-
-    cv::Mat ver_img(h, w, IsColor(img_type_) ? CV_8UC3 : CV_8UC2);
-    if (img_type_ == DALI_RGB) {
-      // Convert from BGR to RGB for verification
-      cv::cvtColor(ver, ver_img, CV_BGR2RGB);
-    } else {
-      ver_img = ver;
-    }
-
-    // DEBUG
-    // WriteHWCImage(ver_img.ptr(), h, w, c_, std::to_string(img_id) + "-ver");
-
-    ASSERT_EQ(h, ver_img.rows);
-    ASSERT_EQ(w, ver_img.cols);
-    vector<int> diff(h*w*c_, 0);
-    for (int i = 0; i < h*w*c_; ++i) {
-      diff[i] = abs(static_cast<int>(ver_img.ptr()[i] - host_img[i]));
-    }
-
-    // calculate the MSE
-    double mean, std;
-    this->MeanStdDev(diff, &mean, &std);
-
-#ifndef NDEBUG
-    cout << "num: " << diff.size() << endl;
-    cout << "mean: " << mean << endl;
-    cout << "std: " << std << endl;
+#if DALI_DEBUG
+    WriteHWCImage(host_img, h, w, c_, std::to_string(img_id) + "-img");
 #endif
-
-    // Note: We allow a slight deviation from the ground truth.
-    // This value was picked fairly arbitrarily to let the test
-    // pass for libjpeg turbo
-    ASSERT_LT(mean, 2.f);
-    ASSERT_LT(std, 3.f);
+    GenericDecoderTest::VerifyDecode(host_img, h, w, jpegs_, img_id);
+    delete [] host_img;
   }
 
- protected:
   int batch_size_, num_threads_ = 1;
-  int c_ = 3;
-  DALIImageType img_type_ = DALI_RGB;
 };
 
 TEST_F(ExecutorTest, TestPruneBasicGraph) {
@@ -454,6 +394,7 @@ TEST_F(ExecutorTest, TestRunBasicGraph) {
 TEST_F(ExecutorTest, TestPrefetchedExecution) {
   int batch_size = this->batch_size_ / 2;
   this->set_batch_size(batch_size);
+  this->SetEps(1.6);
 
   Executor exe(this->batch_size_, this->num_threads_, 0, 1);
 
diff --git a/dali/pipeline/operators/color/color_test.cc b/dali/pipeline/operators/color/color_test.cc
new file mode 100644
index 00000000000..df59d542a95
--- /dev/null
+++ b/dali/pipeline/operators/color/color_test.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dali/test/dali_test_matching.h"
+
+namespace dali {
+
+template <typename ImgType>
+class ColorTest : public GenericMatchingTest<ImgType> {
+};
+
+typedef ::testing::Types<RGB> Types;
+TYPED_TEST_CASE(ColorTest, Types);
+
+TYPED_TEST(ColorTest, Brightness) {
+  this->RunTest({"Brightness", "brightness", "3.", 1e-4});
+}
+
+TYPED_TEST(ColorTest, Contrast) {
+  this->RunTest({"Contrast", "contrast", "1.3", 0.18});
+}
+
+TYPED_TEST(ColorTest, Saturation) {
+  this->RunTest({"Saturation", "saturation", "3.", 0.26});
+}
+
+TYPED_TEST(ColorTest, Hue) {
+  this->RunTest({"Hue", "hue", "31.456", 0.27});
+}
+
+}  // namespace dali
diff --git a/dali/pipeline/operators/color/color_twist.cc b/dali/pipeline/operators/color/color_twist.cc
index 8b274de38c5..fbeed64f190 100644
--- a/dali/pipeline/operators/color/color_twist.cc
+++ b/dali/pipeline/operators/color/color_twist.cc
@@ -13,8 +13,7 @@
 // limitations under the License.
 
 #include "dali/pipeline/operators/color/color_twist.h"
-#include <vector>
-#include <string>
+#include "dali/image/transform.h"
 
 namespace dali {
 
@@ -72,4 +71,41 @@ Values >= 0 are supported. For example:
 )code", 1.f, true)
     .AddParent("ColorTransformBase");
 
+template <>
+void ColorTwistBase<CPUBackend>::RunImpl(SampleWorkspace *ws, const int idx) {
+  const auto &input = ws->Input<CPUBackend>(idx);
+  auto output = ws->Output<CPUBackend>(idx);
+  const auto &input_shape = input.shape();
+
+  CheckParam(input, "Color augmentation");
+
+  const auto H = input_shape[0];
+  const auto W = input_shape[1];
+  const auto C = input_shape[2];
+
+  output->ResizeLike(input);
+
+  auto pImgInp = input.template data<uint8>();
+  auto pImgOut = output->template mutable_data<uint8>();
+
+  if (!augments_.empty()) {
+    float matrix[nDim][nDim];
+    float *m = reinterpret_cast<float *>(matrix);
+    IdentityMatrix(m);
+    for (size_t j = 0; j < augments_.size(); ++j) {
+      augments_[j]->Prepare(0, spec_, ws);
+      (*augments_[j])(m);
+    }
+
+    MakeColorTransformation(pImgInp, H, W, C, m, pImgOut);
+  } else {
+    memcpy(pImgOut, pImgInp, H * W * C);
+  }
+}
+
+DALI_REGISTER_OPERATOR(Brightness, BrightnessAdjust<CPUBackend>, CPU);
+DALI_REGISTER_OPERATOR(Contrast, ContrastAdjust<CPUBackend>, CPU);
+DALI_REGISTER_OPERATOR(Hue, HueAdjust<CPUBackend>, CPU);
+DALI_REGISTER_OPERATOR(Saturation, SaturationAdjust<CPUBackend>, CPU);
+
 }  // namespace dali
diff --git a/dali/pipeline/operators/decoder/host_decoder_test.cc b/dali/pipeline/operators/decoder/host_decoder_test.cc
index 09c07c770c6..07fa7051e42 100644
--- a/dali/pipeline/operators/decoder/host_decoder_test.cc
+++ b/dali/pipeline/operators/decoder/host_decoder_test.cc
@@ -19,6 +19,10 @@ namespace dali {
 template <typename ImgType>
 class HostDecodeTest : public GenericDecoderTest<ImgType> {
  protected:
+  uint32_t GetImageLoadingFlags() const override {
+    return t_loadJPEGs + t_loadPNGs;
+  }
+
   const OpSpec DecodingOp() const override {
     return OpSpec("HostDecoder")
       .AddArg("device", "cpu")
@@ -26,8 +30,9 @@ class HostDecodeTest : public GenericDecoderTest<ImgType> {
       .AddInput("encoded", "cpu")
       .AddOutput("decoded", "cpu");
   }
-  uint8 TestCheckType() const  override {
-    return t_checkColorComp + t_checkElements + t_checkAll + t_checkNoAssert;
+
+  uint8 GetTestCheckType() const  override {
+    return t_checkColorComp + t_checkElements;  // + t_checkAll + t_checkNoAssert;
   }
 };
 
@@ -35,7 +40,7 @@ typedef ::testing::Types<RGB, BGR, Gray> Types;
 TYPED_TEST_CASE(HostDecodeTest, Types);
 
 TYPED_TEST(HostDecodeTest, TestJPEGDecode) {
-  this->RunTestDecode(t_jpegImgType, 0.00000005);
+  this->RunTestDecode(t_jpegImgType, 0.65);
 }
 
 TYPED_TEST(HostDecodeTest, TestPNGDecode) {
diff --git a/dali/pipeline/operators/decoder/nvjpeg_decoder_test.cc b/dali/pipeline/operators/decoder/nvjpeg_decoder_test.cc
index cfd379e6715..3a59869ed8f 100644
--- a/dali/pipeline/operators/decoder/nvjpeg_decoder_test.cc
+++ b/dali/pipeline/operators/decoder/nvjpeg_decoder_test.cc
@@ -32,7 +32,7 @@ class nvjpegDecodeTest : public GenericDecoderTest<ImgType> {
   void TestDecode(bool batched, int num_threads) {
     batched_ = batched;
     this->SetNumThreads(num_threads);
-    this->RunTestDecode(t_jpegImgType, 2.0);
+    this->RunTestDecode(t_jpegImgType, 0.7);
   }
 
  private:
diff --git a/dali/pipeline/operators/displacement/displacement_test.cc b/dali/pipeline/operators/displacement/displacement_test.cc
new file mode 100644
index 00000000000..a43e8c1d173
--- /dev/null
+++ b/dali/pipeline/operators/displacement/displacement_test.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "dali/test/dali_test_matching.h"
+
+namespace dali {
+
+template <typename ImgType>
+class DisplacementTest : public GenericMatchingTest<ImgType> {
+};
+
+typedef ::testing::Types<RGB/*, BGR, Gray*/> Types;
+TYPED_TEST_CASE(DisplacementTest, Types);
+
+TYPED_TEST(DisplacementTest, Sphere) {
+  this->RunTest("Sphere");
+}
+
+TYPED_TEST(DisplacementTest, Water) {
+  const OpArg params[] = {{"ampl_x", "2.", t_floatParam},
+                          {"ampl_y", "3.", t_floatParam},
+                          {"phase_x", "0.2", t_floatParam}};
+  this->RunTest("Water", params, sizeof(params)/sizeof(params[0]));
+}
+
+/*
+ * As of 08/03/2018 this test is disabled because Jitter is not activated for CPU
+ *
+TYPED_TEST(DisplacementTest, Jitter) {
+  this->RunTest("Jitter");
+}
+*/
+
+TYPED_TEST(DisplacementTest, WarpAffine) {
+  vector<float> matrix{1.0, 0.8, 0.0, 0.0, 1.2, 0.0};
+  const OpArg params = {"matrix", "1.0, 0.8, 0.0, 0.0, 1.2, 0.0", t_floatVector};
+  this->RunTest("WarpAffine", &params, 1);
+}
+
+TYPED_TEST(DisplacementTest, Rotate) {
+  this->RunTest({"Rotate", "angle", "10", 0.001});
+}
+
+}  // namespace dali
diff --git a/dali/pipeline/operators/fused/crop_mirror_normalize_test.cc b/dali/pipeline/operators/fused/crop_mirror_normalize_test.cc
index 55492b1232e..860aef53c67 100644
--- a/dali/pipeline/operators/fused/crop_mirror_normalize_test.cc
+++ b/dali/pipeline/operators/fused/crop_mirror_normalize_test.cc
@@ -12,179 +12,67 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <gtest/gtest.h>
-#include <opencv2/opencv.hpp>
-
-#include <utility>
-#include <string>
-
-#include "dali/pipeline/operators/fused/resize_crop_mirror.h"
-#include "dali/common.h"
-#include "dali/error_handling.h"
-#include "dali/image/jpeg.h"
-#include "dali/pipeline/pipeline.h"
-#include "dali/test/dali_test.h"
+#include "dali/test/dali_test_resize.h"
 
 namespace dali {
 
-namespace {
-// 440 & 410 not supported by npp
-const vector<string> hybdec_images = {
-  image_folder + "/411.jpg",
-  image_folder + "/420.jpg",
-  image_folder + "/422.jpg",
-  image_folder + "/444.jpg",
-  image_folder + "/gray.jpg",
-  image_folder + "/411-non-multiple-4-width.jpg",
-  image_folder + "/420-odd-height.jpg",
-  image_folder + "/420-odd-width.jpg",
-  image_folder + "/420-odd-both.jpg",
-  image_folder + "/422-odd-width.jpg"
-};
-}  // namespace
-
 template <typename ImgType>
-class CropMirrorNormalizePermuteTest : public DALITest {
- public:
-  void SetUp() {
-    if (IsColor(img_type_)) {
-      c_ = 3;
-    } else if (img_type_ == DALI_GRAY) {
-      c_ = 1;
-    } else {
-      DALI_FAIL("Unsupported image type.");
-    }
-
-    rand_gen_.seed(time(nullptr));
-    LoadJPEGS(hybdec_images, &jpegs_, &jpeg_sizes_);
-  }
-
-  void TearDown() {
-    DALITest::TearDown();
+class CropMirrorNormalizePermuteTest : public GenericResizeTest<ImgType> {
+ protected:
+  virtual vector<TensorList<CPUBackend>*>
+  Reference(const vector<TensorList<CPUBackend>*> &inputs, DeviceWorkspace *ws) {
+    return this->CopyToHost(*ws->Output<GPUBackend>(1));
   }
 
-  void VerifyImage(const float *img, const float *img2, int n,
-      float mean_bound = 2.0, float std_bound = 3.0) {
-    std::vector<float> host_img(n), host_img2(n);
+  void RunTest() {
+    const int batch_size = this->jpegs_.nImages();
+    this->SetBatchSize(batch_size);
+    this->SetNumThreads(1);
 
-    CUDA_CALL(cudaMemcpy(host_img.data(), img, n*sizeof(float), cudaMemcpyDefault));
-    CUDA_CALL(cudaMemcpy(host_img2.data(), img2, n*sizeof(float), cudaMemcpyDefault));
+    TensorList<CPUBackend> data;
+    this->MakeJPEGBatch(&data, batch_size);
+    this->SetExternalInputs({{"jpegs", &data}});
 
-    vector<int> abs_diff(n, 0);
-    for (int i = 0; i < n; ++i) {
-      abs_diff[i] = abs(host_img[i] - host_img2[i]);
-    }
-    double mean, std;
-    MeanStdDev(abs_diff, &mean, &std);
+    shared_ptr<dali::Pipeline> pipe = this->GetPipeline();
+    // Decode the images
+    pipe->AddOperator(
+      OpSpec("HostDecoder")
+        .AddArg("output_type", this->img_type_)
+        .AddInput("jpegs", "cpu")
+        .AddOutput("images", "cpu"));
 
-#ifndef NDEBUG
-    cout << "num: " << abs_diff.size() << endl;
-    cout << "mean: " << mean << endl;
-    cout << "std: " << std << endl;
+    pipe->AddOperator(
+      OpSpec("HostDecoder")
+        .AddArg("output_type", this->img_type_)
+        .AddInput("jpegs", "cpu")
+        .AddOutput("images2", "cpu"));
+
+    // CropMirrorNormalizePermute + crop multiple sets of images
+    DeviceWorkspace ws;
+    this->RunOperator(OpSpec("CropMirrorNormalize")
+                        .AddArg("device", "gpu")
+                        .AddInput("images", "gpu")
+                        .AddOutput("cropped1", "gpu")
+                        .AddInput("images2", "gpu")
+                        .AddOutput("cropped2", "gpu")
+                        .AddArg("crop", vector<int>{64, 64})
+                        .AddArg("mean", vector<float>(this->c_, 0.))
+                        .AddArg("std", vector<float>(this->c_, 1.))
+                        .AddArg("image_type", this->img_type_)
+                        .AddArg("num_input_sets", 2), 1e-4, &ws);
+
+#if DALI_DEBUG
+    WriteCHWBatch<float>(*ws.Output<GPUBackend>(0), 0., 1, "img0");
+    WriteCHWBatch<float>(*ws.Output<GPUBackend>(1), 0., 1, "img1");
 #endif
-
-    // Note: We allow a slight deviation from the ground truth.
-    // This value was picked fairly arbitrarily to let the test
-    // pass for libjpeg turbo
-    ASSERT_LT(mean, mean_bound);
-    ASSERT_LT(std, std_bound);
-  }
-
-  template <typename T>
-  void MeanStdDev(const vector<T> &diff, double *mean, double *std) {
-    // Avoid division by zero
-    ASSERT_NE(diff.size(), 0);
-
-    double sum = 0, var_sum = 0;
-    for (auto &val : diff) {
-      sum += val;
-    }
-    *mean = sum / diff.size();
-    for (auto &val : diff) {
-      var_sum += (val - *mean)*(val - *mean);
-    }
-    *std = sqrt(var_sum / diff.size());
   }
-
- protected:
-  const DALIImageType img_type_ = ImgType::type;
-  int c_;
 };
 
 typedef ::testing::Types<RGB, BGR, Gray> Types;
 TYPED_TEST_CASE(CropMirrorNormalizePermuteTest, Types);
 
 TYPED_TEST(CropMirrorNormalizePermuteTest, MultipleData) {
-  int batch_size = this->jpegs_.size();
-  int num_thread = 1;
-
-  // Create the pipeline
-  Pipeline pipe(
-      batch_size,
-      num_thread,
-      0);
-
-  TensorList<CPUBackend> data;
-  this->MakeJPEGBatch(&data, batch_size);
-  pipe.AddExternalInput("jpegs");
-  pipe.SetExternalInput("jpegs", data);
-
-  // Decode the images
-  pipe.AddOperator(
-      OpSpec("HostDecoder")
-      .AddArg("output_type", this->img_type_)
-      .AddInput("jpegs", "cpu")
-      .AddOutput("images", "cpu"));
-
-  pipe.AddOperator(
-      OpSpec("HostDecoder")
-      .AddArg("output_type", this->img_type_)
-      .AddInput("jpegs", "cpu")
-      .AddOutput("images2", "cpu"));
-
-
-  std::vector<float> mean_vec(this->c_);
-  for (int i = 0; i < this->c_; ++i) {
-    mean_vec[i] = 0.;
-  }
-
-  // CropMirrorNormalizePermute + crop multiple sets of images
-  pipe.AddOperator(
-      OpSpec("CropMirrorNormalize")
-      .AddArg("device", "gpu")
-      .AddInput("images", "gpu")
-      .AddOutput("cropped1", "gpu")
-      .AddInput("images2", "gpu")
-      .AddOutput("cropped2", "gpu")
-      .AddArg("crop", vector<int>{64, 64})
-      .AddArg("mean", mean_vec)
-      .AddArg("std", mean_vec)
-      .AddArg("image_type", this->img_type_)
-      .AddArg("num_input_sets", 2));
-
-    // Build and run the pipeline
-    vector<std::pair<string, string>> outputs = {{"cropped1", "gpu"}, {"cropped2", "gpu"}};
-
-  pipe.Build(outputs);
-
-  // Decode the images
-  pipe.RunCPU();
-  pipe.RunGPU();
-
-  DeviceWorkspace results;
-  pipe.Outputs(&results);
-
-  // Verify the results
-  auto output0 = results.Output<GPUBackend>(0);
-  auto output1 = results.Output<GPUBackend>(1);
-
-  // WriteHWCBatch(*output, "image");
-  for (int i = 0; i < batch_size; ++i) {
-    this->VerifyImage(
-        output0->template tensor<float>(i),
-        output1->template tensor<float>(i),
-        output0->tensor_shape(i)[0]*output0->tensor_shape(i)[1]*output0->tensor_shape(i)[2]);
-  }
+  this->RunTest();
 }
 
 }  // namespace dali
diff --git a/dali/pipeline/operators/fused/resize_crop_mirror.h b/dali/pipeline/operators/fused/resize_crop_mirror.h
index 5e78e230882..160e95e686e 100644
--- a/dali/pipeline/operators/fused/resize_crop_mirror.h
+++ b/dali/pipeline/operators/fused/resize_crop_mirror.h
@@ -172,11 +172,7 @@ class ResizeCropMirror : public Operator<CPUBackend>, protected ResizeCropMirror
   inline void RunImpl(SampleWorkspace *ws, const int idx) override {
     auto &input = ws->Input<CPUBackend>(idx);
     auto output = ws->Output<CPUBackend>(idx);
-    DALI_ENFORCE(input.ndim() == 3);
-    DALI_ENFORCE(IsType<uint8>(input.type()),
-        "Expects input data in uint8.");
-    DALI_ENFORCE(input.dim(2) == 1 || input.dim(2) == 3,
-        "ResizeCropMirror supports hwc rgb & grayscale inputs.");
+    CheckParam(input, "ResizeCropMirror");
 
     const TransformMeta &meta = per_thread_meta_[ws->thread_idx()];
 
@@ -216,11 +212,7 @@ class FastResizeCropMirror : public ResizeCropMirror<CPUBackend> {
   inline void RunImpl(SampleWorkspace *ws, const int idx) override {
     auto &input = ws->Input<CPUBackend>(idx);
     auto output = ws->Output<CPUBackend>(idx);
-    DALI_ENFORCE(input.ndim() == 3);
-    DALI_ENFORCE(IsType<uint8>(input.type()),
-        "Expects input data in uint8.");
-    DALI_ENFORCE(input.dim(2) == 1 || input.dim(2) == 3,
-        "FastResizeCropMirror supports hwc rgb & grayscale inputs.");
+    CheckParam(input, "FastResizeCropMirror");
 
     const TransformMeta &meta = per_thread_meta_[ws->thread_idx()];
 
diff --git a/dali/pipeline/operators/fused/resize_crop_mirror_test.cc b/dali/pipeline/operators/fused/resize_crop_mirror_test.cc
index 0d71afb279a..3f9b74384ed 100644
--- a/dali/pipeline/operators/fused/resize_crop_mirror_test.cc
+++ b/dali/pipeline/operators/fused/resize_crop_mirror_test.cc
@@ -52,14 +52,14 @@ TYPED_TEST(ResizeCropMirrorTest, TestFixedResizeAndCropWarp) {
 TYPED_TEST(ResizeCropMirrorTest, TestFixedFastResizeAndCrop) {
   this->TstBody(this->DefaultSchema(true)
                 .AddArg("resize_shorter", 480.f)
-                .AddArg("crop", vector<int>{224, 224}), 5.5);
+                .AddArg("crop", vector<int>{224, 224}), 2.1);
 }
 
 TYPED_TEST(ResizeCropMirrorTest, TestFixedFastResizeAndCropWarp) {
   this->TstBody(this->DefaultSchema(true)
                 .AddArg("resize_x", 480.f)
                 .AddArg("resize_y", 480.f)
-                .AddArg("crop", vector<int>{224, 224}), 5.5);
+                .AddArg("crop", vector<int>{224, 224}), 2.1);
 }
 
 }  // namespace dali
diff --git a/dali/pipeline/operators/operator.h b/dali/pipeline/operators/operator.h
index 835084f51b1..84f6e56c4cc 100644
--- a/dali/pipeline/operators/operator.h
+++ b/dali/pipeline/operators/operator.h
@@ -218,7 +218,7 @@ DALI_DECLARE_OPTYPE_REGISTRY(SupportOperator, OperatorBase);
   static int ANONYMIZE_VARIABLE(OpName) =                       \
     DALI_OPERATOR_SCHEMA_REQUIRED_FOR_##OpName();               \
   DALI_DEFINE_OPTYPE_REGISTERER(OpName, OpType,                 \
-      device##Operator, dali::OperatorBase)
+      device##Operator, dali::OperatorBase, #device)
 
 class ResizeParamDescr;
 
diff --git a/dali/pipeline/operators/operator_factory.h b/dali/pipeline/operators/operator_factory.h
index ed328ec9660..9679eacc804 100644
--- a/dali/pipeline/operators/operator_factory.h
+++ b/dali/pipeline/operators/operator_factory.h
@@ -37,10 +37,11 @@ class OperatorRegistry {
 
   OperatorRegistry() {}
 
-  void Register(const std::string &name, Creator creator) {
+  void Register(const std::string &name, Creator creator, const std::string &devName = "") {
     std::lock_guard<std::mutex> lock(mutex_);
     DALI_ENFORCE(registry_.count(name) == 0,
-        "Operator \"" + name + "\" already registered.");
+        "Operator \"" + name + "\" already registered" +
+        (devName.empty() ? "" : (" for " + devName)) + ".");  // name the device only if given
     registry_[name] = creator;
   }
 
@@ -71,8 +72,8 @@ class Registerer {
  public:
   Registerer(const std::string &name,
       OperatorRegistry<OpType> *registry,
-      typename OperatorRegistry<OpType>::Creator creator) {
-    registry->Register(name, creator);
+      typename OperatorRegistry<OpType>::Creator creator, const std::string &devName = "") {
+    registry->Register(name, creator, devName);
   }
 
   // Standard creator function used by all operators
@@ -85,9 +86,9 @@ class Registerer {
 
 // Creators a registry object for a specific op type
 #define DALI_DECLARE_OPTYPE_REGISTRY(RegistryName, OpType)            \
-  class DLL_PUBLIC RegistryName##Registry {                                      \
+  class DLL_PUBLIC RegistryName##Registry {                           \
    public:                                                            \
-    DLL_PUBLIC static dali::OperatorRegistry<OpType>& Registry();                \
+    DLL_PUBLIC static dali::OperatorRegistry<OpType>& Registry();     \
   };
 
 #define DALI_DEFINE_OPTYPE_REGISTRY(RegistryName, OpType)               \
@@ -99,11 +100,11 @@ class Registerer {
 // Helper to define a registerer for a specific op type. Each op type
 // defines its own, more aptly named, registration macros on top of this
 #define DALI_DEFINE_OPTYPE_REGISTERER(OpName, DerivedType,              \
-    RegistryName, OpType)                                               \
+    RegistryName, OpType, dev)                                          \
   namespace {                                                           \
     static dali::Registerer<OpType> ANONYMIZE_VARIABLE(anon##OpName)(   \
         #OpName, &RegistryName##Registry::Registry(),                   \
-        dali::Registerer<OpType>::OperatorCreator<DerivedType>);        \
+        dali::Registerer<OpType>::OperatorCreator<DerivedType>, dev);   \
   }
 
 }  // namespace dali
diff --git a/dali/pipeline/operators/operator_factory_test.cc b/dali/pipeline/operators/operator_factory_test.cc
index eb3990870a0..1d6a0f52d34 100644
--- a/dali/pipeline/operators/operator_factory_test.cc
+++ b/dali/pipeline/operators/operator_factory_test.cc
@@ -73,11 +73,11 @@ DALI_DEFINE_OPTYPE_REGISTRY(GPUDummy, DummyBase<GPUBackend>);
 // Some registration macros
 #define DALI_REGISTER_CPU_DUMMY(OpName, OpType)        \
   DALI_DEFINE_OPTYPE_REGISTERER(OpName, OpType,        \
-      CPUDummy, DummyBase<CPUBackend>)
+      CPUDummy, DummyBase<CPUBackend>, "CPU")
 
 #define DALI_REGISTER_GPU_DUMMY(OpName, OpType)         \
   DALI_DEFINE_OPTYPE_REGISTERER(OpName, OpType,         \
-      GPUDummy, DummyBase<GPUBackend>)
+      GPUDummy, DummyBase<GPUBackend>, "GPU")
 
 
 // Register the classes
diff --git a/dali/pipeline/operators/resize/new_resize.h b/dali/pipeline/operators/resize/new_resize.h
index 67d759dbe0f..3e9aae96aa4 100644
--- a/dali/pipeline/operators/resize/new_resize.h
+++ b/dali/pipeline/operators/resize/new_resize.h
@@ -143,7 +143,7 @@ class NewResize : public Resize<Backend> {
   void SetupSharedSampleParams(Workspace<Backend> *ws) override {
     Resize<Backend> ::SetupSharedSampleParams(ws);
   }
-  uint ResizeInfoNeeded() const override { return t_crop + t_mirrorHor; }
+  uint ResizeInfoNeeded() const override    { return t_crop + t_mirrorHor; }
 
  private:
   MappingInfo **CopyResizeTableToGPU(size_t resizeMemory[], cudaStream_t s,
diff --git a/dali/pipeline/operators/resize/new_resize_test.cc b/dali/pipeline/operators/resize/new_resize_test.cc
index dd617ce40ff..cc04eb1cd23 100644
--- a/dali/pipeline/operators/resize/new_resize_test.cc
+++ b/dali/pipeline/operators/resize/new_resize_test.cc
@@ -13,14 +13,14 @@ TYPED_TEST_CASE(NewResizeTest, Types);
 TYPED_TEST(NewResizeTest, TestFixedResizeAndCrop) {
   this->TstBody(this->DefaultSchema("ResizeCropMirror")
                   .AddArg("resize_shorter", 480.f)
-                  .AddArg("crop", vector<int>{224, 224}), 0.85);
+                  .AddArg("crop", vector<int>{224, 224}), 0.33);
 }
 
 TYPED_TEST(NewResizeTest, TestFixedResizeAndCropWarp) {
   this->TstBody(this->DefaultSchema("ResizeCropMirror")
                   .AddArg("resize_x", 480.f)
                   .AddArg("resize_y", 480.f)
-                  .AddArg("crop", vector<int>{224, 224}), 0.85);
+                  .AddArg("crop", vector<int>{224, 224}), 0.33);
 }
 
 }  // namespace dali
diff --git a/dali/pipeline/operators/resize/resize_test.cc b/dali/pipeline/operators/resize/resize_test.cc
index cdd6bd1949b..5d8094e3122 100644
--- a/dali/pipeline/operators/resize/resize_test.cc
+++ b/dali/pipeline/operators/resize/resize_test.cc
@@ -27,24 +27,24 @@ TYPED_TEST_CASE(ResizeTest, Types);
 
 TYPED_TEST(ResizeTest, TestResizeShorter) {
   this->TstBody(this->DefaultSchema("Resize")
-                      .AddArg("resize_shorter", 480.f), 14.0);
+                      .AddArg("resize_shorter", 480.f), 5.5);
 }
 
 TYPED_TEST(ResizeTest, TestResizeShorter_A) {
   this->TstBody(this->DefaultSchema("Resize")
-                      .AddArg("resize_shorter", 224.f), 14.0);
+                      .AddArg("resize_shorter", 224.f), 5.5);
 }
 
 TYPED_TEST(ResizeTest, TestResizeXY) {
   this->TstBody(this->DefaultSchema("Resize")
                       .AddArg("resize_x", 224.f)
-                      .AddArg("resize_y", 224.f), 14.0);
+                      .AddArg("resize_y", 224.f), 5.5);
 }
 
 TYPED_TEST(ResizeTest, TestResizeXY_A) {
   this->TstBody(this->DefaultSchema("Resize")
                       .AddArg("resize_x", 240.f)
-                      .AddArg("resize_y", 480.f), 14.0);
+                      .AddArg("resize_y", 480.f), 5.5);
 }
 
 }  // namespace dali
diff --git a/dali/pipeline/pipeline_test.cc b/dali/pipeline/pipeline_test.cc
index 1b9c320ef79..e3a77c3906f 100644
--- a/dali/pipeline/pipeline_test.cc
+++ b/dali/pipeline/pipeline_test.cc
@@ -36,18 +36,18 @@ class PipelineTest : public DALITest {
     DALITest::DecodeJPEGS(DALI_RGB);
   }
 
-  template <typename T>
-  inline void CompareData(const T* data, const T* ground_truth, int n) {
+  template<typename T>
+  inline void CompareData(const T *data, const T *ground_truth, int n) {
     CUDA_CALL(cudaDeviceSynchronize());
     vector<T> tmp_cpu(n);
-    CUDA_CALL(cudaMemcpy(tmp_cpu.data(), data, sizeof(T)*n, cudaMemcpyDefault));
+    CUDA_CALL(cudaMemcpy(tmp_cpu.data(), data, sizeof(T) * n, cudaMemcpyDefault));
 
     vector<double> abs_diff(n, 0);
     for (int i = 0; i < n; ++i) {
       abs_diff[i] = abs(static_cast<double>(tmp_cpu[i]) - static_cast<double>(ground_truth[i]));
     }
     double mean, std;
-    DALITest::MeanStdDev(abs_diff, &mean, &std);
+    DALITest::MeanStdDevColorNorm(abs_diff, &mean, &std);
 
 #ifndef NDEBUG
     cout << "num: " << abs_diff.size() << endl;
@@ -59,280 +59,190 @@ class PipelineTest : public DALITest {
     ASSERT_LT(std, 0.000001);
   }
 
-  inline OpGraph& GetGraph(Pipeline *pipe) {
-    return pipe->graph_;
-  }
-};
-
-template <int number_of_threads>
-struct ThreadCount {
-  static const int nt = number_of_threads;
-};
-
-class PipelineTestOnce : public PipelineTest<ThreadCount<1>> {
-};
-
-typedef ::testing::Types<ThreadCount<1>,
-                         ThreadCount<2>,
-                         ThreadCount<3>,
-                         ThreadCount<4>> NumThreads;
-TYPED_TEST_CASE(PipelineTest, NumThreads);
-
-TEST_F(PipelineTestOnce, TestInputNotKnown) {
-  Pipeline pipe(1, 1, 0);
+  void RunTestEnforce(const string &dev1, const string &dev2) {
+    Pipeline pipe(1, 1, 0);
 
-  ASSERT_THROW(
+    // Inputs must be known to the pipeline, i.e. ops
+    // must be added in a topological ordering.
+    ASSERT_THROW(
       pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "cpu")
-          .AddInput("data", "cpu")
-          .AddOutput("copy_out", "cpu")),
+        OpSpec("Copy")
+          .AddArg("device", dev1)
+          .AddInput("data", dev1)
+          .AddOutput("copy_out", dev1)),
       std::runtime_error);
-}
 
-TEST_F(PipelineTestOnce, TestEnforceCPUOpConstraints) {
-  Pipeline pipe(1, 1, 0);
-
-  // Inputs must be know to the pipeline, i.e. ops
-  // must be added in a topological ordering.
-  ASSERT_THROW(
-      pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "cpu")
-          .AddInput("data", "cpu")
-          .AddOutput("copy_out", "cpu")),
-      std::runtime_error);
-
-  pipe.AddOperator(
+    pipe.AddOperator(
       OpSpec("ExternalSource")
-      .AddArg("device", "gpu")
-      .AddOutput("data", "gpu"));
-
-  // Inputs to CPU ops must be on CPU
-  ASSERT_THROW(
+        .AddArg("device", "gpu")
+        .AddOutput("data", "gpu"));
+
+    // For dev1 = "cpu": Inputs to CPU ops must be on CPU,
+    //                   we do not auto-copy them from gpu to cpu.
+    // For dev1 = "gpu": CPU inputs to GPU ops must be on CPU,
+    //                   we will not copy them back to the host.
+    ASSERT_THROW(
       pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "cpu")
-          .AddInput("data", "gpu")
-          .AddOutput("copy_out", "cpu")),
+        OpSpec("Copy")
+          .AddArg("device", dev1)
+          .AddInput("data", dev2)
+          .AddOutput("copy_out", dev1)),
       std::runtime_error);
 
-  // Inputs to CPU ops must already exist on CPU,
-  // we do not auto-copy them from gpu to cpu.
-  ASSERT_THROW(
-      pipe.AddOperator(
+    if (dev1 == "cpu") {
+      // Inputs to CPU ops must already exist on CPU,
+      // we do not auto-copy them from gpu to cpu.
+      ASSERT_THROW(
+        pipe.AddOperator(
           OpSpec("Copy")
-          .AddArg("device", "cpu")
-          .AddInput("data", "cpu")
-          .AddOutput("copy_out", "cpu")),
-      std::runtime_error);
+            .AddArg("device", dev1)
+            .AddInput("data", dev1)
+            .AddOutput("copy_out", dev1)),
+        std::runtime_error);
+    }
 
-  pipe.AddOperator(
+    pipe.AddOperator(
       OpSpec("ExternalSource")
-      .AddArg("device", "cpu")
-      .AddOutput("data_2", "cpu"));
+        .AddArg("device", dev1)
+        .AddOutput("data_2", dev1));
 
-  pipe.AddOperator(
+    pipe.AddOperator(
       OpSpec("ExternalSource")
-      .AddArg("device", "cpu")
-      .AddOutput("data_3", "cpu"));
+        .AddArg("device", dev1)
+        .AddOutput("data_3", dev1));
 
-  // Outputs must have unique names.
-  ASSERT_THROW(
+    // Outputs must have unique names.
+    ASSERT_THROW(
       pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "cpu")
-          .AddInput("data_2", "cpu")
-          .AddOutput("data_3", "cpu")),
+        OpSpec("Copy")
+          .AddArg("device", dev1)
+          .AddInput("data_2", dev1)
+          .AddOutput("data_3", dev1)),
       std::runtime_error);
 
-  // All data must have unique names regardless
-  // of the device they exist on.
-  ASSERT_THROW(
+    if (dev1 == "gpu") {
       pipe.AddOperator(
-          OpSpec("Copy")
+        OpSpec("ExternalSource")
           .AddArg("device", "cpu")
-          .AddInput("data_2", "cpu")
-          .AddOutput("data", "cpu")),
-      std::runtime_error);
-
-  // CPU ops can only produce CPU outputs
-  ASSERT_THROW(
+          .AddOutput("data_4", "cpu"));
+    }
+    // All data must have unique names regardless of the device they
+    // exist on: "data" lives on gpu, "data_4" (gpu case only) on cpu.
+    ASSERT_THROW(
       pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "cpu")
-          .AddInput("data_2", "cpu")
-          .AddOutput("data_4", "gpu")),
+        OpSpec("Copy")
+          .AddArg("device", dev1)
+          .AddInput("data_2", dev1)
+          .AddOutput(dev1 == "gpu" ? "data_4" : "data", dev1)),
       std::runtime_error);
-}
 
-TEST_F(PipelineTestOnce, TestEnforceGPUOpConstraints) {
-  Pipeline pipe(1, 1, 0);
 
-  // Inputs must be know to the pipeline, i.e. ops
-  // must be added in a topological ordering.
-  ASSERT_THROW(
+    // An op can only produce outputs on its own device (CPU -> CPU, GPU -> GPU)
+    ASSERT_THROW(
       pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "gpu")
-          .AddInput("data", "gpu")
-          .AddOutput("copy_out", "gpu")),
+        OpSpec("Copy")
+          .AddArg("device", dev1)
+          .AddInput("data_2", dev1)
+          .AddOutput("data_4", dev2)),
       std::runtime_error);
+  }
 
-  pipe.AddOperator(
-      OpSpec("ExternalSource")
-      .AddArg("device", "gpu")
-      .AddOutput("data", "gpu"));
+  void RunTestTrigger(const string &dev) {
+    Pipeline pipe(1, 1, 0);
 
-  // CPU inputs to GPU ops must be on CPU, we will
-  // not copy them back to the host.
-  ASSERT_THROW(
-      pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "gpu")
-          .AddInput("data", "cpu")
-          .AddOutput("copy_out", "gpu")),
-      std::runtime_error);
+    pipe.AddExternalInput("data");
 
-  pipe.AddOperator(
-      OpSpec("ExternalSource")
-      .AddArg("device", "gpu")
-      .AddOutput("data_2", "gpu"));
+    pipe.AddOperator(
+      OpSpec("Copy")
+        .AddArg("device", "gpu")
+        .AddInput("data", dev)
+        .AddOutput("data_copy", "gpu"));
+
+    vector<std::pair<string, string>> outputs = {{"data_copy", "gpu"}};
+    pipe.Build(outputs);
+
+    OpGraph &graph = this->GetGraph(&pipe);
+
+    // Validate the graph
+    ASSERT_EQ(graph.NumCPUOp(), 1);
+    ASSERT_EQ(graph.NumMixedOp(), 1);
+    ASSERT_EQ(graph.NumGPUOp(), 1);
+
+    ASSERT_EQ(graph.mixed_op(0).name(), "MakeContiguous");
+
+    // Validate the source op
+    auto &node = graph.node(0);
+    ASSERT_EQ(node.id, 0);
+    ASSERT_EQ(node.children.size(), 1);
+    ASSERT_EQ(node.parents.size(), 0);
+    ASSERT_EQ(node.children.count(1), 1);
+
+    // Validate the MakeContiguous op
+    auto &node2 = graph.node(1);
+    ASSERT_EQ(node2.id, 1);
+    ASSERT_EQ(node2.children.size(), 1);
+    ASSERT_EQ(node2.parents.size(), 1);
+    ASSERT_EQ(node2.parents.count(0), 1);
+    ASSERT_EQ(node2.children.count(2), 1);
+
+    // Validate the copy op
+    auto &node3 = graph.node(2);
+    ASSERT_EQ(node3.id, 2);
+    ASSERT_EQ(node3.children.size(), 0);
+    ASSERT_EQ(node3.parents.size(), 1);
+    ASSERT_EQ(node3.parents.count(1), 1);
+  }
 
-  pipe.AddOperator(
-      OpSpec("ExternalSource")
-      .AddArg("device", "gpu")
-      .AddOutput("data_3", "gpu"));
+  inline OpGraph& GetGraph(Pipeline *pipe) {
+    return pipe->graph_;
+  }
+};
 
-  // Outputs must have unique names.
-  ASSERT_THROW(
-      pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "gpu")
-          .AddInput("data_2", "gpu")
-          .AddOutput("data_3", "gpu")),
-      std::runtime_error);
+template <int number_of_threads>
+struct ThreadCount {
+  static const int nt = number_of_threads;
+};
 
-  pipe.AddOperator(
-      OpSpec("ExternalSource")
-      .AddArg("device", "cpu")
-      .AddOutput("data_4", "cpu"));
+class PipelineTestOnce : public PipelineTest<ThreadCount<1>> {
+};
 
-  // All data must have unique names regardless
-  // of the device they exist on.
-  ASSERT_THROW(
-      pipe.AddOperator(
-          OpSpec("Copy")
-          .AddArg("device", "gpu")
-          .AddInput("data_2", "gpu")
-          .AddOutput("data_4", "gpu")),
-      std::runtime_error);
+typedef ::testing::Types<ThreadCount<1>,
+                         ThreadCount<2>,
+                         ThreadCount<3>,
+                         ThreadCount<4>> NumThreads;
+TYPED_TEST_CASE(PipelineTest, NumThreads);
+
+TEST_F(PipelineTestOnce, TestInputNotKnown) {
+  Pipeline pipe(1, 1, 0);
 
-  // GPU ops can only produce GPU outputs
   ASSERT_THROW(
       pipe.AddOperator(
           OpSpec("Copy")
-          .AddArg("device", "gpu")
-          .AddInput("data_2", "gpu")
-          .AddOutput("data_4", "cpu")),
+          .AddArg("device", "cpu")
+          .AddInput("data", "cpu")
+          .AddOutput("copy_out", "cpu")),
       std::runtime_error);
 }
 
-TEST_F(PipelineTestOnce, TestTriggerToContiguous) {
-  Pipeline pipe(1, 1, 0);
-
-  pipe.AddExternalInput("data");
-
-  pipe.AddOperator(
-      OpSpec("Copy")
-      .AddArg("device", "gpu")
-      .AddInput("data", "cpu")
-      .AddOutput("data_copy", "gpu"));
-
-  vector<std::pair<string, string>> outputs = {{"data_copy", "gpu"}};
-  pipe.Build(outputs);
-
-  OpGraph &graph = this->GetGraph(&pipe);
-
-  // Validate the graph
-  ASSERT_EQ(graph.NumCPUOp(), 1);
-  ASSERT_EQ(graph.NumMixedOp(), 1);
-  ASSERT_EQ(graph.NumGPUOp(), 1);
+TEST_F(PipelineTestOnce, TestEnforceCPUOpConstraints) {
+  RunTestEnforce("cpu", "gpu");
+}
 
-  ASSERT_EQ(graph.mixed_op(0).name(), "MakeContiguous");
+TEST_F(PipelineTestOnce, TestEnforceGPUOpConstraints) {
+  RunTestEnforce("gpu", "cpu");
+}
 
-  // Validate the source op
-  auto& node = graph.node(0);
-  ASSERT_EQ(node.id, 0);
-  ASSERT_EQ(node.children.size(), 1);
-  ASSERT_EQ(node.parents.size(), 0);
-  ASSERT_EQ(node.children.count(1), 1);
-
-  // Validate the MakeContiguous op
-  auto& node2 = graph.node(1);
-  ASSERT_EQ(node2.id, 1);
-  ASSERT_EQ(node2.children.size(), 1);
-  ASSERT_EQ(node2.parents.size(), 1);
-  ASSERT_EQ(node2.parents.count(0), 1);
-  ASSERT_EQ(node2.children.count(2), 1);
-
-  // Validate the copy op
-  auto& node3 = graph.node(2);
-  ASSERT_EQ(node3.id, 2);
-  ASSERT_EQ(node3.children.size(), 0);
-  ASSERT_EQ(node3.parents.size(), 1);
-  ASSERT_EQ(node3.parents.count(1), 1);
+TEST_F(PipelineTestOnce, TestTriggerToContiguous) {
+  RunTestTrigger("cpu");
 }
 
 TEST_F(PipelineTestOnce, TestTriggerCopyToDevice) {
-  Pipeline pipe(1, 1, 0);
-
-  pipe.AddExternalInput("data");
-
-  pipe.AddOperator(
-      OpSpec("Copy")
-      .AddArg("device", "gpu")
-      .AddInput("data", "gpu")
-      .AddOutput("data_copy", "gpu"));
-
-  vector<std::pair<string, string>> outputs = {{"data_copy", "gpu"}};
-  pipe.Build(outputs);
-
-  OpGraph &graph = this->GetGraph(&pipe);
-
-  // Validate the graph
-  ASSERT_EQ(graph.NumCPUOp(), 1);
-  ASSERT_EQ(graph.NumMixedOp(), 1);
-  ASSERT_EQ(graph.NumGPUOp(), 1);
-
-  ASSERT_EQ(graph.mixed_op(0).name(), "MakeContiguous");
-
-  // Validate the source op
-  auto& node = graph.node(0);
-  ASSERT_EQ(node.id, 0);
-  ASSERT_EQ(node.children.size(), 1);
-  ASSERT_EQ(node.parents.size(), 0);
-  ASSERT_EQ(node.children.count(1), 1);
-
-  // Validate the MakeContiguous op
-  auto& node2 = graph.node(1);
-  ASSERT_EQ(node2.id, 1);
-  ASSERT_EQ(node2.children.size(), 1);
-  ASSERT_EQ(node2.parents.size(), 1);
-  ASSERT_EQ(node2.parents.count(0), 1);
-  ASSERT_EQ(node2.children.count(2), 1);
-
-  // Validate the copy op
-  auto& node3 = graph.node(2);
-  ASSERT_EQ(node3.id, 2);
-  ASSERT_EQ(node3.children.size(), 0);
-  ASSERT_EQ(node3.parents.size(), 1);
-  ASSERT_EQ(node3.parents.count(1), 1);
+  RunTestTrigger("gpu");
 }
 
 TYPED_TEST(PipelineTest, TestExternalSource) {
   int num_thread = TypeParam::nt;
-  int batch_size = this->jpegs_.size();
+  int batch_size = this->jpegs_.nImages();
 
   Pipeline pipe(batch_size, num_thread, 0);
 
@@ -354,7 +264,7 @@ TYPED_TEST(PipelineTest, TestExternalSource) {
 
 TYPED_TEST(PipelineTest, TestSerialization) {
   int num_thread = TypeParam::nt;
-  int batch_size = this->jpegs_.size();
+  int batch_size = this->jpegs_.nImages();
 
   Pipeline pipe(batch_size, num_thread, 0);
 
diff --git a/dali/test/dali_test.h b/dali/test/dali_test.h
index 85ffc3e92eb..0760c9a92b5 100644
--- a/dali/test/dali_test.h
+++ b/dali/test/dali_test.h
@@ -53,11 +53,10 @@ class DALITest : public ::testing::Test {
  public:
   virtual inline void SetUp() {
     rand_gen_.seed(time(nullptr));
-    LoadJPEGS(image_folder, &jpeg_names_, &jpegs_, &jpeg_sizes_);
+    LoadJPEGS(image_folder, &jpeg_names_, &jpegs_);
   }
 
   virtual inline void TearDown() {
-    for (auto &ptr : jpegs_) delete[] ptr;
     for (auto &ptr : images_) delete[] ptr;
   }
 
@@ -71,7 +70,7 @@ class DALITest : public ::testing::Test {
   }
 
   void DecodeImage(const unsigned char *data, int data_size, int c, int img_type,
-                          Tensor<CPUBackend> *out, unsigned char *out_dataPntr = NULL) {
+                          Tensor<CPUBackend> *out, unsigned char *out_dataPntr = NULL) const {
     cv::Mat input(1, data_size, CV_8UC1, const_cast<unsigned char*>(data));
 
     cv::Mat tmp = cv::imdecode(input, c == 1 ? CV_LOAD_IMAGE_GRAYSCALE : CV_LOAD_IMAGE_COLOR);
@@ -93,12 +92,14 @@ class DALITest : public ::testing::Test {
     std::memcpy(out_dataPntr, out_img.ptr(), out_img.rows * out_img.cols * c);
   }
 
-  inline void DecodeImages(DALIImageType type, const vector<uint8*>& encoded,
-                           const vector<int>& encoded_sizes,
+  inline void DecodeImages(DALIImageType type, const ImgSetDescr &imgs,
                            vector<uint8*> *images, vector<DimPair> *image_dims) {
-    images->resize(encoded.size());
-    image_dims->resize(encoded.size());
-    for (size_t i = 0; i < encoded.size(); ++i) {
+    const auto & encoded = imgs.data_;
+    const auto & encoded_sizes = imgs.sizes_;
+    const auto nImgs = imgs.nImages();
+    images->resize(nImgs);
+    image_dims->resize(nImgs);
+    for (size_t i = 0; i < nImgs; ++i) {
       cv::Mat img;
       cv::Mat encode = cv::Mat(1, encoded_sizes[i], CV_8UC1, encoded[i]);
 
@@ -127,7 +128,7 @@ class DALITest : public ::testing::Test {
   }
 
   inline void DecodeJPEGS(DALIImageType type) {
-    DecodeImages(type, jpegs_, jpeg_sizes_, &images_, &image_dims_);
+    DecodeImages(type, jpegs_, &images_, &image_dims_);
   }
 
   inline void MakeDecodedBatch(int n, TensorList<CPUBackend> *tl,
@@ -159,13 +160,14 @@ class DALITest : public ::testing::Test {
   }
 
   // Make a batch (in TensorList) of arbitrary raw data
-  inline void MakeEncodedBatch(TensorList<CPUBackend> *tl, int n,
-                        const vector<uint8*> &data,
-                        const vector<int> &data_sizes) {
-    DALI_ENFORCE(data.size() > 0, "data must be populated to create batches");
+  inline void MakeEncodedBatch(TensorList<CPUBackend> *tl, int n, const ImgSetDescr &imgs) {
+    const auto &data = imgs.data_;
+    const auto &data_sizes = imgs.sizes_;
+    const auto nImgs = imgs.nImages();
+    DALI_ENFORCE(nImgs > 0, "data must be populated to create batches");
     vector<Dims> shape(n);
     for (int i = 0; i < n; ++i) {
-      shape[i] = {data_sizes[i % data.size()]};
+      shape[i] = {data_sizes[i % nImgs]};
     }
 
     tl->template mutable_data<uint8>();
@@ -173,41 +175,42 @@ class DALITest : public ::testing::Test {
 
     for (int i = 0; i < n; ++i) {
       std::memcpy(tl->template mutable_tensor<uint8>(i),
-          data[i % data.size()],
-          data_sizes[i % data.size()]);
+          data[i % nImgs],
+          data_sizes[i % nImgs]);
     }
   }
 
   // Make a batch (of vector<Tensor>) of arbitrary raw data
-  inline void MakeEncodedBatch(vector<Tensor<CPUBackend>> *t, int n,
-                            const vector<uint8*> &data,
-                            const vector<int> &data_sizes) {
-    DALI_ENFORCE(data.size() > 0, "data must be populated to create batches");
+  inline void MakeEncodedBatch(vector<Tensor<CPUBackend>> *t, int n, const ImgSetDescr &imgs) {
+    const auto &data = imgs.data_;
+    const auto &data_sizes = imgs.sizes_;
+    const auto nImgs = data.size();
+    DALI_ENFORCE(nImgs > 0, "data must be populated to create batches");
 
     t->resize(n);
     for (int i = 0; i < n; ++i) {
       auto& ti = t->at(i);
       ti = Tensor<CPUBackend>{};
-      ti.Resize({data_sizes[i % data.size()]});
+      ti.Resize({data_sizes[i % nImgs]});
       ti.template mutable_data<uint8>();
 
       std::memcpy(ti.raw_mutable_data(),
-                  data[i % data.size()],
-                  data_sizes[i % data.size()]);
+                  data[i % nImgs],
+                  data_sizes[i % nImgs]);
     }
   }
 
 
   inline void MakeJPEGBatch(TensorList<CPUBackend> *tl, int n) {
-    MakeEncodedBatch(tl, n, jpegs_, jpeg_sizes_);
+    MakeEncodedBatch(tl, n, jpegs_);
   }
 
   inline void MakeJPEGBatch(vector<Tensor<CPUBackend>> *t, int n) {
-    MakeEncodedBatch(t, n, jpegs_, jpeg_sizes_);
+    MakeEncodedBatch(t, n, jpegs_);
   }
 
   template <typename T>
-  void MeanStdDev(const vector<T> &diff, double *mean, double *std) {
+  void MeanStdDev(const vector<T> &diff, double *mean, double *std) const {
     const size_t N = diff.size();
     // Avoid division by zero
     ASSERT_NE(N, 0);
@@ -223,6 +226,12 @@ class DALITest : public ::testing::Test {
     *std = sqrt(var_sum / N);
   }
 
+  template <typename T>
+  void MeanStdDevColorNorm(const vector<T> &diff, double *mean, double *std) const {
+    MeanStdDev(diff, mean, std);
+    *mean /= (255. / 100.);    // mean as a percentage of 255; *std is left unscaled
+  }
+
   // From OCV example :
   // docs.opencv.org/2.4/doc/tutorials/gpu/gpu-basics-similarity/gpu-basics-similarity.html
   cv::Scalar MSSIM(uint8 *a, uint8 *b, int h, int w, int c) {
@@ -281,10 +290,11 @@ class DALITest : public ::testing::Test {
   }
 
  protected:
+  int GetNumColorComp() const          { return c_; }
+
   std::mt19937 rand_gen_;
   vector<string> jpeg_names_;
-  vector<uint8*> jpegs_;
-  vector<int> jpeg_sizes_;
+  ImgSetDescr jpegs_;
 
   // Decoded images
   vector<uint8*> images_;
diff --git a/dali/test/dali_test_decoder.h b/dali/test/dali_test_decoder.h
index eeef6286709..318f7b4e26b 100644
--- a/dali/test/dali_test_decoder.h
+++ b/dali/test/dali_test_decoder.h
@@ -11,10 +11,8 @@
 namespace dali {
 
 template <typename ImgType>
-class GenericDecoderTest : public DALISingleOpTest {
+class GenericDecoderTest : public DALISingleOpTest<ImgType> {
  public:
-  USING_DALI_SINGLE_OP_TEST();
-
   vector<TensorList<CPUBackend>*>
   Reference(const vector<TensorList<CPUBackend>*> &inputs,
             DeviceWorkspace *ws) {
@@ -25,12 +23,12 @@ class GenericDecoderTest : public DALISingleOpTest {
 
     const TensorList<CPUBackend>& encoded_data = *inputs[0];
 
-    c_ = (IsColor(img_type_) ? 3 : 1);
+    const int c = this->GetNumColorComp();
     for (int i = 0; i < encoded_data.ntensor(); ++i) {
       auto *data = encoded_data.tensor<unsigned char>(i);
       auto data_size = Product(encoded_data.tensor_shape(i));
 
-      DecodeImage(data, data_size, c_, img_type_, &out[i]);
+      this->DecodeImage(data, data_size, c, this->ImageType(), &out[i]);
     }
 
     vector<TensorList<CPUBackend>*> outputs(1);
@@ -40,23 +38,16 @@ class GenericDecoderTest : public DALISingleOpTest {
   }
 
  protected:
-  virtual const OpSpec DecodingOp() const = 0;
-  virtual uint8 TestCheckType() const       { return t_checkDefault; }
+  virtual const OpSpec DecodingOp() const   { return OpSpec(); }
 
   void RunTestDecode(t_imgType imageType, float eps = 5e-2) {
-#ifdef PIXEL_STAT_FILE
-    FILE *file = fopen(PIXEL_STAT_FILE".txt", "a");
-    fprintf(file, "Type of the files: %s   eps = %6.4f\n", jpegData? "JPEG" : "PNG", eps);
-    fprintf(file, " Color#:       mean:        std:          eq.         pos.         neg.\n");
-    fclose(file);
-#endif
     TensorList<CPUBackend> encoded_data;
     switch (imageType) {
       case t_jpegImgType:
-        EncodedJPEGData(&encoded_data, batch_size_);
+        this->EncodedJPEGData(&encoded_data);
         break;
       case t_pngImgType:
-        EncodedPNGData(&encoded_data, batch_size_);
+        this->EncodedPNGData(&encoded_data);
         break;
       default: {
         char buff[32];
@@ -65,19 +56,41 @@ class GenericDecoderTest : public DALISingleOpTest {
       }
     }
 
-    SetExternalInputs({std::make_pair("encoded", &encoded_data)});
+    this->SetExternalInputs({std::make_pair("encoded", &encoded_data)});
+    this->RunOperator(DecodingOp(), eps);
+  }
 
-    AddSingleOp(DecodingOp());
+  void RunTestDecode(const ImgSetDescr &imgs, float eps = 5e-2) {
+    this->SetEps(eps);
+    for (size_t imgIdx = 0; imgIdx < imgs.nImages(); ++imgIdx) {
+      Tensor<CPUBackend> t;
+      DALI_CALL(DecodeJPEGHost(imgs.data_[imgIdx],
+                               imgs.sizes_[imgIdx],
+                               this->img_type_, &t));
+
+#if DALI_DEBUG
+      WriteHWCImage(t.data<uint8_t>(), t.dim(0), t.dim(1), t.dim(2),
+                    std::to_string(imgIdx) + "-img");
+#ifndef NDEBUG
+      cout << imgIdx << ": " << imgs.sizes_[imgIdx]
+           << "  dims: " << t.dim(1) << "x" << t.dim(0) << endl;
+#endif
+#endif
+      this->VerifyDecode(t.data<uint8_t>(), t.dim(0), t.dim(1), imgs, imgIdx);
+    }
+  }
 
-    DeviceWorkspace ws;
-    RunOperator(&ws);
+  void VerifyDecode(const uint8 *img, int h, int w, const ImgSetDescr &imgs, int img_id) const {
+    // Compare w/ opencv result
+    const auto imgData = imgs.data_[img_id];
+    const auto imgSize = imgs.sizes_[img_id];
+    ASSERT_TRUE(CheckIsJPEG(imgData, imgSize));
 
-    SetEps(eps);
-    SetTestCheckType(TestCheckType());
-    CheckAnswers(&ws, {0});
+    Tensor<CPUBackend> out;
+    const int c = this->GetNumColorComp();
+    this->DecodeImage(imgData, imgSize, c, this->ImageType(), &out);
+    this->CheckBuffers(h*w*c, out.mutable_data<uint8>(), img, false);
   }
-
-  const DALIImageType img_type_ = ImgType::type;
 };
 
 }  // namespace dali
diff --git a/dali/test/dali_test_matching.h b/dali/test/dali_test_matching.h
new file mode 100644
index 00000000000..2f01f494ddf
--- /dev/null
+++ b/dali/test/dali_test_matching.h
@@ -0,0 +1,75 @@
+// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+#ifndef DALI_TEST_DALI_TEST_MATCHING_H_
+#define DALI_TEST_DALI_TEST_MATCHING_H_
+
+#include "dali/test/dali_test_single_op.h"
+#include <utility>
+#include <vector>
+#include <string>
+#include <memory>
+
+namespace dali {
+
+typedef struct {
+  const char *opName;
+  const char *paramName;
+  const char *paramVal;
+  double epsVal;
+} singleParamOpDescr;
+
+template <typename ImgType>
+class GenericMatchingTest : public DALISingleOpTest<ImgType> {
+ protected:
+  // Feeds all loaded JPEGs through a HostDecoder and then through the operator given by
+  // `descr`; the batch size is set to the number of JPEGs and the thread count to 1.
+  void RunTest(const opDescr &descr) {
+    const int nJpegs = this->jpegs_.nImages();
+    this->SetBatchSize(nJpegs);
+    this->SetNumThreads(1);
+
+    TensorList<CPUBackend> jpegBatch;
+    this->MakeJPEGBatch(&jpegBatch, nJpegs);
+    this->SetExternalInputs({{"jpegs", &jpegBatch}});
+
+    this->GetPipeline()->AddOperator(
+      OpSpec("HostDecoder")
+        .AddArg("output_type", this->img_type_)
+        .AddInput("jpegs", "cpu")
+        .AddOutput("input", "cpu"));
+
+    // Launching the same transformation on CPU (outputIdx 0) and GPU (outputIdx 1)
+    this->AddOperatorWithOutput(descr);
+    this->RunOperator(descr);
+  }
+
+  // The reference is the workspace output with index 1, copied to host memory.
+  virtual vector<TensorList<CPUBackend>*>
+  Reference(const vector<TensorList<CPUBackend>*> &inputs, DeviceWorkspace *ws) {
+    return this->CopyToHost(*ws->Output<GPUBackend>(1));
+  }
+
+  // Results are checked per element and per color component.
+  uint8 GetTestCheckType() const  override {
+    return t_checkColorComp + t_checkElements;  // + t_checkAll + t_checkNoAssert;
+  }
+
+  // Runs the test for an operator configured by exactly one float parameter.
+  void RunTest(const singleParamOpDescr &paramOp) {
+    const vector<OpArg> args(1, OpArg{paramOp.paramName, paramOp.paramVal, t_floatParam});
+    RunTest(opDescr(paramOp.opName, paramOp.epsVal, &args));
+  }
+
+  // Runs the test for `opName` with the first `nParam` entries of `params` (none if null).
+  void RunTest(const char *opName, const OpArg params[] = NULL,
+                int nParam = 0, double eps = 0.001) {
+    const bool hasArgs = params && nParam > 0;
+    vector<OpArg> args;
+    if (hasArgs)
+      args.assign(params, params + nParam);
+    RunTest(opDescr(opName, eps, hasArgs ? &args : nullptr));
+  }
+};
+
+}  // namespace dali
+
+#endif  // DALI_TEST_DALI_TEST_MATCHING_H_
diff --git a/dali/test/dali_test_resize.h b/dali/test/dali_test_resize.h
index 90cb6cc8dc6..2c36628238f 100644
--- a/dali/test/dali_test_resize.h
+++ b/dali/test/dali_test_resize.h
@@ -16,39 +16,12 @@ typedef enum {
 } t_resizeOptions;
 
 template <typename ImgType>
-class GenericResizeTest : public DALISingleOpTest {
+class GenericResizeTest : public DALISingleOpTest<ImgType> {
  public:
-  USING_DALI_SINGLE_OP_TEST();
-
-  void TstBody(const string &pName, const string &pDevice = "gpu", double eps = 2e-1) {
-    OpSpec operation = DefaultSchema(pName, pDevice);
-    TstBody(GetOperationSpec(operation), eps);
-  }
-
-  void TstBody(const OpSpec &operation, double eps = 2e-1) {
-#ifdef PIXEL_STAT_FILE
-    FILE *file = fopen(PIXEL_STAT_FILE".txt", "a");
-    fprintf(file, "Eps = %6.4f\n", eps);
-    fprintf(file, " Color#:       mean:        std:          eq.         pos.         neg.\n");
-    fclose(file);
-#endif
-    TensorList<CPUBackend> data;
-    this->DecodedData(&data, this->batch_size_, this->img_type_);
-    this->SetExternalInputs({std::make_pair("input", &data)});
-
-    this->AddSingleOp(operation);
-
-    DeviceWorkspace ws;
-    this->RunOperator(&ws);
-
-    this->SetEps(eps);
-    this->CheckAnswers(&ws, {0});
-  }
-
   vector<TensorList<CPUBackend>*>
   Reference(const vector<TensorList<CPUBackend>*> &inputs, DeviceWorkspace *ws) {
-    c_ = (IsColor(img_type_) ? 3 : 1);
-    auto cv_type = (c_ == 3) ? CV_8UC3 : CV_8UC1;
+    const int c = this->GetNumColorComp();
+    auto cv_type = (c == 3) ? CV_8UC3 : CV_8UC1;
 
     // single input - encoded images
     // single output - decoded images
@@ -59,25 +32,25 @@ class GenericResizeTest : public DALISingleOpTest {
 
     int resize_a = 0, resize_b = 0;
     bool warp_resize = true;
-
+    const OpSpec &spec = this->GetOperationSpec();
     const bool useExternSizes = (resizeOptions & t_externSizes) &&
-                                spec_.GetArgument<bool>("save_attrs");
+                                spec.GetArgument<bool>("save_attrs");
     if (!useExternSizes) {
       if (resizeOptions & t_externSizes)
         assert(false);  // Can't handle these right now
 
-      resize_a = spec_.GetArgument<float>("resize_x");
+      resize_a = spec.GetArgument<float>("resize_x");
       warp_resize = resize_a != 0;
       if (warp_resize)
-        resize_b = spec_.GetArgument<float>("resize_y");
+        resize_b = spec.GetArgument<float>("resize_y");
       else
-        resize_a = spec_.GetArgument<float>("resize_shorter");
+        resize_a = spec.GetArgument<float>("resize_shorter");
     }
 
     int crop_h = 0, crop_w = 0;
     if (resizeOptions & t_cropping) {
       // Perform a crop
-      const vector<int> crop = spec_.GetRepeatedArgument<int>("crop");
+      const vector<int> crop = spec.GetRepeatedArgument<int>("crop");
       crop_h = crop.at(0), crop_w = crop.at(1);
     }
 
@@ -122,11 +95,11 @@ class GenericResizeTest : public DALISingleOpTest {
         const int crop_x = (rsz_w - crop_w) / 2;
 
         crop_img.create(crop_h, crop_w, cv_type);
-        const int crop_offset = (crop_y * rsz_w + crop_x) * c_;
+        const int crop_offset = (crop_y * rsz_w + crop_x) * c;
         uint8 *crop_ptr = rsz_img.ptr() + crop_offset;
 
-        CUDA_CALL(cudaMemcpy2D(crop_img.ptr(), crop_w * c_,
-                               crop_ptr, rsz_w * c_, crop_w * c_, crop_h,
+        CUDA_CALL(cudaMemcpy2D(crop_img.ptr(), crop_w * c,
+                               crop_ptr, rsz_w * c, crop_w * c, crop_h,
                                cudaMemcpyHostToHost));
       }
 
@@ -137,10 +110,10 @@ class GenericResizeTest : public DALISingleOpTest {
         finalImg = &mirror_img;
       }
 
-      out[i].Resize({finalImg->rows, finalImg->cols, c_});
+      out[i].Resize({finalImg->rows, finalImg->cols, c});
       auto *out_data = out[i].mutable_data<unsigned char>();
 
-      std::memcpy(out_data, finalImg->ptr(), finalImg->rows * finalImg->cols * c_);
+      std::memcpy(out_data, finalImg->ptr(), finalImg->rows * finalImg->cols * c);
     }
 
     vector<TensorList<CPUBackend>*> outputs(1);
@@ -151,17 +124,6 @@ class GenericResizeTest : public DALISingleOpTest {
 
  protected:
   virtual uint32_t getResizeOptions() const         { return t_cropping /*+ t_mirroring*/; }
-  virtual OpSpec DefaultSchema(const string &pName, const string &pDevice = "gpu") const {
-    return OpSpec(pName)
-      .AddArg("device", pDevice)
-      .AddArg("output_type", this->img_type_)
-      .AddInput("input", pDevice)
-      .AddOutput("output", pDevice);
-  }
-
-  virtual const OpSpec &GetOperationSpec(const OpSpec &op) const { return op; }
-
-  const DALIImageType img_type_ = ImgType::type;
 };
 
 }  // namespace dali
diff --git a/dali/test/dali_test_single_op.h b/dali/test/dali_test_single_op.h
index a00152143c2..a243589a2a2 100644
--- a/dali/test/dali_test_single_op.h
+++ b/dali/test/dali_test_single_op.h
@@ -31,6 +31,7 @@ namespace dali {
 #define MAKE_IMG_OUTPUT    0      // Make the output of compared (obtained and referenced) images
 #if MAKE_IMG_OUTPUT
   #define PIXEL_STAT_FILE "pixelStatFile"  // Output of statistics for compared sets of images
+                                           // Use "" to print the statistics to stdout
 #endif
 
 namespace images {
@@ -80,6 +81,35 @@ typedef enum {
   t_pngImgType,
 } t_imgType;
 
+typedef enum {
+  t_loadJPEGs   = 1,
+  t_decodeJPEGs = 2,
+  t_loadPNGs    = 4,
+  t_decodePNGs  = 8
+} t_loadingFlags;
+
+typedef enum {
+  t_intParam,
+  t_floatParam,
+  t_stringParam,
+  t_floatVector
+} t_paramType;
+
+typedef struct  {
+  const char *m_Name;
+  const char *m_val;
+  t_paramType type;
+} OpArg;
+
+class opDescr {
+ public:
+  explicit opDescr(const char *name, double eps = 0.0, const vector<OpArg> *argPntr = NULL) :
+                   opName(name), epsVal(eps), args(argPntr) {}
+  const char *opName;
+  double epsVal;
+  const vector<OpArg> *args;
+};
+
 // Define a virtual base class for single operator tests,
 // where we want to add a single operator to a pipeline,
 // run the pipe using known data, and compare the result to
@@ -90,23 +120,31 @@ typedef enum {
 //  void SetInputs() - define all external inputs to the graph
 //  void GetOutputs() - retrieve all testable outputs from the graph
 //  bool Compare() - Compare against a (supplied) reference implementation
+template <typename ImgType>
 class DALISingleOpTest : public DALITest {
  public:
   inline void SetUp() override {
     DALITest::SetUp();
+    c_ = (IsColor(img_type_) ? 3 : 1);
     jpegs_.clear();
-    jpeg_sizes_.clear();
 
-    // encoded in jpegs_
-    LoadJPEGS(images::jpeg_test_images, &jpegs_, &jpeg_sizes_);
-    LoadImages(images::png_test_images, &png_, &png_sizes_);
+    const auto flags = GetImageLoadingFlags();
+
+    if (flags & t_loadJPEGs) {
+      LoadJPEGS(images::jpeg_test_images, &jpegs_);
+      if (flags & t_decodeJPEGs)
+        DecodeImages(DALI_RGB, jpegs_, &jpeg_decoded_, &jpeg_dims_);
+    }
 
-    // decoded in images_
-    DecodeImages(DALI_RGB, jpegs_, jpeg_sizes_, &jpeg_decoded_, &jpeg_dims_);
-    DecodeImages(DALI_RGB, png_, png_sizes_, &png_decoded_, &png_dims_);
+    if (flags & t_loadPNGs) {
+      LoadImages(images::png_test_images, &png_);
+
+      if (flags & t_decodePNGs)
+        DecodeImages(DALI_RGB, png_, &png_decoded_, &png_dims_);
+    }
 
     // Set the pipeline batch size
-    batch_size_ = 32;
+    SetBatchSize(32);
   }
   inline void TearDown() override {
     DALITest::TearDown();
@@ -132,19 +170,30 @@ class DALISingleOpTest : public DALITest {
     return testCheckType_ & type;
   }
 
-  void AddSingleOp(const OpSpec& spec) {
-    spec_ = spec;
-    InitPipeline();
-    // generate the output mapping for this op
-    for (int i = 0; i < spec.NumOutput(); ++i) {
-      auto output_name = spec.OutputName(i);
-      auto output_device = spec.OutputDevice(i);
+  virtual uint8 GetTestCheckType() const {
+    return t_checkDefault;
+  }
 
-      outputs_.push_back(std::make_pair(output_name, output_device));
-    }
+  void AddOperatorWithOutput(const OpSpec& spec) {
+    // generate the output mapping for this op
+    for (int i = 0; i < spec.NumOutput(); ++i)
+      outputs_.push_back(std::make_pair(spec.OutputName(i), spec.OutputDevice(i)));
 
     pipeline_->AddOperator(spec);
+  }
+
+  void AddOperatorWithOutput(const opDescr &descr, const string &pDevice = "cpu",
+                             const string &pInput = "input", const string &pOutput = "outputCPU") {
+    OpSpec spec(descr.opName);
+    AddOperatorWithOutput(AddArguments(&spec, descr.args)
+                            .AddInput(pInput, pDevice)
+                            .AddOutput(pOutput, pDevice));
+  }
 
+  void AddSingleOp(const OpSpec& spec) {
+    spec_ = spec;
+    InitPipeline();
+    AddOperatorWithOutput(spec);
     pipeline_->Build(outputs_);
   }
 
@@ -159,9 +208,9 @@ class DALISingleOpTest : public DALITest {
   }
 
   void RunOperator(DeviceWorkspace *ws) {
+    SetTestCheckType(GetTestCheckType());
     pipeline_->RunCPU();
     pipeline_->RunGPU();
-
     pipeline_->Outputs(ws);
   }
 
@@ -208,6 +257,8 @@ class DALISingleOpTest : public DALITest {
         // check calculated vs. reference answers
         CheckTensorLists(calc_output, ref_output);
       }
+
+      delete res[i];
     }
   }
 
@@ -215,12 +266,12 @@ class DALISingleOpTest : public DALITest {
    * Provide some encoded data
    * TODO(slayton): Add different encodings
    */
-  void EncodedJPEGData(TensorList<CPUBackend>* t, int n) {
-    DALITest::MakeEncodedBatch(t, n, jpegs_, jpeg_sizes_);
+  void EncodedJPEGData(TensorList<CPUBackend>* t) {
+    DALITest::MakeEncodedBatch(t, batch_size_, jpegs_);
   }
 
-  void EncodedPNGData(TensorList<CPUBackend>* t, int n) {
-    DALITest::MakeEncodedBatch(t, n, png_, png_sizes_);
+  void EncodedPNGData(TensorList<CPUBackend>* t) {
+    DALITest::MakeEncodedBatch(t, batch_size_, png_);
   }
 
   /**
@@ -231,39 +282,163 @@ class DALISingleOpTest : public DALITest {
     DALITest::MakeImageBatch(n, t, type);
   }
 
- private:
-  // use a Get mean, std-dev of difference separately for each color component
+ protected:
+  inline shared_ptr<Pipeline> GetPipeline() const {
+    return pipeline_;
+  }
+
+  virtual uint32_t GetImageLoadingFlags() const   {
+    return t_loadJPEGs;   // Only loading of JPEG files
+  }
+
+  const OpSpec &GetOperationSpec() const          {
+    return spec_;
+  }
+
+  DALIImageType ImageType() const                 {
+    return img_type_;
+  }
+
+  void TstBody(const string &pName, const string &pDevice = "gpu", double eps = 2e-1) {
+    OpSpec operation = DefaultSchema(pName, pDevice);
+    TstBody(operation, eps);
+  }
+
+  void TstBody(const OpSpec &operation, double eps = 2e-1, bool flag = true) {
+    TensorList<CPUBackend> data;
+    DecodedData(&data, this->batch_size_, this->img_type_);
+    if (flag)
+      SetExternalInputs({std::make_pair("input", &data)});
+
+    RunOperator(operation, eps);
+  }
+
+  virtual OpSpec DefaultSchema(const string &pName, const string &pDevice = "gpu") const {
+    return OpSpec(pName)
+      .AddArg("device", pDevice)
+      .AddArg("output_type", this->img_type_)
+      .AddInput("input", pDevice)
+      .AddOutput("output", pDevice);
+  }
+
+  // Appends each OpArg of `args` to `*spec`, converting its textual value according to the
+  // argument's declared type; returns a copy of `*spec` (unchanged if `args` is null/empty).
+  OpSpec AddArguments(OpSpec *spec, const vector<OpArg> *args) const {
+    if (!args || args->empty())
+      return *spec;
+
+    for (const auto &param : *args) {
+      auto val = param.m_val;
+      auto name = param.m_Name;
+      switch (param.type) {
+        case t_intParam:
+          spec->AddArg(name, atoi(val));
+          break;
+        case t_floatParam:
+          spec->AddArg(name, strtof(val, nullptr));
+          break;
+        case t_stringParam:
+          spec->AddArg(name, val);
+          break;
+        case t_floatVector: {
+          // Comma-separated floats, parsed in place: strtof only reads its input
+          vector<float> vect;
+          const char *pCur = val;
+          while (*pCur) {
+            char *pEnd = nullptr;
+            const float number = strtof(pCur, &pEnd);
+            if (pEnd != pCur)
+              vect.push_back(number);
+            // No progress means a separator or stray character: step over it, never spin
+            pCur = (pEnd != pCur) ? pEnd : pCur + 1;
+          }
+          spec->AddArg(name, vect);
+        }
+      }
+    }
+
+    return *spec;
+  }
+
+  void RunOperator(const opDescr &descr) {
+    OpSpec spec(DefaultSchema(descr.opName));
+    RunOperator(AddArguments(&spec, descr.args), descr.epsVal);
+  }
+
+  void RunOperator(const OpSpec& spec, double eps, DeviceWorkspace *pWS = nullptr) {
+    AddSingleOp(spec);
+
+    DeviceWorkspace ws;
+    if (!pWS)
+      pWS = &ws;
+
+    RunOperator(pWS);
+
+    SetEps(eps);
+    CheckAnswers(pWS, {0});
+  }
+
+  template <typename T>
+  vector<TensorList<CPUBackend> *>CopyToHost(const TensorList<T> &calcOutput) {
+    // copy to host
+    vector<TensorList<CPUBackend> *> outputs(1);
+    outputs[0] = new TensorList<CPUBackend>();
+    outputs[0]->Copy(calcOutput, 0);
+    return outputs;
+  }
 
   template <typename T>
-  int CheckBuffers(int N, const T *a, const T *b, bool checkAll, double *pDiff = NULL) {
+  int CheckBuffers(int N, const T *a, const T *b, bool checkAll, double *pMean = NULL) const {
+    // Get the mean and std-dev of the difference separately for each color component
     const int jMax = TestCheckType(t_checkColorComp)?  c_ : 1;
     const int len = N / jMax;
-    double mean = 0, std;
+    double mean = 0, std = 0;
     vector<double> diff(len);
+    int retVal = -1;
 #ifndef PIXEL_STAT_FILE
     for (int j = 0; j < jMax; ++j) {
       for (int i = j; i < N; i += jMax)
         diff[i / jMax] = abs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
 
-      MeanStdDev<double>(diff, &mean, &std);
+      MeanStdDevColorNorm<double>(diff, &mean, &std);
       if (checkAll) {
-        const auto diff = fabs(mean) - eps_;
+        const auto diff = mean - eps_;
         if (diff <= 0)
           continue;
 
-        if (pDiff)
-          *pDiff = diff;
+        if (pMean)
+          *pMean = mean;
 
         return j;
       }
 
-      ASSERT_LE(fabs(mean), eps_), -1;
+      ASSERT_LE(mean, eps_), -1;
     }
 #else
-    static int fff;
-    FILE *file = fopen(PIXEL_STAT_FILE".txt", "a");
-    if (!fff++)
-      fprintf(file, "Buffer Length: %7d (for each color component)\n", len);
+    static int fff = 0;
+    FILE *file = NULL;
+    if (!fff) {
+      // Header of the pixel statistic table
+      const char *pHeader =
+        "\nImgID: ClrID:     Mean:        Std:      SameValue:     Bigger:         Less:";
+
+      if (strlen(PIXEL_STAT_FILE)) {
+        file = fopen(PIXEL_STAT_FILE".txt", fff ? "a" : "w");
+        fprintf(file, "%s", pHeader);
+      } else {
+        cout << pHeader;
+      }
+    }
+
+    char buffer[256];
+    snprintf(buffer, sizeof(buffer), "%s%3d:", (fff % 32? "" : "\n"), fff);
+    fff++;
+
+    // Image number
+    if (file)
+      fprintf(file, "%s", buffer);
+    else
+      cout << buffer;
 
     for (int j = 0; j < c_; ++j) {
       int pos = 0, neg = 0;
@@ -275,23 +450,43 @@ class DALISingleOpTest : public DALITest {
           neg++;
       }
 
-      MeanStdDev<double>(diff, &mean, &std);
-      fprintf(file, "      %1d    %8.2f     %8.2f       %7d      %7d      %7d\n",
-              j, mean, std, len - pos - neg, pos, neg);
+      MeanStdDevColorNorm<double>(diff, &mean, &std);
+      snprintf(buffer, sizeof(buffer),
+               "%s     %1d    %8.2f     %8.2f       %7d      %7d      %7d\n",
+               j? "    " : "", j, mean, std, len - pos - neg, pos, neg);
+
+      if (file)
+        fprintf(file, "%s", buffer);
+      else
+        cout << buffer;
+
+      if (mean > eps_) {
+        if (retVal < 0) {
+          retVal = j;       // First violation of the boundary found
+          if (pMean)        // Keep the color index as a return value
+            *pMean = mean;
+        } else {
+          if (pMean && *pMean < mean) {
+            *pMean = mean;  // Stronger violation of the boundary found
+            retVal = j;     // Change the color index as a return value
+          }
+        }
+      }
     }
 
-    fclose(file);
+    if (file)
+      fclose(file);
 #endif
 
-    return -1;
+    return retVal;
   }
 
-  void ReportTestFailure(double diff, int colorIdx, int idx = -1,
-                         const vector<Index> *pShape = NULL) {
+  void ReportTestFailure(double mean, int colorIdx, int idx = -1,
+                         const vector<Index> *pShape = NULL) const {
     if (TestCheckType(t_checkNoAssert))
-      cout << "Test warning:";
+      cout << "\nTest warning:";
     else
-      cout << "Test failed:";
+      cout << "\nTest failed:";
 
     if (TestCheckType(t_checkColorComp))
       cout << " color # " << colorIdx;
@@ -302,11 +497,11 @@ class DALISingleOpTest : public DALITest {
     if (pShape)
       cout << " (h, w) = (" << (*pShape)[0] << ", " << (*pShape)[1] << ")";
 
-    cout << " fabs(mean) = " << diff + eps_ << " and it was expected to be <= " << eps_ << endl;
+    cout << " mean = " << mean << " and it was expected to be <= " << eps_ << endl;
   }
 
   void CheckTensorLists(const TensorList<CPUBackend> *t1,
-                        const TensorList<CPUBackend> *t2) {
+                        const TensorList<CPUBackend> *t2) const {
     ASSERT_TRUE(t1);
     ASSERT_TRUE(t2);
     ASSERT_EQ(t1->ntensor(), t2->ntensor());
@@ -319,7 +514,7 @@ class DALISingleOpTest : public DALITest {
 
     int failNumb = 0, colorIdx = 0;
     const bool checkAll = TestCheckType(t_checkAll);
-    double diff;
+    double mean;
     if (TestCheckType(t_checkElements)) {
       // The the results are checked for each element separately
       for (int i = 0; i < t1->ntensor(); ++i) {
@@ -336,16 +531,16 @@ class DALISingleOpTest : public DALITest {
         if (floatType) {
           colorIdx = CheckBuffers<float>(lenBuffer,
                           (*t1).template tensor<float>(i),
-                          (*t2).template tensor<float>(i), true, &diff);
+                          (*t2).template tensor<float>(i), checkAll, &mean);
         } else {
           colorIdx = CheckBuffers<unsigned char>(lenBuffer,
                           (*t1).template tensor<uint8>(i),
-                          (*t2).template tensor<uint8>(i), true, &diff);
+                          (*t2).template tensor<uint8>(i), checkAll, &mean);
         }
 
         if (colorIdx >= 0) {
           // Test failed for colorIdx
-          ReportTestFailure(diff, colorIdx, i, &shape1);
+          ReportTestFailure(mean, colorIdx, i, &shape1);
           failNumb++;
           if (!checkAll)
             break;
@@ -354,15 +549,15 @@ class DALISingleOpTest : public DALITest {
     } else {
       if (floatType) {
         colorIdx = CheckBuffers<float>(t1->size(),
-                            t1->data<float>(),
-                            t2->data<float>(), true, &diff);
+                          t1->data<float>(),
+                          t2->data<float>(), checkAll, &mean);
       } else {
         colorIdx = CheckBuffers<unsigned char>(t1->size(),
-                                    t1->data<unsigned char>(),
-                                    t2->data<unsigned char>(), checkAll, &diff);
+                          t1->data<unsigned char>(),
+                          t2->data<unsigned char>(), checkAll, &mean);
       }
-      if (colorIdx >= 0 && checkAll)
-        ReportTestFailure(diff, colorIdx);
+      if (colorIdx >= 0)
+        ReportTestFailure(mean, colorIdx);
     }
 
     if (!TestCheckType(t_checkNoAssert)) {
@@ -380,8 +575,7 @@ class DALISingleOpTest : public DALITest {
   vector<std::pair<string, string>> outputs_;
   shared_ptr<Pipeline> pipeline_;
 
-  vector<uint8*> png_;
-  vector<int> png_sizes_;
+  ImgSetDescr png_;
 
   vector<uint8*> jpeg_decoded_, png_decoded_;
   vector<DimPair> jpeg_dims_, png_dims_;
@@ -392,17 +586,12 @@ class DALISingleOpTest : public DALITest {
   int num_threads_ = 2;
   double eps_ = 1e-4;
   uint32_t testCheckType_ = t_checkDefault;
+  const DALIImageType img_type_ = ImgType::type;
 
   // keep a copy of the creation OpSpec for reference
   OpSpec spec_;
 };
 
-#define USING_DALI_SINGLE_OP_TEST() \
-  using DALISingleOpTest::AddSingleOp; \
-  using DALISingleOpTest::SetExternalInputs; \
-  using DALISingleOpTest::EncodedJPEGData; \
-  using DALISingleOpTest::DecodedData;
-
 }  // namespace dali
 
 #endif  // DALI_TEST_DALI_TEST_SINGLE_OP_H_
diff --git a/dali/util/image.cc b/dali/util/image.cc
index 8c1d1fdaaac..0779fa854c0 100644
--- a/dali/util/image.cc
+++ b/dali/util/image.cc
@@ -16,8 +16,8 @@
 
 namespace dali {
 
-void LoadImages(const string image_folder, vector<string> *image_names,
-    vector<uint8*> *images, vector<int> *image_sizes) {
+void LoadImages(const string &image_folder, vector<string> *image_names,
+                ImgSetDescr *imgs) {
   const string image_list = image_folder + "/image_list.txt";
   std::ifstream file(image_list);
   DALI_ENFORCE(file.is_open());
@@ -28,22 +28,10 @@ void LoadImages(const string image_folder, vector<string> *image_names,
     image_names->push_back(image_folder + "/" + img);
   }
 
-  for (auto img_name : *image_names) {
-    std::ifstream img_file(img_name);
-    DALI_ENFORCE(img_file.is_open());
-
-    img_file.seekg(0, std::ios::end);
-    int img_size = static_cast<int>(img_file.tellg());
-    img_file.seekg(0, std::ios::beg);
-
-    images->push_back(new uint8[img_size]);
-    image_sizes->push_back(img_size);
-    img_file.read(reinterpret_cast<char*>((*images)[images->size()-1]), img_size);
-  }
+  LoadImages(*image_names, imgs);
 }
 
-void LoadImages(const vector<string> &image_names,
-    vector<uint8*> *images, vector<int> *image_sizes) {
+void LoadImages(const vector<string> &image_names, ImgSetDescr *imgs) {
   for (auto img_name : image_names) {
     std::ifstream img_file(img_name);
     DALI_ENFORCE(img_file.is_open());
@@ -52,23 +40,22 @@ void LoadImages(const vector<string> &image_names,
     int img_size = static_cast<int>(img_file.tellg());
     img_file.seekg(0, std::ios::beg);
 
-    images->push_back(new uint8[img_size]);
-    image_sizes->push_back(img_size);
-    img_file.read(reinterpret_cast<char*>((*images)[images->size()-1]), img_size);
+    auto data = new uint8[img_size];
+    imgs->data_.push_back(data);
+    imgs->sizes_.push_back(img_size);
+    img_file.read(reinterpret_cast<char*>(data), img_size);
   }
 }
 
-void LoadJPEGS(const string image_folder, vector<string> *jpeg_names,
-    vector<uint8*> *jpegs, vector<int> *jpeg_sizes) {
-  LoadImages(image_folder, jpeg_names, jpegs, jpeg_sizes);
+void LoadJPEGS(const string &image_folder, vector<string> *jpeg_names, ImgSetDescr *jpegs) {
+  LoadImages(image_folder, jpeg_names, jpegs);
 }
 
-void LoadJPEGS(const vector<string> &jpeg_names,
-    vector<uint8*> *jpegs, vector<int> *jpeg_sizes) {
-  LoadImages(jpeg_names, jpegs, jpeg_sizes);
+void LoadJPEGS(const vector<string> &jpeg_names, ImgSetDescr *jpegs) {
+  LoadImages(jpeg_names, jpegs);
 }
 
-void LoadFromFile(string file_name, uint8 **image, int *h, int *w, int *c) {
+void LoadFromFile(const string &file_name, uint8 **image, int *h, int *w, int *c) {
   std::ifstream file(file_name + ".txt");
   DALI_ENFORCE(file.is_open());
 
@@ -86,30 +73,8 @@ void LoadFromFile(string file_name, uint8 **image, int *h, int *w, int *c) {
   }
 }
 
-void WriteHWCImage(const uint8 *img, int h, int w, int c, string file_name) {
-  DALI_ENFORCE(img != nullptr);
-  DALI_ENFORCE(h >= 0);
-  DALI_ENFORCE(w >= 0);
-  DALI_ENFORCE(c >= 0);
-  CUDA_CALL(cudaDeviceSynchronize());
-  vector<uint8> tmp(h*w*c, 0);
-  MemCopy(tmp.data(), img, h*w*c);
-  std::ofstream file(file_name + ".ppm");
-  DALI_ENFORCE(file.is_open());
-
-  file << "P3" << endl;
-  file << w << " " << h << endl;
-  file << "255" << endl;
-
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      for (int k = 0; k < 3; ++k) {
-        int c_id = k % c;
-        file << int(tmp[i*w*c + j*c + c_id]) << " ";
-      }
-    }
-    file << endl;
-  }
+void WriteHWCImage(const uint8 *img, int h, int w, int c, const string &file_name) {
+  WriteImageScaleBias(img, h, w, c, 0.f, 1.0f, file_name, outHWCImage);
 }
 
 }  // namespace dali
diff --git a/dali/util/image.h b/dali/util/image.h
index 7f26ae36c76..c847ef09442 100644
--- a/dali/util/image.h
+++ b/dali/util/image.h
@@ -34,105 +34,75 @@
 
 namespace dali {
 
+class ImgSetDescr {               // Raw image buffers together with their sizes
+ public:
+  ~ImgSetDescr()                  { clear(); }
+  inline void clear() {           // Frees every buffer in data_, then empties both vectors
+    for (auto &ptr : data_) delete[] ptr;
+    data_.clear(); sizes_.clear();
+  }
+
+  inline size_t nImages() const   { return data_.size(); }
+
+  vector<uint8 *> data_;  // Freed with delete[]; NOTE(review): a copy would double-free
+  vector<int> sizes_;     // sizes_[i] is the size in bytes of data_[i]
+};
+
 /**
  * Load all images from a list of image names. Assumes names contain
  * full path
  */
-DLL_PUBLIC void LoadImages(const vector<string> &image_names,
-    vector<uint8*> *images, vector<int> *image_sizes);
+DLL_PUBLIC void LoadImages(const vector<string> &image_names, ImgSetDescr *imgs);
 
 /**
  * Loads images from a specified image folder. Assumes the folder contains
  * a file 'image_list.txt' that lists all the different images in the
  * folder
  */
-DLL_PUBLIC void LoadImages(string image_folder, vector<string> *jpeg_names,
-    vector<uint8*> *jpegs, vector<int> *jpeg_sizes);
+DLL_PUBLIC void LoadImages(const string &image_folder, vector<string> *jpeg_names,
+                           ImgSetDescr *imgs);
 
 /**
  * Loads jpegs from a specified image folder. Assumes the folder contains
  * a file 'image_list.txt' that lists all the different images in the
  * folder
  */
-DLL_PUBLIC void LoadJPEGS(string image_folder, vector<string> *jpeg_names,
-    vector<uint8*> *jpegs, vector<int> *jpeg_sizes);
+DLL_PUBLIC void LoadJPEGS(const string &image_folder, vector<string> *jpeg_names,
+                          ImgSetDescr *imgs);
 
 /**
  * Loads all jpegs from the list of image names. Assumes names contains
  * full path
  */
-DLL_PUBLIC void LoadJPEGS(const vector<string> &jpeg_names,
-    vector<uint8*> *jpegs, vector<int> *jpeg_sizes);
+DLL_PUBLIC void LoadJPEGS(const vector<string> &jpeg_names, ImgSetDescr *imgs);
 
 /**
  * @brief Writes the input image as a ppm file
  */
-DLL_PUBLIC void WriteHWCImage(const uint8 *img, int h, int w, int c, string file_name);
+DLL_PUBLIC void WriteHWCImage(const uint8 *img, int h, int w, int c, const string &file_name);
 
-/**
- * @brief Writes all images in a batch
- */
-template <typename Backend>
-DLL_PUBLIC void WriteHWCBatch(const TensorList<Backend> &tl, string suffix) {
-  DALI_ENFORCE(IsType<uint8>(tl.type()));
-  for (int i = 0; i < tl.ntensor(); ++i) {
-    DALI_ENFORCE(tl.tensor_shape(i).size() == 3);
-    int h = tl.tensor_shape(i)[0];
-    int w = tl.tensor_shape(i)[1];
-    int c = tl.tensor_shape(i)[2];
-    WriteHWCImage(tl.template tensor<uint8>(i),
-        h, w, c, std::to_string(i) + "-" + suffix);
-  }
+template <typename T>
+int outHWCImage(const vector<T> &tmp, int h, int w, int c,
+                int i, int j, int k, float bias, float scale) {
+  return static_cast<int>(tmp[i*w*c + j*c + k]*scale + bias);
 }
 
-/**
- * @brief Writes an image after applying a scale and bias to get
- * pixel values in the range 0-255
- */
 template <typename T>
-void WriteHWCImageScaleBias(const T *img, int h, int w,
-    int c, float bias, float scale, string file_name) {
-  DALI_ENFORCE(img != nullptr);
-  DALI_ENFORCE(h >= 0);
-  DALI_ENFORCE(w >= 0);
-  DALI_ENFORCE(c >= 0);
-  CUDA_CALL(cudaDeviceSynchronize());
-  Tensor<GPUBackend> tmp_gpu, double_gpu;
-  tmp_gpu.Resize({h, w, c});
-  tmp_gpu.template mutable_data<T>();  // make sure the buffer is allocated
-  double_gpu.Resize({h, w, c});
-
-  // Copy the data and convert to double
-  MemCopy(tmp_gpu.template mutable_data<T>(), img, tmp_gpu.nbytes());
-  Convert(tmp_gpu.template data<T>(), tmp_gpu.size(), double_gpu.template mutable_data<double>());
-
-  vector<double> tmp(h*w*c, 0);
-  MemCopy(tmp.data(), double_gpu.template data<double>(), double_gpu.nbytes());
-  CUDA_CALL(cudaDeviceSynchronize());
-  std::ofstream file(file_name + ".ppm");
-  DALI_ENFORCE(file.is_open());
-
-  file << "P3" << endl;
-  file << w << " " << h << endl;
-  file << "255" << endl;
-
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      for (int k = 0; k < c; ++k) {
-        file << int(tmp[i*w*c + j*c + k]*scale + bias) << " ";
-      }
-    }
-    file << endl;
-  }
+int outCHWImage(const vector<T> &tmp, int h, int w, int c,
+                int i, int j, int k, float bias, float scale) {
+  return static_cast<int>(tmp[k*h*w + i*w + j]*scale + bias);
 }
 
+typedef int (*outFunc)(const vector<double> &tmp, int h, int w, int c,
+                       int i, int j, int k, float bias, float scale);
+
 /**
  * @brief Writes an image after applying a scale and bias to get
  * pixel values in the range 0-255
  */
 template <typename T>
-void WriteCHWImageScaleBias(const T *img, int h, int w,
-    int c, float bias, float scale, string file_name) {
+void WriteImageScaleBias(const T *img, int h, int w,
+    int c, float bias, float scale, const string &file_name, outFunc pFunc) {
   DALI_ENFORCE(img != nullptr);
   DALI_ENFORCE(h >= 0);
   DALI_ENFORCE(w >= 0);
@@ -147,19 +117,20 @@ void WriteCHWImageScaleBias(const T *img, int h, int w,
   MemCopy(tmp_gpu.template mutable_data<T>(), img, tmp_gpu.nbytes());
   Convert(tmp_gpu.template data<T>(), tmp_gpu.size(), double_gpu.template mutable_data<double>());
 
-  vector<double> tmp(h*w*c, 0);
+  vector<double> tmp(h * w * c, 0);
   MemCopy(tmp.data(), double_gpu.template data<double>(), double_gpu.nbytes());
+  CUDA_CALL(cudaDeviceSynchronize());
   std::ofstream file(file_name + ".ppm");
   DALI_ENFORCE(file.is_open());
 
-  file << "P3" << endl;
+  file << (c == 3? "P3" : "P2") << endl;    // For color/grayscale images, respectively
   file << w << " " << h << endl;
   file << "255" << endl;
 
   for (int i = 0; i < h; ++i) {
     for (int j = 0; j < w; ++j) {
       for (int k = 0; k < c; ++k) {
-        file << int(tmp[k*h*w + i*w + j]*scale + bias) << " ";
+        file << (*pFunc)(tmp, h, w, c, i, j, k, bias, scale) << " ";
       }
     }
     file << endl;
@@ -170,36 +141,34 @@ void WriteCHWImageScaleBias(const T *img, int h, int w,
  * @brief Writes all images in a batch with a scale and bias
  */
 template <typename T, typename Backend>
-void WriteHWCBatch(const TensorList<Backend> &tl, float bias, float scale, string suffix) {
+void WriteBatch(const TensorList<Backend> &tl, float bias, float scale, const string &suffix,
+                const std::array<int, 3> &permute, outFunc pFunc) {
   DALI_ENFORCE(IsType<T>(tl.type()));
   for (int i = 0; i < tl.ntensor(); ++i) {
     DALI_ENFORCE(tl.tensor_shape(i).size() == 3);
-    int h = tl.tensor_shape(i)[0];
-    int w = tl.tensor_shape(i)[1];
-    int c = tl.tensor_shape(i)[2];
-    WriteHWCImageScaleBias(
+    int h = tl.tensor_shape(i)[permute[0]];  // permute[0]: shape axis used as height
+    int w = tl.tensor_shape(i)[permute[1]];  // permute[1]: shape axis used as width
+    int c = tl.tensor_shape(i)[permute[2]];  // permute[2]: shape axis used as channels
+    WriteImageScaleBias(
         tl.template tensor<T>(i),
         h, w, c, bias, scale,
-        std::to_string(i) + "-" + suffix);
+        std::to_string(i) + "-" + suffix, pFunc);
   }
 }
 
-/**
- * @brief Writes all images in a batch with a scale and bias
- */
 template <typename T, typename Backend>
-void WriteCHWBatch(const TensorList<Backend> &tl, float bias, float scale, string suffix) {
-  DALI_ENFORCE(IsType<T>(tl.type()));
-  for (int i = 0; i < tl.ntensor(); ++i) {
-    DALI_ENFORCE(tl.tensor_shape(i).size() == 3);
-    int c = tl.tensor_shape(i)[0];
-    int h = tl.tensor_shape(i)[1];
-    int w = tl.tensor_shape(i)[2];
-    WriteCHWImageScaleBias(
-        tl.template tensor<T>(i),
-        h, w, c, bias, scale,
-        std::to_string(i) + "-" + suffix);
-  }
+void WriteHWCBatch(const TensorList<Backend> &tl, float bias, float scale, const string &suffix) {
+  WriteBatch<T, Backend>(tl, bias, scale, suffix, std::array<int, 3>{0, 1, 2}, outHWCImage);
+}
+
+template <typename T, typename Backend>
+void WriteCHWBatch(const TensorList<Backend> &tl, float bias, float scale, const string &suffix) {
+  WriteBatch<T, Backend>(tl, bias, scale, suffix, std::array<int, 3>{1, 2, 0}, outCHWImage);
+}
+
+template <typename Backend>
+void WriteHWCBatch(const TensorList<Backend> &tl, const string &suffix) {
+  WriteBatch<uint8, Backend>(tl, 0.f, 1.0, suffix, std::array<int, 3>{0, 1, 2}, outHWCImage);
 }
 
 }  // namespace dali