diff --git a/cmake/Dependencies.common.cmake b/cmake/Dependencies.common.cmake index dd8244c2768..3927a4440e7 100644 --- a/cmake/Dependencies.common.cmake +++ b/cmake/Dependencies.common.cmake @@ -273,9 +273,18 @@ endif() # nvimagecodec ################################################################## set(DALI_INSTALL_REQUIRES_NVIMGCODEC "") +set(DALI_INSTALL_REQUIRES_NVJPEG2K "") +set(DALI_INSTALL_REQUIRES_NVTIFF "") if(BUILD_NVIMAGECODEC) - set(NVIMGCODEC_MIN_VERSION "0.3.0") - set(NVIMGCODEC_MAX_VERSION "0.4.0") + set(NVJPEG2K_MIN_VERSION "0.8.0") + set(NVJPEG2K_MAX_VERSION "0.9.0") + + set(NVTIFF_MIN_VERSION "0.4.0") + set(NVTIFF_MAX_VERSION "0.5.0") + + set(NVIMGCODEC_MIN_VERSION "0.4.1") + set(NVIMGCODEC_MAX_VERSION "0.5.0") + message(STATUS "nvImageCodec - requires version >=${NVIMGCODEC_MIN_VERSION}, <${NVIMGCODEC_MAX_VERSION}") if (WITH_DYNAMIC_NVIMGCODEC) message(STATUS "nvImageCodec - dynamic load") @@ -288,8 +297,8 @@ if(BUILD_NVIMAGECODEC) include(FetchContent) FetchContent_Declare( nvimgcodec_headers - URL https://developer.download.nvidia.com/compute/nvimgcodec/redist/nvimgcodec/linux-x86_64/nvimgcodec-linux-x86_64-0.3.0.5-archive.tar.xz - URL_HASH SHA512=259bff93305c301fb4325c6e2f71da93f3f6e0b38c7c8739913ca70b5a9c74cc898a608c5ac6e830dba1739878e53607ded03deaf2f23af3a9cc473463f100eb + URL https://developer.download.nvidia.com/compute/nvimgcodec/redist/nvimgcodec/linux-x86_64/nvimgcodec-linux-x86_64-0.4.1.21-archive.tar.xz + URL_HASH SHA512=3f20f6944a360597586bfe3550a0605257bcd944748477a869691ec1a42716e3722f8ddbd0b525995ebab89a33cd91ed82d5b151194008f1a8424971448a4824 ) FetchContent_Populate(nvimgcodec_headers) set(nvimgcodec_SEARCH_PATH "${nvimgcodec_headers_SOURCE_DIR}/${CUDA_VERSION_MAJOR}/include") @@ -304,7 +313,27 @@ if(BUILD_NVIMAGECODEC) message(STATUS "NVIMGCODEC_DEFAULT_INSTALL_PATH=${NVIMGCODEC_DEFAULT_INSTALL_PATH}") add_definitions(-DNVIMGCODEC_DEFAULT_INSTALL_PATH=\"${NVIMGCODEC_DEFAULT_INSTALL_PATH}\") - 
set(DALI_INSTALL_REQUIRES_NVIMGCODEC "\'nvidia-nvimgcodec-cu${CUDA_VERSION_MAJOR} >= ${NVIMGCODEC_MIN_VERSION}, < ${NVIMGCODEC_MAX_VERSION}',") + # Find the position of the substring + string(FIND "aarch64-linux-gnu" "${CMAKE_PREFIX_PATH}" SUBSTRING_POSITION) + if(NOT SUBSTRING_POSITION EQUAL -1) + # Substring found + set(NVIMGCODEC_PACKAGE_NAME "nvidia-nvimgcodec-cu${CUDA_VERSION_MAJOR}") + set(NVJPEG2K_PACKAGE_NAME "nvidia-nvjpeg2k-cu${CUDA_VERSION_MAJOR}") + set(NVTIFF_PACKAGE_NAME "nvidia-nvtiff-cu${CUDA_VERSION_MAJOR}") + else() + # Substring not found + set(NVIMGCODEC_PACKAGE_NAME "nvidia-nvimgcodec-tegra-cu${CUDA_VERSION_MAJOR}") + set(NVJPEG2K_PACKAGE_NAME "nvidia-nvjpeg2k-tegra-cu${CUDA_VERSION_MAJOR}") + set(NVTIFF_PACKAGE_NAME "nvidia-nvtiff-tegra-cu${CUDA_VERSION_MAJOR}") + endif() + + # TODO(janton): Replace with nvimgcodec[nvtiff+nvjpeg2k+...] when available + set(DALI_INSTALL_REQUIRES_NVJPEG2K "\'${NVJPEG2K_PACKAGE_NAME} >= ${NVJPEG2K_MIN_VERSION}, < ${NVJPEG2K_MAX_VERSION}',") + message(STATUS "Adding nvjpeg2k requirement as: ${DALI_INSTALL_REQUIRES_NVJPEG2K}") + set(DALI_INSTALL_REQUIRES_NVTIFF "\'${NVTIFF_PACKAGE_NAME} >= ${NVTIFF_MIN_VERSION}, < ${NVTIFF_MAX_VERSION}',") + message(STATUS "Adding nvtiff requirement as: ${DALI_INSTALL_REQUIRES_NVTIFF}") + set(DALI_INSTALL_REQUIRES_NVIMGCODEC "\'${NVIMGCODEC_PACKAGE_NAME} >= ${NVIMGCODEC_MIN_VERSION}, < ${NVIMGCODEC_MAX_VERSION}',") + message(STATUS "Adding nvimagecodec requirement as: ${DALI_INSTALL_REQUIRES_NVIMGCODEC}") else() message(STATUS "nvImageCodec - static link") @@ -321,7 +350,7 @@ if(BUILD_NVIMAGECODEC) ExternalProject_Add( nvImageCodec GIT_REPOSITORY https://github.com/NVIDIA/nvImageCodec.git - GIT_TAG v0.3.0 + GIT_TAG v0.4.0 GIT_SUBMODULES "external/pybind11" "external/NVTX" "external/googletest" @@ -336,6 +365,8 @@ if(BUILD_NVIMAGECODEC) "-DWITH_DYNAMIC_NVJPEG2K=OFF" "-DBUILD_NVJPEG_EXT=${BUILD_NVJPEG}" "-DWITH_DYNAMIC_NVJPEG=${WITH_DYNAMIC_NVJPEG}" + "-DBUILD_NVTIFF_EXT=OFF" + 
"-DWITH_DYNAMIC_NVTIFF=OFF" "-DBUILD_NVBMP_EXT=OFF" "-DBUILD_NVPNM_EXT=OFF" "-DBUILD_LIBJPEG_TURBO_EXT=${BUILD_LIBJPEG_TURBO}" diff --git a/conda/third_party/dali_nvimagecodec/recipe/meta.yaml b/conda/third_party/dali_nvimagecodec/recipe/meta.yaml index 4c67501779b..83a402ac214 100644 --- a/conda/third_party/dali_nvimagecodec/recipe/meta.yaml +++ b/conda/third_party/dali_nvimagecodec/recipe/meta.yaml @@ -13,7 +13,7 @@ # limitations under the License. -{% set build_version = "0.3.0" %} +{% set build_version = "0.4.0" %} package: name: nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }} @@ -21,7 +21,7 @@ package: source: git_url: https://github.com/NVIDIA/nvImageCodec.git - git_rev: v0.3.0 + git_rev: v0.4.0 build: number: 0 diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h index aed89353e41..129b198e658 100644 --- a/dali/operators/imgcodec/image_decoder.h +++ b/dali/operators/imgcodec/image_decoder.h @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // limitations under the License. 
+#include #include #include #include @@ -32,14 +33,13 @@ #include "dali/pipeline/operator/operator.h" #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED) -nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_opencv_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_nvjpeg_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_nvjpeg2k_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); +nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_opencv_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_nvjpeg_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_nvjpeg2k_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); #endif - #ifndef DALI_OPERATORS_IMGCODEC_IMAGE_DECODER_H_ #define DALI_OPERATORS_IMGCODEC_IMAGE_DECODER_H_ @@ -119,7 +119,7 @@ inline int static_dali_pinned_free(void *ctx, void *ptr, size_t size, cudaStream return cudaSuccess; } -inline void get_nvimgcodec_version(int* major, int *minor, int* patch) { +inline void get_nvimgcodec_version(int *major, int *minor, int *patch) { static int s_major = -1, s_minor = -1, s_patch = -1; auto version_check_f = [&] { nvimgcodecProperties_t properties{NVIMGCODEC_STRUCTURE_TYPE_PROPERTIES, @@ -148,7 +148,7 @@ class ImageDecoder : public StatelessOperator { ~ImageDecoder() override { #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED) decoder_.reset(); // first stop the decoder - for (auto& extension : extensions_) { + for (auto &extension : extensions_) { nvimgcodecExtensionDestroy(extension); } #endif @@ -176,6 +176,7 @@ class ImageDecoder : public StatelessOperator { DALIImageType orig_img_type; DALIImageType req_img_type; float dyn_range_multiplier = 
1.0f; + bool load_from_cache = false; mm::uptr host_buf; mm::async_uptr device_buf; @@ -229,11 +230,9 @@ class ImageDecoder : public StatelessOperator { if (std::is_same::value) { thread_pool_ = std::make_unique(num_threads_, device_id_, spec.GetArgument("affine"), "MixedDecoder"); - if (spec_.HasArgument("cache_size")) cache_ = std::make_unique(spec_); } - EnforceMinimumNvimgcodecVersion(); nvimgcodecDeviceAllocator_t *dev_alloc_ptr = nullptr; @@ -267,7 +266,7 @@ class ImageDecoder : public StatelessOperator { nullptr}; const char *log_lvl_env = std::getenv("DALI_NVIMGCODEC_LOG_LEVEL"); - int log_lvl = log_lvl_env ? clamp(atoi(log_lvl_env), 1, 5): 2; + int log_lvl = log_lvl_env ? clamp(atoi(log_lvl_env), 1, 5) : 2; instance_create_info.load_extension_modules = static_cast(WITH_DYNAMIC_NVIMGCODEC_ENABLED); instance_create_info.load_builtin_modules = static_cast(true); @@ -355,8 +354,7 @@ class ImageDecoder : public StatelessOperator { opts_.add_module_option("nvjpeg_cuda_decoder", "preallocate_buffers", true); // Batch size - opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", - std::max(1, max_batch_size_)); + opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", max_batch_size_); // Nvjpeg2k parallel tiles opts_.add_module_option("nvjpeg2k_cuda_decoder", "num_parallel_tiles", 16); @@ -367,32 +365,35 @@ class ImageDecoder : public StatelessOperator { backends_.clear(); backends_.reserve(4); if (nvimgcodec_device_id != NVIMGCODEC_DEVICE_CPU_ONLY) { - backends_.push_back( - nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_HW_GPU_ONLY, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - sizeof(nvimgcodecBackendParams_t), nullptr, hw_load}}); - backends_.push_back(nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_GPU_ONLY, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - 
sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f}}); - backends_.push_back(nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_HYBRID_CPU_GPU, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_HW_GPU_ONLY, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, + hw_load, NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_GPU_ONLY, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, + 1.0f, NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_HYBRID_CPU_GPU, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, + 1.0f, NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); } - backends_.push_back(nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_CPU_ONLY, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_CPU_ONLY, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f, + NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); exec_params_.backends = backends_.data(); exec_params_.num_backends = backends_.size(); @@ -401,13 +402,27 @@ class ImageDecoder : public StatelessOperator { exec_params_.executor = &executor_; exec_params_.max_num_cpu_threads = num_threads_; 
exec_params_.pre_init = 1; + exec_params_.skip_pre_sync = 1; // we are not doing stream allocations before decoding. decoder_ = NvImageCodecDecoder::Create(instance_, &exec_params_, opts_.to_string()); } - nvimgcodecStatus_t launch(int device_id, int sample_idx, void *task_context, - void (*task)(int thread_id, int sample_idx, void *task_context)) { + nvimgcodecStatus_t schedule(int device_id, int sample_idx, void *task_context, + void (*task)(int thread_id, int sample_idx, void *task_context)) { + assert(tp_); + tp_->AddWork([=](int tid) { task(tid, sample_idx, task_context); }, -(task_count_++), false); + return NVIMGCODEC_STATUS_SUCCESS; + } + + nvimgcodecStatus_t run(int device_id) { + assert(tp_); + tp_->RunAll(false); + task_count_ = 0; + return NVIMGCODEC_STATUS_SUCCESS; + } + + nvimgcodecStatus_t wait(int device_id) { assert(tp_); - tp_->AddWork([=](int tid) { task(tid, sample_idx, task_context); }, 0, true); + tp_->WaitForWork(); return NVIMGCODEC_STATUS_SUCCESS; } @@ -417,12 +432,22 @@ class ImageDecoder : public StatelessOperator { return num_threads_; } - static nvimgcodecStatus_t static_launch(void *instance, int device_id, int sample_idx, - void *task_context, - void (*task)(int thread_id, int sample_idx, - void *task_context)) { + static nvimgcodecStatus_t static_schedule(void *instance, int device_id, int sample_idx, + void *task_context, + void (*task)(int thread_id, int sample_idx, + void *task_context)) { + auto *handle = static_cast *>(instance); + return handle->schedule(device_id, sample_idx, task_context, task); + } + + static nvimgcodecStatus_t static_run(void *instance, int device_id) { + auto *handle = static_cast *>(instance); + return handle->run(device_id); + } + + static nvimgcodecStatus_t static_wait(void *instance, int device_id) { auto *handle = static_cast *>(instance); - return handle->launch(device_id, sample_idx, task_context, task); + return handle->wait(device_id); } static int static_get_num_threads(void *instance) { @@ 
-466,6 +491,19 @@ class ImageDecoder : public StatelessOperator { // Make sure we set the default that DALI expects if (decoder_params_.count("jpeg_fancy_upsampling") == 0) decoder_params_["jpeg_fancy_upsampling"] = false; + + // Overriding hw_decoder_load default if nvImageCodec 0.4.x version + if (!spec.HasArgument("hw_decoder_load") && decoder_params_.count("hw_decoder_load") > 0) { + int major = -1, minor = -1, patch = -1; + get_nvimgcodec_version(&major, &minor, &patch); + if (MAKE_SEMANTIC_VERSION(major, minor, patch) >= MAKE_SEMANTIC_VERSION(0, 4, 0) && + MAKE_SEMANTIC_VERSION(major, minor, patch) < MAKE_SEMANTIC_VERSION(0, 5, 0)) { + DALI_WARN( + "nvImageCodec 0.4.x version detected. Setting hw_decoder_load to 1.0f for " + "deterministic results"); + decoder_params_["hw_decoder_load"] = 1.0f; + } + } } void ParseSample(ParsedSample &parsed_sample, span encoded) { @@ -500,90 +538,8 @@ class ImageDecoder : public StatelessOperator { return std::is_same::value ? thread_pool_.get() : &ws.GetThreadPool(); } - - bool SetupImpl(std::vector &output_descs, const Workspace &ws) override { - DomainTimeRange tr("Setup", DomainTimeRange::kOrange); - tp_ = GetThreadPool(ws); - assert(tp_ != nullptr); - auto auto_cleanup = AtScopeExit([&] { - tp_ = nullptr; - }); - - output_descs.resize(1); - auto &input = ws.template Input(0); - int nsamples = input.num_samples(); - - SetupRoiGenerator(spec_, ws); - TensorListShape<> shapes; - shapes.resize(nsamples, 3); - while (static_cast(state_.size()) < nsamples) - state_.push_back(std::make_unique()); - rois_.resize(nsamples); - - const bool use_cache = cache_ && cache_->IsCacheEnabled() && dtype_ == DALI_UINT8; - auto get_task = [&](int block_idx, int nblocks) { - return [&, block_idx, nblocks](int tid) { - int i_start = nsamples * block_idx / nblocks; - int i_end = nsamples * (block_idx + 1) / nblocks; - for (int i = i_start; i < i_end; i++) { - auto *st = state_[i].get(); - assert(st != nullptr); - const auto &input_sample = 
input[i]; - - auto src_info = input.GetMeta(i).GetSourceInfo(); - if (use_cache && cache_->IsInCache(src_info)) { - auto cached_shape = cache_->CacheImageShape(src_info); - auto roi = GetRoi(spec_, ws, i, cached_shape); - if (!roi.use_roi()) { - shapes.set_tensor_shape(i, cached_shape); - continue; - } - } - ParseSample(st->parsed_sample, - span{static_cast(input_sample.raw_data()), - volume(input_sample.shape())}); - st->out_shape = st->parsed_sample.dali_img_info.shape; - st->out_shape[2] = NumberOfChannels(format_, st->out_shape[2]); - if (use_orientation_ && - (st->parsed_sample.nvimgcodec_img_info.orientation.rotated % 180 != 0)) { - std::swap(st->out_shape[0], st->out_shape[1]); - } - ROI &roi = rois_[i] = GetRoi(spec_, ws, i, st->out_shape); - if (roi.use_roi()) { - auto roi_sh = roi.shape(); - if (roi.end.size() >= 2) { - DALI_ENFORCE(0 <= roi.end[0] && roi.end[0] <= st->out_shape[0] && - 0 <= roi.end[1] && roi.end[1] <= st->out_shape[1], - "ROI end must fit within the image bounds"); - } - if (roi.begin.size() >= 2) { - DALI_ENFORCE(0 <= roi.begin[0] && roi.begin[0] <= st->out_shape[0] && - 0 <= roi.begin[1] && roi.begin[1] <= st->out_shape[1], - "ROI begin must fit within the image bounds"); - } - st->out_shape[0] = roi_sh[0]; - st->out_shape[1] = roi_sh[1]; - } - shapes.set_tensor_shape(i, st->out_shape); - } - }; - }; - - int nblocks = tp_->NumThreads() + 1; - if (nsamples > nblocks * 4) { - int block_idx = 0; - for (; block_idx < tp_->NumThreads(); block_idx++) { - tp_->AddWork(get_task(block_idx, nblocks), -block_idx); - } - tp_->RunAll(false); // start work but not wait - get_task(block_idx, nblocks)(-1); // run last block - tp_->WaitForWork(); // wait for the other threads - } else { // not worth parallelizing - get_task(0, 1)(-1); // run all in current thread - } - - output_descs[0] = {std::move(shapes), dtype_}; - return true; + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { + return false; } /** @@ -602,14 +558,10 @@ 
class ImageDecoder : public StatelessOperator { * to the decoding function */ bool need_host_sync_alloc() { - int major, minor, patch; - get_nvimgcodec_version(&major, &minor, &patch); return !version_at_least(0, 3, 0); } - template - void PrepareOutput(SampleState &st, SampleView out, const ROI &roi, - const Workspace &ws) { + void PrepareOutput(SampleState &st, void *out_ptr, const ROI &roi, const Workspace &ws) { // Make a copy of the parsed img info. We might modify it // (for example, request planar vs. interleaved, etc) st.image_info = st.parsed_sample.nvimgcodec_img_info; @@ -659,6 +611,8 @@ class ImageDecoder : public StatelessOperator { int precision = st.image_info.plane_info[0].precision; if (precision == 0) precision = PositiveBits(st.parsed_sample.orig_dtype); + if (precision == 1 && st.parsed_sample.orig_dtype == DALI_UINT8) + precision = 8; // nvimgcodec produces at minimum uint8 dynamic range (0..255) bool need_dynamic_range_scaling = NeedDynamicRangeScaling(precision, st.parsed_sample.orig_dtype); st.dyn_range_multiplier = need_dynamic_range_scaling ? 
@@ -690,7 +644,7 @@ class ImageDecoder : public StatelessOperator { st.decode_out_cpu = {st.image_info.buffer, decode_shape, st.parsed_sample.orig_dtype}; } } else { - st.image_info.buffer = static_cast(out.raw_mutable_data()); + st.image_info.buffer = out_ptr; } st.image_info.num_planes = 1; @@ -703,12 +657,22 @@ class ImageDecoder : public StatelessOperator { st.image = NvImageCodecImage::Create(instance_, &st.image_info); } + bool HasContiguousOutputs() const override { + return false; + } + void RunImplImpl(Workspace &ws) { const auto &input = ws.Input(0); + int nsamples = input.num_samples(); auto &output = ws.template Output::type>(0); + // it complains if we try to set the sample dim after it is already allocated + // even if the sample dim didn't change + if (output.sample_dim() != 3) + output.set_sample_dim(3); + output.set_type(dtype_); + output.SetContiguity(BatchContiguity::Noncontiguous); output.SetLayout("HWC"); - int nsamples = input.num_samples(); - assert(output.num_samples() == nsamples); + output.SetSize(nsamples); tp_ = GetThreadPool(ws); assert(tp_ != nullptr); @@ -716,14 +680,10 @@ class ImageDecoder : public StatelessOperator { tp_ = nullptr; }); - bool has_any_roi = false; - for (auto &roi : rois_) - has_any_roi |= roi.use_roi(); - - nvimgcodecDecodeParams_t decode_params = {NVIMGCODEC_STRUCTURE_TYPE_DECODE_PARAMS, - sizeof(nvimgcodecDecodeParams_t), nullptr}; - decode_params.apply_exif_orientation = static_cast(use_orientation_); - decode_params.enable_roi = static_cast(has_any_roi); + SetupRoiGenerator(spec_, ws); + while (static_cast(state_.size()) < nsamples) + state_.push_back(std::make_unique()); + rois_.resize(nsamples); assert(static_cast(state_.size()) >= nsamples); batch_encoded_streams_.clear(); @@ -732,125 +692,174 @@ class ImageDecoder : public StatelessOperator { batch_images_.reserve(nsamples); decode_sample_idxs_.clear(); decode_sample_idxs_.reserve(nsamples); + decode_status_.clear(); - // TODO(janton): consider extending 
cache to different dtype as well const bool use_cache = cache_ && cache_->IsCacheEnabled() && dtype_ == DALI_UINT8; - if (use_cache) { - int samples_to_load = 0; - DomainTimeRange tr(make_string("CacheLoad"), DomainTimeRange::kOrange); - for (int orig_idx = 0; orig_idx < nsamples; orig_idx++) { - auto src_info = input.GetMeta(orig_idx).GetSourceInfo(); - // To simplify things, we do not allow caching ROIs - bool has_roi = rois_[orig_idx].use_roi(); - if (cache_->IsInCache(src_info) && !has_roi) { - cache_->DeferCacheLoad(src_info, output.template mutable_tensor(orig_idx)); - samples_to_load++; - } else { - decode_sample_idxs_.push_back(orig_idx); + auto setup_block = [&](int block_idx, int nblocks, int tid) { + int i_start = nsamples * block_idx / nblocks; + int i_end = nsamples * (block_idx + 1) / nblocks; + DomainTimeRange tr("Setup #" + std::to_string(block_idx) + "/" + std::to_string(nblocks), + DomainTimeRange::kOrange); + for (int i = i_start; i < i_end; i++) { + auto *st = state_[i].get(); + assert(st != nullptr); + const auto &input_sample = input[i]; + + auto src_info = input.GetMeta(i).GetSourceInfo(); + if (use_cache && cache_->IsInCache(src_info)) { + auto cached_shape = cache_->CacheImageShape(src_info); + auto roi = GetRoi(spec_, ws, i, cached_shape); + if (!roi.use_roi()) { + st->load_from_cache = true; + output.ResizeSample(i, st->out_shape); + st->image_info.buffer = output.raw_mutable_tensor(i); + continue; + } + } + st->load_from_cache = false; + ParseSample(st->parsed_sample, + span{static_cast(input_sample.raw_data()), + volume(input_sample.shape())}); + st->out_shape = st->parsed_sample.dali_img_info.shape; + st->out_shape[2] = NumberOfChannels(format_, st->out_shape[2]); + if (use_orientation_ && + (st->parsed_sample.nvimgcodec_img_info.orientation.rotated % 180 != 0)) { + std::swap(st->out_shape[0], st->out_shape[1]); } - } - if (samples_to_load > 0) - cache_->LoadDeferred(ws.stream()); - } else { - decode_sample_idxs_.resize(nsamples); - 
std::iota(decode_sample_idxs_.begin(), decode_sample_idxs_.end(), 0); - } - int decode_nsamples = decode_sample_idxs_.size(); - { - DomainTimeRange tr(make_string("Prepare descs"), DomainTimeRange::kOrange); - auto get_task = [&](int block_idx, int nblocks) { - return [&, block_idx, nblocks](int tid) { - int i_start = decode_nsamples * block_idx / nblocks; - int i_end = decode_nsamples * (block_idx + 1) / nblocks; - for (int i = i_start; i < i_end; i++) { - int orig_idx = decode_sample_idxs_[i]; - PrepareOutput(*state_[orig_idx], output[orig_idx], rois_[orig_idx], ws); + ROI &roi = rois_[i] = GetRoi(spec_, ws, i, st->out_shape); + if (roi.use_roi()) { + auto roi_sh = roi.shape(); + if (roi.end.size() >= 2) { + DALI_ENFORCE(0 <= roi.end[0] && roi.end[0] <= st->out_shape[0] && 0 <= roi.end[1] && + roi.end[1] <= st->out_shape[1], + "ROI end must fit within the image bounds"); } - }; - }; + if (roi.begin.size() >= 2) { + DALI_ENFORCE(0 <= roi.begin[0] && roi.begin[0] <= st->out_shape[0] && + 0 <= roi.begin[1] && roi.begin[1] <= st->out_shape[1], + "ROI begin must fit within the image bounds"); + } + st->out_shape[0] = roi_sh[0]; + st->out_shape[1] = roi_sh[1]; + } + output.ResizeSample(i, st->out_shape); + PrepareOutput(*state_[i], output.raw_mutable_tensor(i), rois_[i], ws); + assert(!ws.has_stream() || ws.stream() == st->image_info.cuda_stream); + } + }; - int nblocks = tp_->NumThreads() + 1; - if (decode_nsamples > nblocks * 4) { - int block_idx = 0; - for (; block_idx < tp_->NumThreads(); block_idx++) { - tp_->AddWork(get_task(block_idx, nblocks), -block_idx); + int nsamples_per_block = 16; + int nblocks = std::max(1, nsamples / nsamples_per_block); + int ntasks = std::min(nblocks, std::min(8, tp_->NumThreads() + 1)); + + if (ntasks < 2) { + setup_block(0, 1, -1); // run all in current thread + } else { + int block_idx = 0; + atomic_idx_.store(0); + auto setup_task = [&, nblocks](int tid) { + DomainTimeRange tr("Setup", DomainTimeRange::kOrange); + int block_idx; + 
while ((block_idx = atomic_idx_.fetch_add(1)) < nblocks) { + setup_block(block_idx, nblocks, tid); } - tp_->RunAll(false); // start work but not wait - get_task(block_idx, nblocks)(-1); // run last block - tp_->WaitForWork(); // wait for the other threads - } else { // not worth parallelizing - get_task(0, 1)(-1); // run all in current thread + }; + + for (int task_idx = 0; task_idx < ntasks - 1; task_idx++) { + tp_->AddWork(setup_task, -task_idx); } + assert(ntasks >= 2); + tp_->RunAll(false); // start work but not wait + setup_task(-1); // last task in current thread + tp_->WaitForWork(); // wait for the other threads + } - for (int orig_idx : decode_sample_idxs_) { - auto &st = *state_[orig_idx]; + bool any_need_processing = false; + bool has_any_roi = false; + for (int orig_idx = 0; orig_idx < nsamples; orig_idx++) { + auto &st = *state_[orig_idx]; + any_need_processing |= state_[orig_idx]->need_processing; + if (use_cache && st.load_from_cache) { + auto *data_ptr = output.raw_mutable_tensor(orig_idx); + auto src_info = input.GetMeta(orig_idx).GetSourceInfo(); + cache_->DeferCacheLoad(src_info, static_cast(data_ptr)); + } else { + has_any_roi |= rois_[orig_idx].use_roi(); batch_encoded_streams_.push_back(st.parsed_sample.encoded_stream); batch_images_.push_back(st.image); + decode_sample_idxs_.push_back(orig_idx); } } + size_t nsamples_decode = batch_images_.size(); + size_t nsamples_cache = nsamples - nsamples_decode; - // This is a workaround for nvImageCodec <= 0.2 - auto any_need_processing = [&]() { - for (int orig_idx : decode_sample_idxs_) { - auto& st = state_[orig_idx]; - assert(ws.stream() == st->image_info.cuda_stream); // assuming this is true - if (st->need_processing) - return true; - } - return false; - }; - if (ws.has_stream() && need_host_sync_alloc() && any_need_processing()) { + if (ws.has_stream() && need_host_sync_alloc() && any_need_processing) { DomainTimeRange tr("alloc sync", DomainTimeRange::kOrange); 
CUDA_CALL(cudaStreamSynchronize(ws.stream())); } - { - DomainTimeRange tr("Decode", DomainTimeRange::kOrange); + if (use_cache && nsamples_cache > 0) { + DomainTimeRange tr("LoadDeferred", DomainTimeRange::kOrange); + cache_->LoadDeferred(ws.stream()); + } + + if (nsamples_decode > 0) { nvimgcodecFuture_t future; - decode_status_.resize(decode_nsamples); - size_t status_size = 0; - CHECK_NVIMGCODEC(nvimgcodecDecoderDecode(decoder_, batch_encoded_streams_.data(), - batch_images_.data(), decode_nsamples, - &decode_params, &future)); - CHECK_NVIMGCODEC( - nvimgcodecFutureGetProcessingStatus(future, decode_status_.data(), &status_size)); - if (static_cast(status_size) != decode_nsamples) - throw std::logic_error("Failed to retrieve processing status"); - CHECK_NVIMGCODEC(nvimgcodecFutureDestroy(future)); - - for (int i = 0; i < decode_nsamples; i++) { - if (decode_status_[i] != NVIMGCODEC_PROCESSING_STATUS_SUCCESS) { - int orig_idx = decode_sample_idxs_[i]; + decode_status_.resize(nsamples_decode); + size_t decode_status_size = 0; + nvimgcodecDecodeParams_t decode_params = {NVIMGCODEC_STRUCTURE_TYPE_DECODE_PARAMS, + sizeof(nvimgcodecDecodeParams_t), nullptr}; + decode_params.apply_exif_orientation = static_cast(use_orientation_); + decode_params.enable_roi = static_cast(has_any_roi); + + { + DomainTimeRange tr("nvimgcodecDecoderDecode", DomainTimeRange::kOrange); + CHECK_NVIMGCODEC(nvimgcodecDecoderDecode(decoder_, batch_encoded_streams_.data(), + batch_images_.data(), nsamples_decode, + &decode_params, &future)); + CHECK_NVIMGCODEC(nvimgcodecFutureWaitForAll(future)); + CHECK_NVIMGCODEC(nvimgcodecFutureGetProcessingStatus(future, decode_status_.data(), + &decode_status_size)); + CHECK_NVIMGCODEC(nvimgcodecFutureDestroy(future)); + } + if (decode_status_size != nsamples_decode) + throw std::runtime_error("Failed to run decoder"); + for (size_t idx = 0; idx < nsamples_decode; idx++) { + size_t orig_idx = decode_sample_idxs_[idx]; + auto st_ptr = state_[orig_idx].get(); 
+ if (decode_status_[idx] != NVIMGCODEC_PROCESSING_STATUS_SUCCESS) { throw std::runtime_error(make_string("Failed to decode sample #", orig_idx, " : ", input.GetMeta(orig_idx).GetSourceInfo())); } } - } - - for (int orig_idx : decode_sample_idxs_) { - auto st_ptr = state_[orig_idx].get(); - if (st_ptr->need_processing) { - tp_->AddWork( - [&, out = output[orig_idx], st_ptr, orig_idx](int tid) { - DomainTimeRange tr(make_string("Convert #", orig_idx), DomainTimeRange::kOrange); - auto &st = *st_ptr; - if constexpr (std::is_same::value) { - ConvertGPU(out, st.req_layout, st.req_img_type, st.decode_out_gpu, st.req_layout, - st.orig_img_type, ws.stream(), ROI{}, nvimgcodecOrientation_t{}, - st.dyn_range_multiplier); - st.device_buf.reset(); - } else { - assert(st.dyn_range_multiplier == 1.0f); // TODO(janton): enable - ConvertCPU(out, st.req_layout, st.req_img_type, st.decode_out_cpu, st.req_layout, - st.orig_img_type, ROI{}, nvimgcodecOrientation_t{}); - st.host_buf.reset(); - } - }, - -orig_idx); + if (any_need_processing) { + for (size_t idx = 0; idx < nsamples_decode; idx++) { + size_t orig_idx = decode_sample_idxs_[idx]; + auto st_ptr = state_[orig_idx].get(); + if (st_ptr->need_processing) { + tp_->AddWork( + [&, out = output[orig_idx], st_ptr, orig_idx](int tid) { + DomainTimeRange tr(make_string("Convert #", orig_idx), DomainTimeRange::kOrange); + auto &st = *st_ptr; + if constexpr (std::is_same::value) { + ConvertGPU(out, st.req_layout, st.req_img_type, st.decode_out_gpu, + st.req_layout, st.orig_img_type, ws.stream(), ROI{}, + nvimgcodecOrientation_t{}, st.dyn_range_multiplier); + st.device_buf.reset(); + } else { + assert(st.dyn_range_multiplier == 1.0f); // TODO(janton): enable + ConvertCPU(out, st.req_layout, st.req_img_type, st.decode_out_cpu, + st.req_layout, st.orig_img_type, ROI{}, nvimgcodecOrientation_t{}); + st.host_buf.reset(); + } + }, + -idx); + } + } + tp_->RunAll(true); } } - tp_->RunAll(); if (use_cache) { DomainTimeRange 
tr(make_string("CacheStore"), DomainTimeRange::kOrange); @@ -882,8 +891,11 @@ class ImageDecoder : public StatelessOperator { sizeof(nvimgcodecExecutorDesc_t), nullptr, this, - &static_launch, + &static_schedule, + &static_run, + &static_wait, &static_get_num_threads}; + nvimgcodecDeviceAllocator_t dev_alloc_ = {}; nvimgcodecPinnedAllocator_t pinned_alloc_ = {}; std::vector backends_; @@ -899,7 +911,8 @@ class ImageDecoder : public StatelessOperator { bool use_orientation_ = true; int max_batch_size_ = 1; int num_threads_ = -1; - ThreadPool* tp_ = nullptr; + ThreadPool *tp_ = nullptr; + int64_t task_count_ = 0; std::vector> state_; std::vector batch_encoded_streams_; std::vector batch_images_; @@ -909,6 +922,8 @@ class ImageDecoder : public StatelessOperator { // This vector is used to get the original index of the decoded samples std::vector decode_sample_idxs_; + std::atomic atomic_idx_; + // Manually loaded extensions std::vector extensions_descs_; std::vector extensions_; diff --git a/dali/python/setup.py.in b/dali/python/setup.py.in index e49dbff2d04..179e4d04430 100644 --- a/dali/python/setup.py.in +++ b/dali/python/setup.py.in @@ -89,6 +89,8 @@ For more details please check the 'six >= 1.16, <= 1.16', 'dm-tree <= 0.1.8', 'packaging <= 24.2', + @DALI_INSTALL_REQUIRES_NVJPEG2K@ + @DALI_INSTALL_REQUIRES_NVTIFF@ @DALI_INSTALL_REQUIRES_NVIMGCODEC@ ], ) diff --git a/dali/test/python/decoder/test_imgcodec.py b/dali/test/python/decoder/test_imgcodec.py index 3d85a8e4445..9f086a47a2e 100644 --- a/dali/test/python/decoder/test_imgcodec.py +++ b/dali/test/python/decoder/test_imgcodec.py @@ -177,7 +177,10 @@ def run_decode_fused(test_fun, path, img_type, batch, device, threads, validatio dump_as_core_artifacts( img_1.source_info(), arr_1, arr_2, iter=it, sample_idx=sample_idx ) - assert is_ok, f"{validation_fun.__name__}\nimage: {img_1.source_info()}" + assert is_ok, ( + f"{validation_fun.__name__}\n" + + f"image: {img_1.source_info()} iter: {it} sample_idx: 
{sample_idx}" + ) def test_image_decoder_fused(): diff --git a/docker/Dockerfile.build.aarch64-linux b/docker/Dockerfile.build.aarch64-linux index 62cf6dc3907..b02026ea26f 100644 --- a/docker/Dockerfile.build.aarch64-linux +++ b/docker/Dockerfile.build.aarch64-linux @@ -147,7 +147,7 @@ CMD WERROR=ON \ WITH_DYNAMIC_NVJPEG=ON \ WITH_DYNAMIC_CUFFT=ON \ WITH_DYNAMIC_NPP=ON \ - WITH_DYNAMIC_NVIMGCODEC=OFF \ + WITH_DYNAMIC_NVIMGCODEC=ON \ NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-0} \ WHL_PLATFORM_NAME=manylinux2014_aarch64 \ BUNDLE_PATH_PREFIX="/usr/aarch64-linux-gnu" \ diff --git a/include/dali/core/dev_string.h b/include/dali/core/dev_string.h index ad85a89f000..29b1ae432eb 100644 --- a/include/dali/core/dev_string.h +++ b/include/dali/core/dev_string.h @@ -173,7 +173,7 @@ inline __device__ DeviceString dev_to_string(const void *ptr) { while (x) { int digit = x&0xf; x >>= 4;; - buf[--cursor] = "01234567889ABCDEF"[digit]; + buf[--cursor] = "0123456789ABCDEF"[digit]; } buf[--cursor] = 'x'; buf[--cursor] = '0'; diff --git a/internal_tools/hw_decoder_bench.py b/internal_tools/hw_decoder_bench.py index 7d074e0dcda..8e1189b0c77 100644 --- a/internal_tools/hw_decoder_bench.py +++ b/internal_tools/hw_decoder_bench.py @@ -79,6 +79,12 @@ type=int, ) +parser.add_argument( + "--experimental_decoder", + action="store_true", + help="If True, uses the experimental decoder instead of the default", +) + args = parser.parse_args() DALI_INPUT_NAME = "DALI_INPUT_0" @@ -86,12 +92,15 @@ @pipeline_def( - batch_size=args.batch_size, num_threads=args.num_threads, device_id=args.device_id, seed=0 + batch_size=args.batch_size, + num_threads=args.num_threads, + device_id=args.device_id, + seed=0, ) -def DecoderPipeline(): +def DecoderPipeline(decoders_module=fn.decoders): device = "mixed" if args.device == "gpu" else "cpu" jpegs, _ = fn.readers.file(file_root=args.images_dir) - images = fn.decoders.image( + images = decoders_module.image( jpegs, device=device, output_type=types.RGB, @@ -103,12 +112,15 
@@ def DecoderPipeline(): @pipeline_def( - batch_size=args.batch_size, num_threads=args.num_threads, device_id=args.device_id, seed=0 + batch_size=args.batch_size, + num_threads=args.num_threads, + device_id=args.device_id, + seed=0, ) -def RN50Pipeline(minibatch_size): +def RN50Pipeline(minibatch_size, decoders_module=fn.decoders): device = "mixed" if args.device == "gpu" else "cpu" jpegs, _ = fn.readers.file(file_root=args.images_dir) - images = fn.decoders.image_random_crop( + images = decoders_module.image_random_crop( jpegs, device=device, output_type=types.RGB, @@ -138,10 +150,10 @@ def RN50Pipeline(minibatch_size): device_id=args.device_id, seed=0, enable_conditionals=True, + decoders_module=fn.decoders, ) def EfficientnetTrainingPipeline( - minibatch_size, - automatic_augmentation="autoaugment", + minibatch_size, automatic_augmentation="autoaugment", decoders_module=fn.decoders ): dali_device = args.device output_layout = types.NCHW @@ -159,7 +171,7 @@ def EfficientnetTrainingPipeline( decoder_device = "cpu" resize_device = "cpu" - images = fn.decoders.image_random_crop( + images = decoders_module.image_random_crop( jpegs, device=decoder_device, output_type=types.RGB, @@ -213,9 +225,9 @@ def EfficientnetTrainingPipeline( device_id=args.device_id, prefetch_queue_depth=1, ) -def EfficientnetInferencePipeline(): +def EfficientnetInferencePipeline(decoders_module=fn.decoders): images = fn.external_source(device="cpu", name=DALI_INPUT_NAME) - images = fn.decoders.image( + images = decoders_module.image( images, device="mixed" if args.device == "gpu" else "cpu", output_type=types.RGB, @@ -276,7 +288,9 @@ def non_image_preprocessing(raw_text): @pipeline_def( batch_size=args.batch_size, num_threads=args.num_threads, device_id=args.device_id, seed=0 ) -def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000): +def vit_pipeline( + is_training=False, image_shape=(384, 384, 3), num_classes=1000, decoders_module=fn.decoders +): files_paths = 
[os.path.join(args.images_dir, f) for f in os.listdir(args.images_dir)] img, clss = fn.readers.webdataset( @@ -298,7 +312,7 @@ def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000) labels = fn.one_hot(labels, num_classes=num_classes) device = "mixed" if use_gpu else "cpu" - img = fn.decoders.image( + img = decoders_module.image( img, device=device, output_type=types.RGB, @@ -336,19 +350,32 @@ def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000) return img, labels +decoders_module = fn.experimental.decoders if args.experimental_decoder else fn.decoders +print(f"Using decoders_module={decoders_module}") + pipes = [] if args.pipeline == "decoder": for i in range(args.gpu_num): - pipes.append(DecoderPipeline(device_id=i + args.device_id)) + pipes.append(DecoderPipeline(device_id=i + args.device_id, decoders_module=decoders_module)) elif args.pipeline == "rn50": for i in range(args.gpu_num): - pipes.append(RN50Pipeline(device_id=i + args.device_id, minibatch_size=args.minibatch_size)) + pipes.append( + RN50Pipeline( + device_id=i + args.device_id, + minibatch_size=args.minibatch_size, + decoders_module=decoders_module, + ) + ) elif args.pipeline == "efficientnet_inference": for i in range(args.gpu_num): - pipes.append(EfficientnetInferencePipeline(device_id=i + args.device_id)) + pipes.append( + EfficientnetInferencePipeline( + device_id=i + args.device_id, decoders_module=decoders_module + ) + ) elif args.pipeline == "vit": for i in range(args.gpu_num): - pipes.append(vit_pipeline(device_id=i + args.device_id)) + pipes.append(vit_pipeline(device_id=i + args.device_id, decoders_module=decoders_module)) elif args.pipeline == "efficientnet_training": for i in range(args.gpu_num): pipes.append( @@ -356,6 +383,7 @@ def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000) device_id=i + args.device_id, minibatch_size=args.minibatch_size, automatic_augmentation=args.aug_strategy, + 
decoders_module=decoders_module, ) ) else: diff --git a/internal_tools/stub_generator/nvimgcodec.json b/internal_tools/stub_generator/nvimgcodec.json index 4a6847ea4d8..ebeaf4aac67 100644 --- a/internal_tools/stub_generator/nvimgcodec.json +++ b/internal_tools/stub_generator/nvimgcodec.json @@ -7,9 +7,11 @@ "not_found_error":"NVIMGCODEC_STATUS_IMPLEMENTATION_UNSUPPORTED", "functions": { "nvimgcodecInstanceCreate": {}, + "nvimgcodecDecoderCanDecode": {}, "nvimgcodecCodeStreamCreateFromHostMem": {}, "nvimgcodecCodeStreamGetImageInfo": {}, "nvimgcodecDecoderCreate": {}, + "nvimgcodecFutureWaitForAll": {}, "nvimgcodecFutureGetProcessingStatus": {}, "nvimgcodecInstanceDestroy": {}, "nvimgcodecImageDestroy": {}, diff --git a/qa/TL0_python-self-test-base-cuda/test.sh b/qa/TL0_python-self-test-base-cuda/test.sh index 969a5e754e8..2a0060aa4e3 100644 --- a/qa/TL0_python-self-test-base-cuda/test.sh +++ b/qa/TL0_python-self-test-base-cuda/test.sh @@ -14,6 +14,7 @@ version_ge "$DALI_CUDA_MAJOR_VERSION" "11" && \ pip uninstall -y `pip list | grep nvidia-cufft | cut -d " " -f1` \ `pip list | grep nvidia-nvjpeg | cut -d " " -f1` \ `pip list | grep nvidia-nvjpeg2k | cut -d " " -f1` \ + `pip list | grep nvidia-nvtiff | cut -d " " -f1` \ `pip list | grep nvidia-npp | cut -d " " -f1` \ || true @@ -43,4 +44,5 @@ version_ge "$DALI_CUDA_MAJOR_VERSION" "11" && \ nvidia-npp-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-nvjpeg-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-nvjpeg2k-cu${DALI_CUDA_MAJOR_VERSION} \ + nvidia-nvtiff-cu${DALI_CUDA_MAJOR_VERSION} \ || true diff --git a/qa/TL1_decoder_perf/test.sh b/qa/TL1_decoder_perf/test.sh index 449ad6c2ce4..0924978bf66 100644 --- a/qa/TL1_decoder_perf/test.sh +++ b/qa/TL1_decoder_perf/test.sh @@ -3,9 +3,11 @@ pip_packages='numpy' target_dir=./internal_tools -LOG="dali.log" +LOG1="dali_legacy.log" +LOG2="dali_nvimgcodec.log" function CLEAN_AND_EXIT { - rm -rf ${LOG} + rm -rf ${LOG1} + rm -rf ${LOG2} exit $1 } @@ -15,12 +17,15 @@ test_body() { # Hopper 
MIN_PERF=19000; # use taskset to avoid inefficient data migration between cores we don't want to use - taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG} + taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG1} + taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 --experimental_decoder | tee ${LOG2} + else # GraceHopper MIN_PERF=29000; # use taskset to avoid inefficient data migration between cores we don't want to use - taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG} + taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG1} + taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 --experimental_decoder | tee ${LOG2} fi # Regex Explanation: @@ -28,14 +33,39 @@ test_body() { # \K: Resets the start of the match, so anything before \K is not included in the output. # [0-9]+(\.[0-9]+)?: Matches the number, with an optional decimal part. # (?= frames/sec): ensures " frames/sec" follows the number, but doesn't include it. 
+ # Ensure that PERF2 is no more than 5% smaller than PERF1
source ./qa/test_template.sh diff --git a/qa/test_template_impl.sh b/qa/test_template_impl.sh index 55f298290f8..41eb9b5efa6 100755 --- a/qa/test_template_impl.sh +++ b/qa/test_template_impl.sh @@ -159,6 +159,7 @@ do install_pip_pkg "pip install --upgrade nvidia-npp-cu${DALI_CUDA_MAJOR_VERSION}${NPP_VERSION} \ nvidia-nvjpeg-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-nvjpeg2k-cu${DALI_CUDA_MAJOR_VERSION} \ + nvidia-nvtiff-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-cufft-cu${DALI_CUDA_MAJOR_VERSION} \ -f /pip-packages" fi