diff --git a/cmake/Dependencies.common.cmake b/cmake/Dependencies.common.cmake index dd8244c2768..3927a4440e7 100644 --- a/cmake/Dependencies.common.cmake +++ b/cmake/Dependencies.common.cmake @@ -273,9 +273,18 @@ endif() # nvimagecodec ################################################################## set(DALI_INSTALL_REQUIRES_NVIMGCODEC "") +set(DALI_INSTALL_REQUIRES_NVJPEG2K "") +set(DALI_INSTALL_REQUIRES_NVTIFF "") if(BUILD_NVIMAGECODEC) - set(NVIMGCODEC_MIN_VERSION "0.3.0") - set(NVIMGCODEC_MAX_VERSION "0.4.0") + set(NVJPEG2K_MIN_VERSION "0.8.0") + set(NVJPEG2K_MAX_VERSION "0.9.0") + + set(NVTIFF_MIN_VERSION "0.4.0") + set(NVTIFF_MAX_VERSION "0.5.0") + + set(NVIMGCODEC_MIN_VERSION "0.4.1") + set(NVIMGCODEC_MAX_VERSION "0.5.0") + message(STATUS "nvImageCodec - requires version >=${NVIMGCODEC_MIN_VERSION}, <${NVIMGCODEC_MAX_VERSION}") if (WITH_DYNAMIC_NVIMGCODEC) message(STATUS "nvImageCodec - dynamic load") @@ -288,8 +297,8 @@ if(BUILD_NVIMAGECODEC) include(FetchContent) FetchContent_Declare( nvimgcodec_headers - URL https://developer.download.nvidia.com/compute/nvimgcodec/redist/nvimgcodec/linux-x86_64/nvimgcodec-linux-x86_64-0.3.0.5-archive.tar.xz - URL_HASH SHA512=259bff93305c301fb4325c6e2f71da93f3f6e0b38c7c8739913ca70b5a9c74cc898a608c5ac6e830dba1739878e53607ded03deaf2f23af3a9cc473463f100eb + URL https://developer.download.nvidia.com/compute/nvimgcodec/redist/nvimgcodec/linux-x86_64/nvimgcodec-linux-x86_64-0.4.1.21-archive.tar.xz + URL_HASH SHA512=3f20f6944a360597586bfe3550a0605257bcd944748477a869691ec1a42716e3722f8ddbd0b525995ebab89a33cd91ed82d5b151194008f1a8424971448a4824 ) FetchContent_Populate(nvimgcodec_headers) set(nvimgcodec_SEARCH_PATH "${nvimgcodec_headers_SOURCE_DIR}/${CUDA_VERSION_MAJOR}/include") @@ -304,7 +313,27 @@ if(BUILD_NVIMAGECODEC) message(STATUS "NVIMGCODEC_DEFAULT_INSTALL_PATH=${NVIMGCODEC_DEFAULT_INSTALL_PATH}") add_definitions(-DNVIMGCODEC_DEFAULT_INSTALL_PATH=\"${NVIMGCODEC_DEFAULT_INSTALL_PATH}\") - 
set(DALI_INSTALL_REQUIRES_NVIMGCODEC "\'nvidia-nvimgcodec-cu${CUDA_VERSION_MAJOR} >= ${NVIMGCODEC_MIN_VERSION}, < ${NVIMGCODEC_MAX_VERSION}',") + # Find the position of the substring + string(FIND "aarch64-linux-gnu" "${CMAKE_PREFIX_PATH}" SUBSTRING_POSITION) + if(NOT SUBSTRING_POSITION EQUAL -1) + # Substring found + set(NVIMGCODEC_PACKAGE_NAME "nvidia-nvimgcodec-cu${CUDA_VERSION_MAJOR}") + set(NVJPEG2K_PACKAGE_NAME "nvidia-nvjpeg2k-cu${CUDA_VERSION_MAJOR}") + set(NVTIFF_PACKAGE_NAME "nvidia-nvtiff-cu${CUDA_VERSION_MAJOR}") + else() + # Substring not found + set(NVIMGCODEC_PACKAGE_NAME "nvidia-nvimgcodec-tegra-cu${CUDA_VERSION_MAJOR}") + set(NVJPEG2K_PACKAGE_NAME "nvidia-nvjpeg2k-tegra-cu${CUDA_VERSION_MAJOR}") + set(NVTIFF_PACKAGE_NAME "nvidia-nvtiff-tegra-cu${CUDA_VERSION_MAJOR}") + endif() + + # TODO(janton): Replace with nvimgcodec[nvtiff+nvjpeg2k+...] when available + set(DALI_INSTALL_REQUIRES_NVJPEG2K "\'${NVJPEG2K_PACKAGE_NAME} >= ${NVJPEG2K_MIN_VERSION}, < ${NVJPEG2K_MAX_VERSION}',") + message(STATUS "Adding nvjpeg2k requirement as: ${DALI_INSTALL_REQUIRES_NVJPEG2K}") + set(DALI_INSTALL_REQUIRES_NVTIFF "\'${NVTIFF_PACKAGE_NAME} >= ${NVTIFF_MIN_VERSION}, < ${NVTIFF_MAX_VERSION}',") + message(STATUS "Adding nvtiff requirement as: ${DALI_INSTALL_REQUIRES_NVTIFF}") + set(DALI_INSTALL_REQUIRES_NVIMGCODEC "\'${NVIMGCODEC_PACKAGE_NAME} >= ${NVIMGCODEC_MIN_VERSION}, < ${NVIMGCODEC_MAX_VERSION}',") + message(STATUS "Adding nvimagecodec requirement as: ${DALI_INSTALL_REQUIRES_NVIMGCODEC}") else() message(STATUS "nvImageCodec - static link") @@ -321,7 +350,7 @@ if(BUILD_NVIMAGECODEC) ExternalProject_Add( nvImageCodec GIT_REPOSITORY https://github.com/NVIDIA/nvImageCodec.git - GIT_TAG v0.3.0 + GIT_TAG v0.4.0 GIT_SUBMODULES "external/pybind11" "external/NVTX" "external/googletest" @@ -336,6 +365,8 @@ if(BUILD_NVIMAGECODEC) "-DWITH_DYNAMIC_NVJPEG2K=OFF" "-DBUILD_NVJPEG_EXT=${BUILD_NVJPEG}" "-DWITH_DYNAMIC_NVJPEG=${WITH_DYNAMIC_NVJPEG}" + "-DBUILD_NVTIFF_EXT=OFF" + 
"-DWITH_DYNAMIC_NVTIFF=OFF" "-DBUILD_NVBMP_EXT=OFF" "-DBUILD_NVPNM_EXT=OFF" "-DBUILD_LIBJPEG_TURBO_EXT=${BUILD_LIBJPEG_TURBO}" diff --git a/conda/third_party/dali_nvimagecodec/recipe/meta.yaml b/conda/third_party/dali_nvimagecodec/recipe/meta.yaml index 4c67501779b..83a402ac214 100644 --- a/conda/third_party/dali_nvimagecodec/recipe/meta.yaml +++ b/conda/third_party/dali_nvimagecodec/recipe/meta.yaml @@ -13,7 +13,7 @@ # limitations under the License. -{% set build_version = "0.3.0" %} +{% set build_version = "0.4.0" %} package: name: nvidia-nvimagecodec-cuda{{ environ.get('CUDA_VERSION', '') | replace(".","") }} @@ -21,7 +21,7 @@ package: source: git_url: https://github.com/NVIDIA/nvImageCodec.git - git_rev: v0.3.0 + git_rev: v0.4.0 build: number: 0 diff --git a/dali/operators/imgcodec/image_decoder.h b/dali/operators/imgcodec/image_decoder.h index aed89353e41..129b198e658 100644 --- a/dali/operators/imgcodec/image_decoder.h +++ b/dali/operators/imgcodec/image_decoder.h @@ -11,6 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // limitations under the License. 
+#include #include #include #include @@ -32,14 +33,13 @@ #include "dali/pipeline/operator/operator.h" #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED) -nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_opencv_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_nvjpeg_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); -nvimgcodecStatus_t get_nvjpeg2k_extension_desc(nvimgcodecExtensionDesc_t* ext_desc); +nvimgcodecStatus_t get_libjpeg_turbo_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_libtiff_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_opencv_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_nvjpeg_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); +nvimgcodecStatus_t get_nvjpeg2k_extension_desc(nvimgcodecExtensionDesc_t *ext_desc); #endif - #ifndef DALI_OPERATORS_IMGCODEC_IMAGE_DECODER_H_ #define DALI_OPERATORS_IMGCODEC_IMAGE_DECODER_H_ @@ -119,7 +119,7 @@ inline int static_dali_pinned_free(void *ctx, void *ptr, size_t size, cudaStream return cudaSuccess; } -inline void get_nvimgcodec_version(int* major, int *minor, int* patch) { +inline void get_nvimgcodec_version(int *major, int *minor, int *patch) { static int s_major = -1, s_minor = -1, s_patch = -1; auto version_check_f = [&] { nvimgcodecProperties_t properties{NVIMGCODEC_STRUCTURE_TYPE_PROPERTIES, @@ -148,7 +148,7 @@ class ImageDecoder : public StatelessOperator { ~ImageDecoder() override { #if not(WITH_DYNAMIC_NVIMGCODEC_ENABLED) decoder_.reset(); // first stop the decoder - for (auto& extension : extensions_) { + for (auto &extension : extensions_) { nvimgcodecExtensionDestroy(extension); } #endif @@ -176,6 +176,7 @@ class ImageDecoder : public StatelessOperator { DALIImageType orig_img_type; DALIImageType req_img_type; float dyn_range_multiplier = 
1.0f; + bool load_from_cache = false; mm::uptr host_buf; mm::async_uptr device_buf; @@ -229,11 +230,9 @@ class ImageDecoder : public StatelessOperator { if (std::is_same::value) { thread_pool_ = std::make_unique(num_threads_, device_id_, spec.GetArgument("affine"), "MixedDecoder"); - if (spec_.HasArgument("cache_size")) cache_ = std::make_unique(spec_); } - EnforceMinimumNvimgcodecVersion(); nvimgcodecDeviceAllocator_t *dev_alloc_ptr = nullptr; @@ -267,7 +266,7 @@ class ImageDecoder : public StatelessOperator { nullptr}; const char *log_lvl_env = std::getenv("DALI_NVIMGCODEC_LOG_LEVEL"); - int log_lvl = log_lvl_env ? clamp(atoi(log_lvl_env), 1, 5): 2; + int log_lvl = log_lvl_env ? clamp(atoi(log_lvl_env), 1, 5) : 2; instance_create_info.load_extension_modules = static_cast(WITH_DYNAMIC_NVIMGCODEC_ENABLED); instance_create_info.load_builtin_modules = static_cast(true); @@ -355,8 +354,7 @@ class ImageDecoder : public StatelessOperator { opts_.add_module_option("nvjpeg_cuda_decoder", "preallocate_buffers", true); // Batch size - opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", - std::max(1, max_batch_size_)); + opts_.add_module_option("nvjpeg_hw_decoder", "preallocate_batch_size", max_batch_size_); // Nvjpeg2k parallel tiles opts_.add_module_option("nvjpeg2k_cuda_decoder", "num_parallel_tiles", 16); @@ -367,32 +365,35 @@ class ImageDecoder : public StatelessOperator { backends_.clear(); backends_.reserve(4); if (nvimgcodec_device_id != NVIMGCODEC_DEVICE_CPU_ONLY) { - backends_.push_back( - nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_HW_GPU_ONLY, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - sizeof(nvimgcodecBackendParams_t), nullptr, hw_load}}); - backends_.push_back(nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_GPU_ONLY, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - 
sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f}}); - backends_.push_back(nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_HYBRID_CPU_GPU, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_HW_GPU_ONLY, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, + hw_load, NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_GPU_ONLY, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, + 1.0f, NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_HYBRID_CPU_GPU, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, + 1.0f, NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); } - backends_.push_back(nvimgcodecBackend_t{NVIMGCODEC_STRUCTURE_TYPE_BACKEND, - sizeof(nvimgcodecBackend_t), - nullptr, - NVIMGCODEC_BACKEND_KIND_CPU_ONLY, - {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, - sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f}}); + backends_.push_back(nvimgcodecBackend_t{ + NVIMGCODEC_STRUCTURE_TYPE_BACKEND, + sizeof(nvimgcodecBackend_t), + nullptr, + NVIMGCODEC_BACKEND_KIND_CPU_ONLY, + {NVIMGCODEC_STRUCTURE_TYPE_BACKEND_PARAMS, sizeof(nvimgcodecBackendParams_t), nullptr, 1.0f, + NVIMGCODEC_LOAD_HINT_POLICY_FIXED}}); exec_params_.backends = backends_.data(); exec_params_.num_backends = backends_.size(); @@ -401,13 +402,27 @@ class ImageDecoder : public StatelessOperator { exec_params_.executor = &executor_; exec_params_.max_num_cpu_threads = num_threads_; 
exec_params_.pre_init = 1; + exec_params_.skip_pre_sync = 1; // we are not doing stream allocations before decoding. decoder_ = NvImageCodecDecoder::Create(instance_, &exec_params_, opts_.to_string()); } - nvimgcodecStatus_t launch(int device_id, int sample_idx, void *task_context, - void (*task)(int thread_id, int sample_idx, void *task_context)) { + nvimgcodecStatus_t schedule(int device_id, int sample_idx, void *task_context, + void (*task)(int thread_id, int sample_idx, void *task_context)) { + assert(tp_); + tp_->AddWork([=](int tid) { task(tid, sample_idx, task_context); }, -(task_count_++), false); + return NVIMGCODEC_STATUS_SUCCESS; + } + + nvimgcodecStatus_t run(int device_id) { + assert(tp_); + tp_->RunAll(false); + task_count_ = 0; + return NVIMGCODEC_STATUS_SUCCESS; + } + + nvimgcodecStatus_t wait(int device_id) { assert(tp_); - tp_->AddWork([=](int tid) { task(tid, sample_idx, task_context); }, 0, true); + tp_->WaitForWork(); return NVIMGCODEC_STATUS_SUCCESS; } @@ -417,12 +432,22 @@ class ImageDecoder : public StatelessOperator { return num_threads_; } - static nvimgcodecStatus_t static_launch(void *instance, int device_id, int sample_idx, - void *task_context, - void (*task)(int thread_id, int sample_idx, - void *task_context)) { + static nvimgcodecStatus_t static_schedule(void *instance, int device_id, int sample_idx, + void *task_context, + void (*task)(int thread_id, int sample_idx, + void *task_context)) { + auto *handle = static_cast *>(instance); + return handle->schedule(device_id, sample_idx, task_context, task); + } + + static nvimgcodecStatus_t static_run(void *instance, int device_id) { + auto *handle = static_cast *>(instance); + return handle->run(device_id); + } + + static nvimgcodecStatus_t static_wait(void *instance, int device_id) { auto *handle = static_cast *>(instance); - return handle->launch(device_id, sample_idx, task_context, task); + return handle->wait(device_id); } static int static_get_num_threads(void *instance) { @@ 
-466,6 +491,19 @@ class ImageDecoder : public StatelessOperator { // Make sure we set the default that DALI expects if (decoder_params_.count("jpeg_fancy_upsampling") == 0) decoder_params_["jpeg_fancy_upsampling"] = false; + + // Overriding hw_decoder_load default if nvImageCodec 0.4.x version + if (!spec.HasArgument("hw_decoder_load") && decoder_params_.count("hw_decoder_load") > 0) { + int major = -1, minor = -1, patch = -1; + get_nvimgcodec_version(&major, &minor, &patch); + if (MAKE_SEMANTIC_VERSION(major, minor, patch) >= MAKE_SEMANTIC_VERSION(0, 4, 0) && + MAKE_SEMANTIC_VERSION(major, minor, patch) < MAKE_SEMANTIC_VERSION(0, 5, 0)) { + DALI_WARN( + "nvImageCodec 0.4.x version detected. Setting hw_decoder_load to 1.0f for " + "deterministic results"); + decoder_params_["hw_decoder_load"] = 1.0f; + } + } } void ParseSample(ParsedSample &parsed_sample, span encoded) { @@ -500,90 +538,8 @@ class ImageDecoder : public StatelessOperator { return std::is_same::value ? thread_pool_.get() : &ws.GetThreadPool(); } - - bool SetupImpl(std::vector &output_descs, const Workspace &ws) override { - DomainTimeRange tr("Setup", DomainTimeRange::kOrange); - tp_ = GetThreadPool(ws); - assert(tp_ != nullptr); - auto auto_cleanup = AtScopeExit([&] { - tp_ = nullptr; - }); - - output_descs.resize(1); - auto &input = ws.template Input(0); - int nsamples = input.num_samples(); - - SetupRoiGenerator(spec_, ws); - TensorListShape<> shapes; - shapes.resize(nsamples, 3); - while (static_cast(state_.size()) < nsamples) - state_.push_back(std::make_unique()); - rois_.resize(nsamples); - - const bool use_cache = cache_ && cache_->IsCacheEnabled() && dtype_ == DALI_UINT8; - auto get_task = [&](int block_idx, int nblocks) { - return [&, block_idx, nblocks](int tid) { - int i_start = nsamples * block_idx / nblocks; - int i_end = nsamples * (block_idx + 1) / nblocks; - for (int i = i_start; i < i_end; i++) { - auto *st = state_[i].get(); - assert(st != nullptr); - const auto &input_sample = 
input[i]; - - auto src_info = input.GetMeta(i).GetSourceInfo(); - if (use_cache && cache_->IsInCache(src_info)) { - auto cached_shape = cache_->CacheImageShape(src_info); - auto roi = GetRoi(spec_, ws, i, cached_shape); - if (!roi.use_roi()) { - shapes.set_tensor_shape(i, cached_shape); - continue; - } - } - ParseSample(st->parsed_sample, - span{static_cast(input_sample.raw_data()), - volume(input_sample.shape())}); - st->out_shape = st->parsed_sample.dali_img_info.shape; - st->out_shape[2] = NumberOfChannels(format_, st->out_shape[2]); - if (use_orientation_ && - (st->parsed_sample.nvimgcodec_img_info.orientation.rotated % 180 != 0)) { - std::swap(st->out_shape[0], st->out_shape[1]); - } - ROI &roi = rois_[i] = GetRoi(spec_, ws, i, st->out_shape); - if (roi.use_roi()) { - auto roi_sh = roi.shape(); - if (roi.end.size() >= 2) { - DALI_ENFORCE(0 <= roi.end[0] && roi.end[0] <= st->out_shape[0] && - 0 <= roi.end[1] && roi.end[1] <= st->out_shape[1], - "ROI end must fit within the image bounds"); - } - if (roi.begin.size() >= 2) { - DALI_ENFORCE(0 <= roi.begin[0] && roi.begin[0] <= st->out_shape[0] && - 0 <= roi.begin[1] && roi.begin[1] <= st->out_shape[1], - "ROI begin must fit within the image bounds"); - } - st->out_shape[0] = roi_sh[0]; - st->out_shape[1] = roi_sh[1]; - } - shapes.set_tensor_shape(i, st->out_shape); - } - }; - }; - - int nblocks = tp_->NumThreads() + 1; - if (nsamples > nblocks * 4) { - int block_idx = 0; - for (; block_idx < tp_->NumThreads(); block_idx++) { - tp_->AddWork(get_task(block_idx, nblocks), -block_idx); - } - tp_->RunAll(false); // start work but not wait - get_task(block_idx, nblocks)(-1); // run last block - tp_->WaitForWork(); // wait for the other threads - } else { // not worth parallelizing - get_task(0, 1)(-1); // run all in current thread - } - - output_descs[0] = {std::move(shapes), dtype_}; - return true; + bool SetupImpl(std::vector &output_desc, const Workspace &ws) override { + return false; } /** @@ -602,14 +558,10 @@ 
class ImageDecoder : public StatelessOperator { * to the decoding function */ bool need_host_sync_alloc() { - int major, minor, patch; - get_nvimgcodec_version(&major, &minor, &patch); return !version_at_least(0, 3, 0); } - template - void PrepareOutput(SampleState &st, SampleView out, const ROI &roi, - const Workspace &ws) { + void PrepareOutput(SampleState &st, void *out_ptr, const ROI &roi, const Workspace &ws) { // Make a copy of the parsed img info. We might modify it // (for example, request planar vs. interleaved, etc) st.image_info = st.parsed_sample.nvimgcodec_img_info; @@ -659,6 +611,8 @@ class ImageDecoder : public StatelessOperator { int precision = st.image_info.plane_info[0].precision; if (precision == 0) precision = PositiveBits(st.parsed_sample.orig_dtype); + if (precision == 1 && st.parsed_sample.orig_dtype == DALI_UINT8) + precision = 8; // nvimgcodec produces at minimum uint8 dynamic range (0..255) bool need_dynamic_range_scaling = NeedDynamicRangeScaling(precision, st.parsed_sample.orig_dtype); st.dyn_range_multiplier = need_dynamic_range_scaling ? 
@@ -690,7 +644,7 @@ class ImageDecoder : public StatelessOperator { st.decode_out_cpu = {st.image_info.buffer, decode_shape, st.parsed_sample.orig_dtype}; } } else { - st.image_info.buffer = static_cast(out.raw_mutable_data()); + st.image_info.buffer = out_ptr; } st.image_info.num_planes = 1; @@ -703,12 +657,22 @@ class ImageDecoder : public StatelessOperator { st.image = NvImageCodecImage::Create(instance_, &st.image_info); } + bool HasContiguousOutputs() const override { + return false; + } + void RunImplImpl(Workspace &ws) { const auto &input = ws.Input(0); + int nsamples = input.num_samples(); auto &output = ws.template Output::type>(0); + // it complains if we try to set the sample dim after it is already allocated + // even if the sample dim didn't change + if (output.sample_dim() != 3) + output.set_sample_dim(3); + output.set_type(dtype_); + output.SetContiguity(BatchContiguity::Noncontiguous); output.SetLayout("HWC"); - int nsamples = input.num_samples(); - assert(output.num_samples() == nsamples); + output.SetSize(nsamples); tp_ = GetThreadPool(ws); assert(tp_ != nullptr); @@ -716,14 +680,10 @@ class ImageDecoder : public StatelessOperator { tp_ = nullptr; }); - bool has_any_roi = false; - for (auto &roi : rois_) - has_any_roi |= roi.use_roi(); - - nvimgcodecDecodeParams_t decode_params = {NVIMGCODEC_STRUCTURE_TYPE_DECODE_PARAMS, - sizeof(nvimgcodecDecodeParams_t), nullptr}; - decode_params.apply_exif_orientation = static_cast(use_orientation_); - decode_params.enable_roi = static_cast(has_any_roi); + SetupRoiGenerator(spec_, ws); + while (static_cast(state_.size()) < nsamples) + state_.push_back(std::make_unique()); + rois_.resize(nsamples); assert(static_cast(state_.size()) >= nsamples); batch_encoded_streams_.clear(); @@ -732,125 +692,174 @@ class ImageDecoder : public StatelessOperator { batch_images_.reserve(nsamples); decode_sample_idxs_.clear(); decode_sample_idxs_.reserve(nsamples); + decode_status_.clear(); - // TODO(janton): consider extending 
cache to different dtype as well const bool use_cache = cache_ && cache_->IsCacheEnabled() && dtype_ == DALI_UINT8; - if (use_cache) { - int samples_to_load = 0; - DomainTimeRange tr(make_string("CacheLoad"), DomainTimeRange::kOrange); - for (int orig_idx = 0; orig_idx < nsamples; orig_idx++) { - auto src_info = input.GetMeta(orig_idx).GetSourceInfo(); - // To simplify things, we do not allow caching ROIs - bool has_roi = rois_[orig_idx].use_roi(); - if (cache_->IsInCache(src_info) && !has_roi) { - cache_->DeferCacheLoad(src_info, output.template mutable_tensor(orig_idx)); - samples_to_load++; - } else { - decode_sample_idxs_.push_back(orig_idx); + auto setup_block = [&](int block_idx, int nblocks, int tid) { + int i_start = nsamples * block_idx / nblocks; + int i_end = nsamples * (block_idx + 1) / nblocks; + DomainTimeRange tr("Setup #" + std::to_string(block_idx) + "/" + std::to_string(nblocks), + DomainTimeRange::kOrange); + for (int i = i_start; i < i_end; i++) { + auto *st = state_[i].get(); + assert(st != nullptr); + const auto &input_sample = input[i]; + + auto src_info = input.GetMeta(i).GetSourceInfo(); + if (use_cache && cache_->IsInCache(src_info)) { + auto cached_shape = cache_->CacheImageShape(src_info); + auto roi = GetRoi(spec_, ws, i, cached_shape); + if (!roi.use_roi()) { + st->load_from_cache = true; + output.ResizeSample(i, st->out_shape); + st->image_info.buffer = output.raw_mutable_tensor(i); + continue; + } + } + st->load_from_cache = false; + ParseSample(st->parsed_sample, + span{static_cast(input_sample.raw_data()), + volume(input_sample.shape())}); + st->out_shape = st->parsed_sample.dali_img_info.shape; + st->out_shape[2] = NumberOfChannels(format_, st->out_shape[2]); + if (use_orientation_ && + (st->parsed_sample.nvimgcodec_img_info.orientation.rotated % 180 != 0)) { + std::swap(st->out_shape[0], st->out_shape[1]); } - } - if (samples_to_load > 0) - cache_->LoadDeferred(ws.stream()); - } else { - decode_sample_idxs_.resize(nsamples); - 
std::iota(decode_sample_idxs_.begin(), decode_sample_idxs_.end(), 0); - } - int decode_nsamples = decode_sample_idxs_.size(); - { - DomainTimeRange tr(make_string("Prepare descs"), DomainTimeRange::kOrange); - auto get_task = [&](int block_idx, int nblocks) { - return [&, block_idx, nblocks](int tid) { - int i_start = decode_nsamples * block_idx / nblocks; - int i_end = decode_nsamples * (block_idx + 1) / nblocks; - for (int i = i_start; i < i_end; i++) { - int orig_idx = decode_sample_idxs_[i]; - PrepareOutput(*state_[orig_idx], output[orig_idx], rois_[orig_idx], ws); + ROI &roi = rois_[i] = GetRoi(spec_, ws, i, st->out_shape); + if (roi.use_roi()) { + auto roi_sh = roi.shape(); + if (roi.end.size() >= 2) { + DALI_ENFORCE(0 <= roi.end[0] && roi.end[0] <= st->out_shape[0] && 0 <= roi.end[1] && + roi.end[1] <= st->out_shape[1], + "ROI end must fit within the image bounds"); } - }; - }; + if (roi.begin.size() >= 2) { + DALI_ENFORCE(0 <= roi.begin[0] && roi.begin[0] <= st->out_shape[0] && + 0 <= roi.begin[1] && roi.begin[1] <= st->out_shape[1], + "ROI begin must fit within the image bounds"); + } + st->out_shape[0] = roi_sh[0]; + st->out_shape[1] = roi_sh[1]; + } + output.ResizeSample(i, st->out_shape); + PrepareOutput(*state_[i], output.raw_mutable_tensor(i), rois_[i], ws); + assert(!ws.has_stream() || ws.stream() == st->image_info.cuda_stream); + } + }; - int nblocks = tp_->NumThreads() + 1; - if (decode_nsamples > nblocks * 4) { - int block_idx = 0; - for (; block_idx < tp_->NumThreads(); block_idx++) { - tp_->AddWork(get_task(block_idx, nblocks), -block_idx); + int nsamples_per_block = 16; + int nblocks = std::max(1, nsamples / nsamples_per_block); + int ntasks = std::min(nblocks, std::min(8, tp_->NumThreads() + 1)); + + if (ntasks < 2) { + setup_block(0, 1, -1); // run all in current thread + } else { + int block_idx = 0; + atomic_idx_.store(0); + auto setup_task = [&, nblocks](int tid) { + DomainTimeRange tr("Setup", DomainTimeRange::kOrange); + int block_idx; + 
while ((block_idx = atomic_idx_.fetch_add(1)) < nblocks) { + setup_block(block_idx, nblocks, tid); } - tp_->RunAll(false); // start work but not wait - get_task(block_idx, nblocks)(-1); // run last block - tp_->WaitForWork(); // wait for the other threads - } else { // not worth parallelizing - get_task(0, 1)(-1); // run all in current thread + }; + + for (int task_idx = 0; task_idx < ntasks - 1; task_idx++) { + tp_->AddWork(setup_task, -task_idx); } + assert(ntasks >= 2); + tp_->RunAll(false); // start work but not wait + setup_task(-1); // last task in current thread + tp_->WaitForWork(); // wait for the other threads + } - for (int orig_idx : decode_sample_idxs_) { - auto &st = *state_[orig_idx]; + bool any_need_processing = false; + bool has_any_roi = false; + for (int orig_idx = 0; orig_idx < nsamples; orig_idx++) { + auto &st = *state_[orig_idx]; + any_need_processing |= state_[orig_idx]->need_processing; + if (use_cache && st.load_from_cache) { + auto *data_ptr = output.raw_mutable_tensor(orig_idx); + auto src_info = input.GetMeta(orig_idx).GetSourceInfo(); + cache_->DeferCacheLoad(src_info, static_cast(data_ptr)); + } else { + has_any_roi |= rois_[orig_idx].use_roi(); batch_encoded_streams_.push_back(st.parsed_sample.encoded_stream); batch_images_.push_back(st.image); + decode_sample_idxs_.push_back(orig_idx); } } + size_t nsamples_decode = batch_images_.size(); + size_t nsamples_cache = nsamples - nsamples_decode; - // This is a workaround for nvImageCodec <= 0.2 - auto any_need_processing = [&]() { - for (int orig_idx : decode_sample_idxs_) { - auto& st = state_[orig_idx]; - assert(ws.stream() == st->image_info.cuda_stream); // assuming this is true - if (st->need_processing) - return true; - } - return false; - }; - if (ws.has_stream() && need_host_sync_alloc() && any_need_processing()) { + if (ws.has_stream() && need_host_sync_alloc() && any_need_processing) { DomainTimeRange tr("alloc sync", DomainTimeRange::kOrange); 
CUDA_CALL(cudaStreamSynchronize(ws.stream())); } - { - DomainTimeRange tr("Decode", DomainTimeRange::kOrange); + if (use_cache && nsamples_cache > 0) { + DomainTimeRange tr("LoadDeferred", DomainTimeRange::kOrange); + cache_->LoadDeferred(ws.stream()); + } + + if (nsamples_decode > 0) { nvimgcodecFuture_t future; - decode_status_.resize(decode_nsamples); - size_t status_size = 0; - CHECK_NVIMGCODEC(nvimgcodecDecoderDecode(decoder_, batch_encoded_streams_.data(), - batch_images_.data(), decode_nsamples, - &decode_params, &future)); - CHECK_NVIMGCODEC( - nvimgcodecFutureGetProcessingStatus(future, decode_status_.data(), &status_size)); - if (static_cast(status_size) != decode_nsamples) - throw std::logic_error("Failed to retrieve processing status"); - CHECK_NVIMGCODEC(nvimgcodecFutureDestroy(future)); - - for (int i = 0; i < decode_nsamples; i++) { - if (decode_status_[i] != NVIMGCODEC_PROCESSING_STATUS_SUCCESS) { - int orig_idx = decode_sample_idxs_[i]; + decode_status_.resize(nsamples_decode); + size_t decode_status_size = 0; + nvimgcodecDecodeParams_t decode_params = {NVIMGCODEC_STRUCTURE_TYPE_DECODE_PARAMS, + sizeof(nvimgcodecDecodeParams_t), nullptr}; + decode_params.apply_exif_orientation = static_cast(use_orientation_); + decode_params.enable_roi = static_cast(has_any_roi); + + { + DomainTimeRange tr("nvimgcodecDecoderDecode", DomainTimeRange::kOrange); + CHECK_NVIMGCODEC(nvimgcodecDecoderDecode(decoder_, batch_encoded_streams_.data(), + batch_images_.data(), nsamples_decode, + &decode_params, &future)); + CHECK_NVIMGCODEC(nvimgcodecFutureWaitForAll(future)); + CHECK_NVIMGCODEC(nvimgcodecFutureGetProcessingStatus(future, decode_status_.data(), + &decode_status_size)); + CHECK_NVIMGCODEC(nvimgcodecFutureDestroy(future)); + } + if (decode_status_size != nsamples_decode) + throw std::runtime_error("Failed to run decoder"); + for (size_t idx = 0; idx < nsamples_decode; idx++) { + size_t orig_idx = decode_sample_idxs_[idx]; + auto st_ptr = state_[orig_idx].get(); 
+ if (decode_status_[idx] != NVIMGCODEC_PROCESSING_STATUS_SUCCESS) { throw std::runtime_error(make_string("Failed to decode sample #", orig_idx, " : ", input.GetMeta(orig_idx).GetSourceInfo())); } } - } - - for (int orig_idx : decode_sample_idxs_) { - auto st_ptr = state_[orig_idx].get(); - if (st_ptr->need_processing) { - tp_->AddWork( - [&, out = output[orig_idx], st_ptr, orig_idx](int tid) { - DomainTimeRange tr(make_string("Convert #", orig_idx), DomainTimeRange::kOrange); - auto &st = *st_ptr; - if constexpr (std::is_same::value) { - ConvertGPU(out, st.req_layout, st.req_img_type, st.decode_out_gpu, st.req_layout, - st.orig_img_type, ws.stream(), ROI{}, nvimgcodecOrientation_t{}, - st.dyn_range_multiplier); - st.device_buf.reset(); - } else { - assert(st.dyn_range_multiplier == 1.0f); // TODO(janton): enable - ConvertCPU(out, st.req_layout, st.req_img_type, st.decode_out_cpu, st.req_layout, - st.orig_img_type, ROI{}, nvimgcodecOrientation_t{}); - st.host_buf.reset(); - } - }, - -orig_idx); + if (any_need_processing) { + for (size_t idx = 0; idx < nsamples_decode; idx++) { + size_t orig_idx = decode_sample_idxs_[idx]; + auto st_ptr = state_[orig_idx].get(); + if (st_ptr->need_processing) { + tp_->AddWork( + [&, out = output[orig_idx], st_ptr, orig_idx](int tid) { + DomainTimeRange tr(make_string("Convert #", orig_idx), DomainTimeRange::kOrange); + auto &st = *st_ptr; + if constexpr (std::is_same::value) { + ConvertGPU(out, st.req_layout, st.req_img_type, st.decode_out_gpu, + st.req_layout, st.orig_img_type, ws.stream(), ROI{}, + nvimgcodecOrientation_t{}, st.dyn_range_multiplier); + st.device_buf.reset(); + } else { + assert(st.dyn_range_multiplier == 1.0f); // TODO(janton): enable + ConvertCPU(out, st.req_layout, st.req_img_type, st.decode_out_cpu, + st.req_layout, st.orig_img_type, ROI{}, nvimgcodecOrientation_t{}); + st.host_buf.reset(); + } + }, + -idx); + } + } + tp_->RunAll(true); } } - tp_->RunAll(); if (use_cache) { DomainTimeRange 
tr(make_string("CacheStore"), DomainTimeRange::kOrange); @@ -882,8 +891,11 @@ class ImageDecoder : public StatelessOperator { sizeof(nvimgcodecExecutorDesc_t), nullptr, this, - &static_launch, + &static_schedule, + &static_run, + &static_wait, &static_get_num_threads}; + nvimgcodecDeviceAllocator_t dev_alloc_ = {}; nvimgcodecPinnedAllocator_t pinned_alloc_ = {}; std::vector backends_; @@ -899,7 +911,8 @@ class ImageDecoder : public StatelessOperator { bool use_orientation_ = true; int max_batch_size_ = 1; int num_threads_ = -1; - ThreadPool* tp_ = nullptr; + ThreadPool *tp_ = nullptr; + int64_t task_count_ = 0; std::vector> state_; std::vector batch_encoded_streams_; std::vector batch_images_; @@ -909,6 +922,8 @@ class ImageDecoder : public StatelessOperator { // This vector is used to get the original index of the decoded samples std::vector decode_sample_idxs_; + std::atomic atomic_idx_; + // Manually loaded extensions std::vector extensions_descs_; std::vector extensions_; diff --git a/dali/python/setup.py.in b/dali/python/setup.py.in index e49dbff2d04..179e4d04430 100644 --- a/dali/python/setup.py.in +++ b/dali/python/setup.py.in @@ -89,6 +89,8 @@ For more details please check the 'six >= 1.16, <= 1.16', 'dm-tree <= 0.1.8', 'packaging <= 24.2', + @DALI_INSTALL_REQUIRES_NVJPEG2K@ + @DALI_INSTALL_REQUIRES_NVTIFF@ @DALI_INSTALL_REQUIRES_NVIMGCODEC@ ], ) diff --git a/dali/test/python/decoder/test_imgcodec.py b/dali/test/python/decoder/test_imgcodec.py index 3d85a8e4445..9f086a47a2e 100644 --- a/dali/test/python/decoder/test_imgcodec.py +++ b/dali/test/python/decoder/test_imgcodec.py @@ -177,7 +177,10 @@ def run_decode_fused(test_fun, path, img_type, batch, device, threads, validatio dump_as_core_artifacts( img_1.source_info(), arr_1, arr_2, iter=it, sample_idx=sample_idx ) - assert is_ok, f"{validation_fun.__name__}\nimage: {img_1.source_info()}" + assert is_ok, ( + f"{validation_fun.__name__}\n" + + f"image: {img_1.source_info()} iter: {it} sample_idx: 
{sample_idx}" + ) def test_image_decoder_fused(): diff --git a/docker/Dockerfile.build.aarch64-linux b/docker/Dockerfile.build.aarch64-linux index 62cf6dc3907..b02026ea26f 100644 --- a/docker/Dockerfile.build.aarch64-linux +++ b/docker/Dockerfile.build.aarch64-linux @@ -147,7 +147,7 @@ CMD WERROR=ON \ WITH_DYNAMIC_NVJPEG=ON \ WITH_DYNAMIC_CUFFT=ON \ WITH_DYNAMIC_NPP=ON \ - WITH_DYNAMIC_NVIMGCODEC=OFF \ + WITH_DYNAMIC_NVIMGCODEC=ON \ NVIDIA_BUILD_ID=${NVIDIA_BUILD_ID:-0} \ WHL_PLATFORM_NAME=manylinux2014_aarch64 \ BUNDLE_PATH_PREFIX="/usr/aarch64-linux-gnu" \ diff --git a/include/dali/core/dev_string.h b/include/dali/core/dev_string.h index ad85a89f000..29b1ae432eb 100644 --- a/include/dali/core/dev_string.h +++ b/include/dali/core/dev_string.h @@ -173,7 +173,7 @@ inline __device__ DeviceString dev_to_string(const void *ptr) { while (x) { int digit = x&0xf; x >>= 4;; - buf[--cursor] = "01234567889ABCDEF"[digit]; + buf[--cursor] = "0123456789ABCDEF"[digit]; } buf[--cursor] = 'x'; buf[--cursor] = '0'; diff --git a/internal_tools/hw_decoder_bench.py b/internal_tools/hw_decoder_bench.py index 7d074e0dcda..8e1189b0c77 100644 --- a/internal_tools/hw_decoder_bench.py +++ b/internal_tools/hw_decoder_bench.py @@ -79,6 +79,12 @@ type=int, ) +parser.add_argument( + "--experimental_decoder", + action="store_true", + help="If True, uses the experimental decoder instead of the default", +) + args = parser.parse_args() DALI_INPUT_NAME = "DALI_INPUT_0" @@ -86,12 +92,15 @@ @pipeline_def( - batch_size=args.batch_size, num_threads=args.num_threads, device_id=args.device_id, seed=0 + batch_size=args.batch_size, + num_threads=args.num_threads, + device_id=args.device_id, + seed=0, ) -def DecoderPipeline(): +def DecoderPipeline(decoders_module=fn.decoders): device = "mixed" if args.device == "gpu" else "cpu" jpegs, _ = fn.readers.file(file_root=args.images_dir) - images = fn.decoders.image( + images = decoders_module.image( jpegs, device=device, output_type=types.RGB, @@ -103,12 +112,15 
@@ def DecoderPipeline(): @pipeline_def( - batch_size=args.batch_size, num_threads=args.num_threads, device_id=args.device_id, seed=0 + batch_size=args.batch_size, + num_threads=args.num_threads, + device_id=args.device_id, + seed=0, ) -def RN50Pipeline(minibatch_size): +def RN50Pipeline(minibatch_size, decoders_module=fn.decoders): device = "mixed" if args.device == "gpu" else "cpu" jpegs, _ = fn.readers.file(file_root=args.images_dir) - images = fn.decoders.image_random_crop( + images = decoders_module.image_random_crop( jpegs, device=device, output_type=types.RGB, @@ -138,10 +150,10 @@ def RN50Pipeline(minibatch_size): device_id=args.device_id, seed=0, enable_conditionals=True, + decoders_module=fn.decoders, ) def EfficientnetTrainingPipeline( - minibatch_size, - automatic_augmentation="autoaugment", + minibatch_size, automatic_augmentation="autoaugment", decoders_module=fn.decoders ): dali_device = args.device output_layout = types.NCHW @@ -159,7 +171,7 @@ def EfficientnetTrainingPipeline( decoder_device = "cpu" resize_device = "cpu" - images = fn.decoders.image_random_crop( + images = decoders_module.image_random_crop( jpegs, device=decoder_device, output_type=types.RGB, @@ -213,9 +225,9 @@ def EfficientnetTrainingPipeline( device_id=args.device_id, prefetch_queue_depth=1, ) -def EfficientnetInferencePipeline(): +def EfficientnetInferencePipeline(decoders_module=fn.decoders): images = fn.external_source(device="cpu", name=DALI_INPUT_NAME) - images = fn.decoders.image( + images = decoders_module.image( images, device="mixed" if args.device == "gpu" else "cpu", output_type=types.RGB, @@ -276,7 +288,9 @@ def non_image_preprocessing(raw_text): @pipeline_def( batch_size=args.batch_size, num_threads=args.num_threads, device_id=args.device_id, seed=0 ) -def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000): +def vit_pipeline( + is_training=False, image_shape=(384, 384, 3), num_classes=1000, decoders_module=fn.decoders +): files_paths = 
[os.path.join(args.images_dir, f) for f in os.listdir(args.images_dir)] img, clss = fn.readers.webdataset( @@ -298,7 +312,7 @@ def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000) labels = fn.one_hot(labels, num_classes=num_classes) device = "mixed" if use_gpu else "cpu" - img = fn.decoders.image( + img = decoders_module.image( img, device=device, output_type=types.RGB, @@ -336,19 +350,32 @@ def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000) return img, labels +decoders_module = fn.experimental.decoders if args.experimental_decoder else fn.decoders +print(f"Using decoders_module={decoders_module}") + pipes = [] if args.pipeline == "decoder": for i in range(args.gpu_num): - pipes.append(DecoderPipeline(device_id=i + args.device_id)) + pipes.append(DecoderPipeline(device_id=i + args.device_id, decoders_module=decoders_module)) elif args.pipeline == "rn50": for i in range(args.gpu_num): - pipes.append(RN50Pipeline(device_id=i + args.device_id, minibatch_size=args.minibatch_size)) + pipes.append( + RN50Pipeline( + device_id=i + args.device_id, + minibatch_size=args.minibatch_size, + decoders_module=decoders_module, + ) + ) elif args.pipeline == "efficientnet_inference": for i in range(args.gpu_num): - pipes.append(EfficientnetInferencePipeline(device_id=i + args.device_id)) + pipes.append( + EfficientnetInferencePipeline( + device_id=i + args.device_id, decoders_module=decoders_module + ) + ) elif args.pipeline == "vit": for i in range(args.gpu_num): - pipes.append(vit_pipeline(device_id=i + args.device_id)) + pipes.append(vit_pipeline(device_id=i + args.device_id, decoders_module=decoders_module)) elif args.pipeline == "efficientnet_training": for i in range(args.gpu_num): pipes.append( @@ -356,6 +383,7 @@ def vit_pipeline(is_training=False, image_shape=(384, 384, 3), num_classes=1000) device_id=i + args.device_id, minibatch_size=args.minibatch_size, automatic_augmentation=args.aug_strategy, + 
decoders_module=decoders_module, ) ) else: diff --git a/internal_tools/stub_generator/nvimgcodec.json b/internal_tools/stub_generator/nvimgcodec.json index 4a6847ea4d8..ebeaf4aac67 100644 --- a/internal_tools/stub_generator/nvimgcodec.json +++ b/internal_tools/stub_generator/nvimgcodec.json @@ -7,9 +7,11 @@ "not_found_error":"NVIMGCODEC_STATUS_IMPLEMENTATION_UNSUPPORTED", "functions": { "nvimgcodecInstanceCreate": {}, + "nvimgcodecDecoderCanDecode": {}, "nvimgcodecCodeStreamCreateFromHostMem": {}, "nvimgcodecCodeStreamGetImageInfo": {}, "nvimgcodecDecoderCreate": {}, + "nvimgcodecFutureWaitForAll": {}, "nvimgcodecFutureGetProcessingStatus": {}, "nvimgcodecInstanceDestroy": {}, "nvimgcodecImageDestroy": {}, diff --git a/qa/TL0_python-self-test-base-cuda/test.sh b/qa/TL0_python-self-test-base-cuda/test.sh index 969a5e754e8..2a0060aa4e3 100644 --- a/qa/TL0_python-self-test-base-cuda/test.sh +++ b/qa/TL0_python-self-test-base-cuda/test.sh @@ -14,6 +14,7 @@ version_ge "$DALI_CUDA_MAJOR_VERSION" "11" && \ pip uninstall -y `pip list | grep nvidia-cufft | cut -d " " -f1` \ `pip list | grep nvidia-nvjpeg | cut -d " " -f1` \ `pip list | grep nvidia-nvjpeg2k | cut -d " " -f1` \ + `pip list | grep nvidia-nvtiff | cut -d " " -f1` \ `pip list | grep nvidia-npp | cut -d " " -f1` \ || true @@ -43,4 +44,5 @@ version_ge "$DALI_CUDA_MAJOR_VERSION" "11" && \ nvidia-npp-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-nvjpeg-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-nvjpeg2k-cu${DALI_CUDA_MAJOR_VERSION} \ + nvidia-nvtiff-cu${DALI_CUDA_MAJOR_VERSION} \ || true diff --git a/qa/TL1_decoder_perf/test.sh b/qa/TL1_decoder_perf/test.sh index 449ad6c2ce4..0924978bf66 100644 --- a/qa/TL1_decoder_perf/test.sh +++ b/qa/TL1_decoder_perf/test.sh @@ -3,9 +3,11 @@ pip_packages='numpy' target_dir=./internal_tools -LOG="dali.log" +LOG1="dali_legacy.log" +LOG2="dali_nvimgcodec.log" function CLEAN_AND_EXIT { - rm -rf ${LOG} + rm -rf ${LOG1} + rm -rf ${LOG2} exit $1 } @@ -15,12 +17,15 @@ test_body() { # Hopper 
MIN_PERF=19000; # use taskset to avoid inefficient data migration between cores we don't want to use - taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG} + taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 | tee ${LOG1} + taskset --cpu-list 0-127 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 70 --hw_load 0.12 --experimental_decoder | tee ${LOG2} + else # GraceHopper MIN_PERF=29000; # use taskset to avoid inefficient data migration between cores we don't want to use - taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG} + taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 | tee ${LOG1} + taskset --cpu-list 0-71 python hw_decoder_bench.py --width_hint 6000 --height_hint 6000 -b 408 -d 0 -g gpu -w 100 -t 100000 -i ${DALI_EXTRA_PATH}/db/single/jpeg -p rn50 -j 72 --hw_load 0.11 --experimental_decoder | tee ${LOG2} fi # Regex Explanation: @@ -28,14 +33,39 @@ test_body() { # \K: Resets the start of the match, so anything before \K is not included in the output. # [0-9]+(\.[0-9]+)?: Matches the number, with an optional decimal part. # (?= frames/sec): ensures " frames/sec" follows the number, but doesn't include it. 
+ # Ensure that PERF2 is no more than 5% smaller than PERF1
source ./qa/test_template.sh diff --git a/qa/test_template_impl.sh b/qa/test_template_impl.sh index 55f298290f8..41eb9b5efa6 100755 --- a/qa/test_template_impl.sh +++ b/qa/test_template_impl.sh @@ -159,6 +159,7 @@ do install_pip_pkg "pip install --upgrade nvidia-npp-cu${DALI_CUDA_MAJOR_VERSION}${NPP_VERSION} \ nvidia-nvjpeg-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-nvjpeg2k-cu${DALI_CUDA_MAJOR_VERSION} \ + nvidia-nvtiff-cu${DALI_CUDA_MAJOR_VERSION} \ nvidia-cufft-cu${DALI_CUDA_MAJOR_VERSION} \ -f /pip-packages" fi