From 0d5cf666a649d4468d617b618322f7f76ca892b2 Mon Sep 17 00:00:00 2001 From: shobana-mcw Date: Tue, 1 Jun 2021 23:49:17 -0700 Subject: [PATCH] Add prefetching support in RALI pipeline. --- rocAL/rocAL/include/cifar10_data_loader.h | 10 +++--- rocAL/rocAL/include/circular_buffer.h | 8 ++--- rocAL/rocAL/include/context.h | 4 +-- rocAL/rocAL/include/image_loader.h | 9 +++--- rocAL/rocAL/include/image_loader_sharded.h | 12 ++++--- rocAL/rocAL/include/loader_module.h | 3 +- rocAL/rocAL/include/master_graph.h | 9 ++++-- rocAL/rocAL/include/node_fused_jpeg_crop.h | 10 +++--- .../node_fused_jpeg_crop_single_shard.h | 8 ++--- rocAL/rocAL/include/node_image_loader.h | 4 +-- .../include/node_image_loader_single_shard.h | 4 +-- rocAL/rocAL/include/rali_api.h | 2 +- rocAL/rocAL/source/cifar10_data_loader.cpp | 32 ++++++++++++------- rocAL/rocAL/source/circular_buffer.cpp | 21 +++++------- rocAL/rocAL/source/image_loader.cpp | 13 ++++++-- rocAL/rocAL/source/image_loader_sharded.cpp | 10 +++++- rocAL/rocAL/source/master_graph.cpp | 5 +-- .../node_fused_jpeg_crop_single_shard.cpp | 4 +-- rocAL/rocAL/source/rali_api.cpp | 3 +- rocAL/rocAL_pybind/amd/rali/pipeline.py | 4 +-- rocAL/rocAL_pybind/rali_pybind.cpp | 1 + 21 files changed, 105 insertions(+), 71 deletions(-) diff --git a/rocAL/rocAL/include/cifar10_data_loader.h b/rocAL/rocAL/include/cifar10_data_loader.h index 92de7d6eb2..bd69ccf959 100644 --- a/rocAL/rocAL/include/cifar10_data_loader.h +++ b/rocAL/rocAL/include/cifar10_data_loader.h @@ -32,9 +32,9 @@ class CIFAR10DataLoader : public LoaderModule public: #if ENABLE_HIP explicit CIFAR10DataLoader(DeviceResourcesHip dev_resources); -#else +#else explicit CIFAR10DataLoader(DeviceResources dev_resources); -#endif +#endif ~CIFAR10DataLoader() override; LoaderModuleStatus load_next() override; void initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RaliMemType mem_type, unsigned batch_size, bool keep_orig_size=true) override; @@ -46,6 +46,7 @@ class CIFAR10DataLoader : public LoaderModule std::vector get_id() override; decoded_image_info get_decode_image_info() override; Timing timing() override; + void set_prefetch_queue_depth(size_t prefetch_queue_depth) override; private: void increment_loader_idx(); bool is_out_of_data(); @@ -59,7 +60,7 @@ class CIFAR10DataLoader : public LoaderModule #else const DeviceResources _dev_resources; #endif - decoded_image_info _raw_img_info; // image info to store the names. In this case the ID of image is stored in _roi_width field + decoded_image_info _raw_img_info; // image info to store the names. In this case the ID of image is stored in _roi_width field decoded_image_info _output_decoded_img_info; bool _initialized = false; RaliMemType _mem_type; @@ -72,7 +73,8 @@ class CIFAR10DataLoader : public LoaderModule std::vector _actual_read_size; std::vector _output_names; CircularBuffer _circ_buff; - const static size_t CIRC_BUFFER_DEPTH = 3; // Used for circular buffer's internal buffer + // const static size_t CIRC_BUFFER_DEPTH = 3; // Used for circular buffer's internal buffer + size_t _prefetch_queue_depth; TimingDBG _file_load_time, _swap_handle_time; size_t _loader_idx; size_t _shard_count = 1; diff --git a/rocAL/rocAL/include/circular_buffer.h b/rocAL/rocAL/include/circular_buffer.h index 6c380e426a..f963512dc8 100644 --- a/rocAL/rocAL/include/circular_buffer.h +++ b/rocAL/rocAL/include/circular_buffer.h @@ -43,12 +43,12 @@ class CircularBuffer { public: #if ENABLE_HIP - CircularBuffer(DeviceResourcesHip hipres, size_t buffer_depth ); + CircularBuffer(DeviceResourcesHip hipres); #else - CircularBuffer(DeviceResources ocl, size_t buffer_depth ); + CircularBuffer(DeviceResources ocl); #endif ~CircularBuffer(); - void init(RaliMemType output_mem_type, size_t output_mem_size); + void init(RaliMemType output_mem_type, size_t output_mem_size, size_t buff_depth); void sync();// Syncs device buffers with host void unblock_reader();// Unblocks the thread currently waiting on a call to get_read_buffer void unblock_writer();// Unblocks the thread currently waiting on get_write_buffer @@ -69,7 +69,7 @@ class CircularBuffer void increment_write_ptr(); bool full(); bool empty(); - const size_t BUFF_DEPTH; + size_t BUFF_DEPTH; decoded_image_info _last_image_info; std::queue _circ_image_info;//!< Stores the loaded images names, decoded_width and decoded_height(data is stored in the _circ_buff) std::mutex _names_buff_lock; diff --git a/rocAL/rocAL/include/context.h b/rocAL/rocAL/include/context.h index 4346566fdc..80b0c42ba9 100644 --- a/rocAL/rocAL/include/context.h +++ b/rocAL/rocAL/include/context.h @@ -29,12 +29,12 @@ THE SOFTWARE. struct Context { - explicit Context(size_t batch_size, RaliAffinity affinity, int gpu_id , size_t cpu_thread_count, RaliTensorDataType output_tensor_type ): + explicit Context(size_t batch_size, RaliAffinity affinity, int gpu_id , size_t cpu_thread_count, size_t prefetch_queue_depth, RaliTensorDataType output_tensor_type ): affinity(affinity), _user_batch_size(batch_size) { LOG("Processing on " + STR(((affinity == RaliAffinity::CPU)?" CPU": " GPU"))) - master_graph = std::make_shared(batch_size, affinity, gpu_id, cpu_thread_count, output_tensor_type); + master_graph = std::make_shared(batch_size, affinity, gpu_id, cpu_thread_count, prefetch_queue_depth, output_tensor_type); _internal_batch_size = master_graph->internal_batch_size(); } ~Context() diff --git a/rocAL/rocAL/include/image_loader.h b/rocAL/rocAL/include/image_loader.h index bbc1e51160..37aa9a5bf8 100644 --- a/rocAL/rocAL/include/image_loader.h +++ b/rocAL/rocAL/include/image_loader.h @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#pragma once +#pragma once #include #include @@ -38,7 +38,7 @@ class ImageLoader : public LoaderModule { explicit ImageLoader(DeviceResourcesHip dev_resources); #else explicit ImageLoader(DeviceResources dev_resources); -#endif +#endif ~ImageLoader() override; LoaderModuleStatus load_next() override; void initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RaliMemType mem_type, unsigned batch_size, bool keep_orig_size=false) override; @@ -52,6 +52,7 @@ class ImageLoader : public LoaderModule { LoaderModuleStatus set_cpu_sched_policy(struct sched_param sched_policy); std::vector get_id() override; decoded_image_info get_decode_image_info() override; + void set_prefetch_queue_depth(size_t prefetch_queue_depth) override; private: bool is_out_of_data(); void de_init(); @@ -75,10 +76,10 @@ class ImageLoader : public LoaderModule { bool _is_initialized; bool _stopped = false; bool _loop;// _randombboxcrop_meta_data_reader = nullptr; + std::shared_ptr _randombboxcrop_meta_data_reader = nullptr; }; diff --git a/rocAL/rocAL/include/image_loader_sharded.h b/rocAL/rocAL/include/image_loader_sharded.h index eecc017fd9..d2841b81f9 100644 --- a/rocAL/rocAL/include/image_loader_sharded.h +++ b/rocAL/rocAL/include/image_loader_sharded.h @@ -32,9 +32,9 @@ class ImageLoaderSharded : public LoaderModule public: #if ENABLE_HIP explicit ImageLoaderSharded(DeviceResourcesHip dev_resources); -#else +#else explicit ImageLoaderSharded(DeviceResources dev_resources); -#endif +#endif ~ImageLoaderSharded() override; LoaderModuleStatus load_next() override; void initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, RaliMemType mem_type, unsigned batch_size, bool keep_orig_size=false) override; @@ -46,18 +46,20 @@ class ImageLoaderSharded : public LoaderModule std::vector get_id() override; decoded_image_info get_decode_image_info() override; Timing timing() override; + void set_prefetch_queue_depth(size_t prefetch_queue_depth) override; private: void increment_loader_idx(); -#if ENABLE_HIP +#if ENABLE_HIP const DeviceResourcesHip _dev_resources; -#else +#else const DeviceResources _dev_resources; -#endif +#endif bool _initialized = false; std::vector> _loaders; size_t _loader_idx; size_t _shard_count = 1; void fast_forward_through_empty_loaders(); + size_t _prefetch_queue_depth; Image *_output_image; std::shared_ptr _randombboxcrop_meta_data_reader = nullptr; diff --git a/rocAL/rocAL/include/loader_module.h b/rocAL/rocAL/include/loader_module.h index 1536053357..c39cea66d7 100644 --- a/rocAL/rocAL/include/loader_module.h +++ b/rocAL/rocAL/include/loader_module.h @@ -42,7 +42,7 @@ enum class LoaderModuleStatus }; /*! \class LoaderModule The interface defining the API and requirements of loader modules*/ -class LoaderModule +class LoaderModule { public: virtual void initialize(ReaderConfig reader_config, DecoderConfig decoder_config, RaliMemType mem_type, unsigned batch_size, bool keep_orig_size) = 0; @@ -55,6 +55,7 @@ class LoaderModule virtual std::vector get_id() = 0; // returns the id of the last batch of images/frames loaded virtual void start_loading() = 0; // starts internal loading thread virtual decoded_image_info get_decode_image_info() = 0; + virtual void set_prefetch_queue_depth(size_t prefetch_queue_depth) = 0; // introduce meta data reader virtual void set_random_bbox_data_reader(std::shared_ptr randombboxcrop_meta_data_reader) = 0; }; diff --git a/rocAL/rocAL/include/master_graph.h b/rocAL/rocAL/include/master_graph.h index cf2689db9a..e1c74108d9 100644 --- a/rocAL/rocAL/include/master_graph.h +++ b/rocAL/rocAL/include/master_graph.h @@ -45,7 +45,7 @@ class MasterGraph { public: enum class Status { OK = 0, NOT_RUNNING = 1, NO_MORE_DATA = 2, NOT_IMPLEMENTED }; - MasterGraph(size_t batch_size, RaliAffinity affinity, int gpu_id, size_t cpu_threads, RaliTensorDataType output_tensor_data_type); + MasterGraph(size_t batch_size, RaliAffinity affinity, int gpu_id, size_t cpu_threads, size_t prefetch_queue_depth, RaliTensorDataType output_tensor_data_type); ~MasterGraph(); Status reset(); size_t remaining_images_count(); @@ -135,7 +135,6 @@ class MasterGraph std::shared_ptr _randombboxcrop_meta_data_reader = nullptr; bool _first_run = true; bool _processing;//!< Indicates if internal processing thread should keep processing or not - const static unsigned OUTPUT_RING_BUFFER_DEPTH = 3; const static unsigned SAMPLE_SIZE = sizeof(unsigned char); int _remaining_images_count;//!< Keeps the count of remaining images yet to be processed for the user, bool _loop;//!< Indicates if user wants to indefinitely loops through images or not @@ -145,6 +144,7 @@ class MasterGraph bool _output_routine_finished_processing = false; const RaliTensorDataType _out_data_type; bool _is_random_bbox_crop = false; + size_t _prefetch_queue_depth; }; template @@ -189,6 +189,7 @@ template<> inline std::shared_ptr MasterGraph::add_node(const s THROW("A loader already exists, cannot have more than one loader") auto node = std::make_shared(outputs[0], _device.resources()); _loader_module = node->get_loader_module(); + _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth); _root_nodes.push_back(node); for(auto& output: outputs) _image_map.insert(make_pair(output, node)); @@ -201,6 +202,7 @@ template<> inline std::shared_ptr MasterGraph::add_n THROW("A loader already exists, cannot have more than one loader") auto node = std::make_shared(outputs[0], _device.resources()); _loader_module = node->get_loader_module(); + _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth); _root_nodes.push_back(node); for(auto& output: outputs) _image_map.insert(make_pair(output, node)); @@ -213,6 +215,7 @@ template<> inline std::shared_ptr MasterGraph::add_node(const THROW("A loader already exists, cannot have more than one loader") auto node = std::make_shared(outputs[0], _device.resources()); _loader_module = node->get_loader_module(); + _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth); _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader); _root_nodes.push_back(node); for(auto& output: outputs) @@ -227,6 +230,7 @@ template<> inline std::shared_ptr MasterGraph::add THROW("A loader already exists, cannot have more than one loader") auto node = std::make_shared(outputs[0], _device.resources()); _loader_module = node->get_loader_module(); + _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth); _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader); _root_nodes.push_back(node); for(auto& output: outputs) @@ -244,6 +248,7 @@ template<> inline std::shared_ptr MasterGraph::add_node(const THROW("A loader already exists, cannot have more than one loader") auto node = std::make_shared(outputs[0], _device.resources()); _loader_module = node->get_loader_module(); + _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth); _root_nodes.push_back(node); for(auto& output: outputs) _image_map.insert(make_pair(output, node)); diff --git a/rocAL/rocAL/include/node_fused_jpeg_crop.h b/rocAL/rocAL/include/node_fused_jpeg_crop.h index c04ca15abe..3e0017d89d 100644 --- a/rocAL/rocAL/include/node_fused_jpeg_crop.h +++ b/rocAL/rocAL/include/node_fused_jpeg_crop.h @@ -32,11 +32,11 @@ class FusedJpegCropNode: public Node /// \param device_resources shard count from user /// internal_shard_count number of loader/decoders are created and each shard is loaded and decoded using separate and independent resources increasing the parallelism and performance. -#if ENABLE_HIP +#if ENABLE_HIP FusedJpegCropNode(Image *output, DeviceResourcesHip device_resources_hip); -#else +#else FusedJpegCropNode(Image *output, DeviceResources device_resources); -#endif +#endif ~FusedJpegCropNode() override; FusedJpegCropNode() = delete; /// @@ -59,8 +59,8 @@ class FusedJpegCropNode: public Node Parameter* _y_drift; Parameter* _area_factor; Parameter* _aspect_ratio; - constexpr static float X_DRIFT_RANGE [2] = {0, 1}; + constexpr static float X_DRIFT_RANGE [2] = {0, 1}; constexpr static float Y_DRIFT_RANGE [2] = {0, 1}; - constexpr static float AREA_FACTOR_RANGE[2] = {0.08, 0.99}; + constexpr static float AREA_FACTOR_RANGE[2] = {0.08, 0.99}; constexpr static float ASPECT_RATIO_RANGE[2] = {0.75, 1.33}; }; diff --git a/rocAL/rocAL/include/node_fused_jpeg_crop_single_shard.h b/rocAL/rocAL/include/node_fused_jpeg_crop_single_shard.h index 240aee4f78..29e1d4b871 100644 --- a/rocAL/rocAL/include/node_fused_jpeg_crop_single_shard.h +++ b/rocAL/rocAL/include/node_fused_jpeg_crop_single_shard.h @@ -9,9 +9,9 @@ class FusedJpegCropSingleShardNode: public Node public: #if ENABLE_HIP FusedJpegCropSingleShardNode(Image *output, DeviceResourcesHip device_resources); -#else +#else FusedJpegCropSingleShardNode(Image *output, DeviceResources device_resources); -#endif +#endif ~FusedJpegCropSingleShardNode() override; /// \param user_shard_count shard count from user @@ -35,8 +35,8 @@ class FusedJpegCropSingleShardNode: public Node Parameter* _y_drift; Parameter* _area_factor; Parameter* _aspect_ratio; - constexpr static float X_DRIFT_RANGE [2] = {0, 1}; + constexpr static float X_DRIFT_RANGE [2] = {0, 1}; constexpr static float Y_DRIFT_RANGE [2] = {0, 1}; - constexpr static float AREA_FACTOR_RANGE[2] = {0.08, 0.99}; + constexpr static float AREA_FACTOR_RANGE[2] = {0.08, 0.99}; constexpr static float ASPECT_RATIO_RANGE[2] = {0.75, 1.33}; }; diff --git a/rocAL/rocAL/include/node_image_loader.h b/rocAL/rocAL/include/node_image_loader.h index 0de8c26a58..19eb61a7ff 100644 --- a/rocAL/rocAL/include/node_image_loader.h +++ b/rocAL/rocAL/include/node_image_loader.h @@ -33,9 +33,9 @@ class ImageLoaderNode: public Node /// internal_shard_count number of loader/decoders are created and each shard is loaded and decoded using separate and independent resources increasing the parallelism and performance. #if ENABLE_HIP ImageLoaderNode(Image *output, DeviceResourcesHip device_resources); -#else +#else ImageLoaderNode(Image *output, DeviceResources device_resources); -#endif +#endif ~ImageLoaderNode() override; ImageLoaderNode() = delete; /// diff --git a/rocAL/rocAL/include/node_image_loader_single_shard.h b/rocAL/rocAL/include/node_image_loader_single_shard.h index 44603dd5f2..370fa1e061 100644 --- a/rocAL/rocAL/include/node_image_loader_single_shard.h +++ b/rocAL/rocAL/include/node_image_loader_single_shard.h @@ -30,9 +30,9 @@ class ImageLoaderSingleShardNode: public Node public: #if ENABLE_HIP ImageLoaderSingleShardNode(Image *output, DeviceResourcesHip device_resources); -#else +#else ImageLoaderSingleShardNode(Image *output, DeviceResources device_resources); -#endif +#endif ~ImageLoaderSingleShardNode() override; /// \param user_shard_count shard count from user diff --git a/rocAL/rocAL/include/rali_api.h b/rocAL/rocAL/include/rali_api.h index 81b14bb029..c6cb156214 100644 --- a/rocAL/rocAL/include/rali_api.h +++ b/rocAL/rocAL/include/rali_api.h @@ -38,7 +38,7 @@ THE SOFTWARE. /// \param gpu_id /// \param cpu_thread_count /// \return -extern "C" RaliContext RALI_API_CALL raliCreate(size_t batch_size, RaliProcessMode affinity, int gpu_id = 0, size_t cpu_thread_count = 1, RaliTensorOutputType output_tensor_data_type = RaliTensorOutputType::RALI_FP32); +extern "C" RaliContext RALI_API_CALL raliCreate(size_t batch_size, RaliProcessMode affinity, int gpu_id = 0, size_t cpu_thread_count = 1, size_t prefetch_queue_depth = 3, RaliTensorOutputType output_tensor_data_type = RaliTensorOutputType::RALI_FP32); //extern "C" RaliContext RALI_API_CALL raliCreate(size_t batch_size, RaliProcessMode affinity, int gpu_id = 0, size_t cpu_thread_count = 1); /// diff --git a/rocAL/rocAL/source/cifar10_data_loader.cpp b/rocAL/rocAL/source/cifar10_data_loader.cpp index 5ad95a43b1..4c8279f316 100644 --- a/rocAL/rocAL/source/cifar10_data_loader.cpp +++ b/rocAL/rocAL/source/cifar10_data_loader.cpp @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#include +#include #include #include "cifar10_data_loader.h" #include "vx_ext_amd.h" @@ -30,7 +30,7 @@ CIFAR10DataLoader::CIFAR10DataLoader(DeviceResourcesHip dev_resources): #else CIFAR10DataLoader::CIFAR10DataLoader(DeviceResources dev_resources): #endif -_circ_buff(dev_resources, CIRC_BUFFER_DEPTH), +_circ_buff(dev_resources), _file_load_time("file load time", DBG_TIMING), _swap_handle_time("Swap_handle_time", DBG_TIMING) { @@ -47,13 +47,21 @@ CIFAR10DataLoader::~CIFAR10DataLoader() de_init(); } +void CIFAR10DataLoader::set_prefetch_queue_depth(size_t prefetch_queue_depth) +{ + if(prefetch_queue_depth <= 0) + THROW("Prefetch quque depth value cannot be zero or negative"); + _prefetch_queue_depth = prefetch_queue_depth; +} + + size_t CIFAR10DataLoader::remaining_count() { return _remaining_image_count; } -void +void CIFAR10DataLoader::reset() { // stop the writer thread and empty the internal circular buffer @@ -74,7 +82,7 @@ CIFAR10DataLoader::reset() start_loading(); } -void +void CIFAR10DataLoader::de_init() { stop_internal_thread(); @@ -139,7 +147,7 @@ CIFAR10DataLoader::initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg _raw_img_info._roi_height.resize(batch_size); _raw_img_info._original_height.resize(_batch_size); _raw_img_info._original_width.resize(_batch_size); - _circ_buff.init(_mem_type, _output_mem_size); + _circ_buff.init(_mem_type, _output_mem_size, _prefetch_queue_depth); _is_initialized = true; LOG("Loader module initialized"); } @@ -161,7 +169,7 @@ CIFAR10DataLoader::start_loading() } -LoaderModuleStatus +LoaderModuleStatus CIFAR10DataLoader::load_routine() { LOG("Started the internal loader thread"); @@ -218,8 +226,8 @@ CIFAR10DataLoader::load_routine() last_load_status = load_status; } - // Here it sets the out-of-data flag and signal the circular buffer's internal - // read semaphore using release() call + // Here it sets the out-of-data flag and signal the circular buffer's internal + // read semaphore using release() call // , and calls the release() allows the reader thread to wake up and handle // the out-of-data case properly // It also slows down the reader thread since there is no more data to read, @@ -231,12 +239,12 @@ CIFAR10DataLoader::load_routine() return LoaderModuleStatus::OK; } -bool +bool CIFAR10DataLoader::is_out_of_data() { return (remaining_count() < _batch_size) ; } -LoaderModuleStatus +LoaderModuleStatus CIFAR10DataLoader::update_output_image() { LoaderModuleStatus status = LoaderModuleStatus::OK; @@ -254,8 +262,8 @@ CIFAR10DataLoader::update_output_image() if(_output_image->swap_handle(data_buffer)!= 0) return LoaderModuleStatus ::DEVICE_BUFFER_SWAP_FAILED; _swap_handle_time.end(); - } - else + } + else { auto data_buffer = _circ_buff.get_read_buffer_host(); _swap_handle_time.start(); diff --git a/rocAL/rocAL/source/circular_buffer.cpp b/rocAL/rocAL/source/circular_buffer.cpp index 2d784af45a..b307bdd172 100644 --- a/rocAL/rocAL/source/circular_buffer.cpp +++ b/rocAL/rocAL/source/circular_buffer.cpp @@ -23,35 +23,25 @@ THE SOFTWARE. #include "circular_buffer.h" #include "log.h" #if !ENABLE_HIP -CircularBuffer::CircularBuffer(DeviceResources ocl, size_t buffer_depth): - BUFF_DEPTH(buffer_depth), +CircularBuffer::CircularBuffer(DeviceResources ocl): _cl_cmdq(ocl.cmd_queue), _cl_context(ocl.context), _device_id(ocl.device_id), - _dev_buffer(BUFF_DEPTH, nullptr), - _host_buffer_ptrs(BUFF_DEPTH, nullptr), _write_ptr(0), _read_ptr(0), _level(0) { - for(size_t bufIdx = 0; bufIdx < BUFF_DEPTH; bufIdx++) - _dev_buffer[bufIdx] = nullptr; } #else -CircularBuffer::CircularBuffer(DeviceResourcesHip hipres, size_t buffer_depth): - BUFF_DEPTH(buffer_depth), +CircularBuffer::CircularBuffer(DeviceResourcesHip hipres): _hip_stream(hipres.hip_stream), _hip_device_id(hipres.device_id), _dev_prop(&hipres.dev_prop), - _dev_buffer(BUFF_DEPTH, nullptr), - _host_buffer_ptrs(BUFF_DEPTH, nullptr), _write_ptr(0), _read_ptr(0), _level(0) { - for(size_t bufIdx = 0; bufIdx < BUFF_DEPTH; bufIdx++) - _dev_buffer[bufIdx] = nullptr; } #endif @@ -173,8 +163,13 @@ void CircularBuffer::pop() increment_read_ptr(); _circ_image_info.pop(); } -void CircularBuffer::init(RaliMemType output_mem_type, size_t output_mem_size) +void CircularBuffer::init(RaliMemType output_mem_type, size_t output_mem_size, size_t buffer_depth) { + BUFF_DEPTH = buffer_depth; + _dev_buffer.reserve(BUFF_DEPTH); + _host_buffer_ptrs.reserve(BUFF_DEPTH); + for(size_t bufIdx = 0; bufIdx < BUFF_DEPTH; bufIdx++) + _dev_buffer[bufIdx] = nullptr; if(_initialized) return; _output_mem_type = output_mem_type; diff --git a/rocAL/rocAL/source/image_loader.cpp b/rocAL/rocAL/source/image_loader.cpp index 2580b1e856..6b179a76d3 100644 --- a/rocAL/rocAL/source/image_loader.cpp +++ b/rocAL/rocAL/source/image_loader.cpp @@ -31,9 +31,10 @@ ImageLoader::ImageLoader(DeviceResourcesHip dev_resources): #else ImageLoader::ImageLoader(DeviceResources dev_resources): #endif -_circ_buff(dev_resources, CIRC_BUFFER_DEPTH), +_circ_buff(dev_resources), _swap_handle_time("Swap_handle_time", DBG_TIMING) { + // CIRC_BUFFER_DEPTH = prefetch_queue_depth; _output_image = nullptr; _mem_type = RaliMemType::HOST; _internal_thread_running = false; @@ -48,6 +49,14 @@ ImageLoader::~ImageLoader() de_init(); } +void ImageLoader::set_prefetch_queue_depth(size_t prefetch_queue_depth) +{ + if(prefetch_queue_depth <= 0) + THROW("Prefetch quque depth value cannot be zero or negative"); + CIRC_BUFFER_DEPTH = prefetch_queue_depth; +} + + size_t ImageLoader::remaining_count() { @@ -138,7 +147,7 @@ void ImageLoader::initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cfg, _decoded_img_info._roi_width.resize(_batch_size); _decoded_img_info._original_height.resize(_batch_size); _decoded_img_info._original_width.resize(_batch_size); - _circ_buff.init(_mem_type, _output_mem_size); + _circ_buff.init(_mem_type, _output_mem_size,CIRC_BUFFER_DEPTH ); _is_initialized = true; _image_loader->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader); LOG("Loader module initialized"); diff --git a/rocAL/rocAL/source/image_loader_sharded.cpp b/rocAL/rocAL/source/image_loader_sharded.cpp index df3a3dabf7..f016be1951 100644 --- a/rocAL/rocAL/source/image_loader_sharded.cpp +++ b/rocAL/rocAL/source/image_loader_sharded.cpp @@ -32,6 +32,13 @@ ImageLoaderSharded::ImageLoaderSharded(DeviceResources dev_resources): _loader_idx = 0; } +void ImageLoaderSharded::set_prefetch_queue_depth(size_t prefetch_queue_depth) +{ + if(prefetch_queue_depth <= 0) + THROW("Prefetch quque depth value cannot be zero or negative"); + _prefetch_queue_depth = prefetch_queue_depth; +} + std::vector ImageLoaderSharded::get_id() { if(!_initialized) @@ -83,7 +90,8 @@ ImageLoaderSharded::initialize(ReaderConfig reader_cfg, DecoderConfig decoder_cf // Create loader modules for(size_t i = 0; i < _shard_count; i++) { - auto loader = std::make_shared(_dev_resources); + std::shared_ptr loader = std::make_shared(_dev_resources); + loader->set_prefetch_queue_depth(_prefetch_queue_depth); _loaders.push_back(loader); } // Initialize loader modules diff --git a/rocAL/rocAL/source/master_graph.cpp b/rocAL/rocAL/source/master_graph.cpp index aa1e5b7ecf..2de6f204c9 100644 --- a/rocAL/rocAL/source/master_graph.cpp +++ b/rocAL/rocAL/source/master_graph.cpp @@ -88,8 +88,8 @@ MasterGraph::~MasterGraph() release(); } -MasterGraph::MasterGraph(size_t batch_size, RaliAffinity affinity, int gpu_id, size_t cpu_threads, RaliTensorDataType output_tensor_data_type): - _ring_buffer(OUTPUT_RING_BUFFER_DEPTH), +MasterGraph::MasterGraph(size_t batch_size, RaliAffinity affinity, int gpu_id, size_t cpu_threads, size_t prefetch_queue_depth, RaliTensorDataType output_tensor_data_type): + _ring_buffer(prefetch_queue_depth), _output_tensor(nullptr), _graph(nullptr), _affinity(affinity), @@ -107,6 +107,7 @@ MasterGraph::MasterGraph(size_t batch_size, RaliAffinity affinity, int gpu_id, s _processing(false), _internal_batch_size(compute_optimum_internal_batch_size(batch_size, affinity)), _user_to_internal_batch_ratio (_user_batch_size/_internal_batch_size), + _prefetch_queue_depth(prefetch_queue_depth), _out_data_type(output_tensor_data_type) { try { diff --git a/rocAL/rocAL/source/node_fused_jpeg_crop_single_shard.cpp b/rocAL/rocAL/source/node_fused_jpeg_crop_single_shard.cpp index 6fdded70a7..e3a9624d82 100644 --- a/rocAL/rocAL/source/node_fused_jpeg_crop_single_shard.cpp +++ b/rocAL/rocAL/source/node_fused_jpeg_crop_single_shard.cpp @@ -4,9 +4,9 @@ #if ENABLE_HIP FusedJpegCropSingleShardNode::FusedJpegCropSingleShardNode(Image *output, DeviceResourcesHip device_resources): -#else +#else FusedJpegCropSingleShardNode::FusedJpegCropSingleShardNode(Image *output, DeviceResources device_resources): -#endif +#endif Node({}, {output}) { _loader_module = std::make_shared(device_resources); diff --git a/rocAL/rocAL/source/rali_api.cpp b/rocAL/rocAL/source/rali_api.cpp index f95fc9e19c..626fd4f8b9 100644 --- a/rocAL/rocAL/source/rali_api.cpp +++ b/rocAL/rocAL/source/rali_api.cpp @@ -41,6 +41,7 @@ raliCreate( RaliProcessMode affinity, int gpu_id, size_t cpu_thread_count, + size_t prefetch_queue_depth, RaliTensorOutputType output_tensor_data_type) { RaliContext context = nullptr; @@ -70,7 +71,7 @@ raliCreate( THROW("Unkown Rali data type") } }; - context = new Context(batch_size, translate_process_mode(affinity), gpu_id, cpu_thread_count, translate_output_data_type(output_tensor_data_type)); + context = new Context(batch_size, translate_process_mode(affinity), gpu_id, cpu_thread_count, prefetch_queue_depth, translate_output_data_type(output_tensor_data_type)); // Reset seed in case it's being randomized during context creation } catch(const std::exception& e) diff --git a/rocAL/rocAL_pybind/amd/rali/pipeline.py b/rocAL/rocAL_pybind/amd/rali/pipeline.py index 1ca000b972..b17200205b 100644 --- a/rocAL/rocAL_pybind/amd/rali/pipeline.py +++ b/rocAL/rocAL_pybind/amd/rali/pipeline.py @@ -81,10 +81,10 @@ def __init__(self, batch_size=-1, num_threads=-1, device_id=-1, seed=-1, if(rali_cpu): # print("comes to cpu") self._handle = b.raliCreate( - batch_size, types.CPU, device_id, num_threads,types.FLOAT) + batch_size, types.CPU, device_id, num_threads,prefetch_queue_depth,types.FLOAT) else: self._handle = b.raliCreate( - batch_size, types.GPU, device_id, num_threads,types.FLOAT) + batch_size, types.GPU, device_id, num_threads,prefetch_queue_depth,types.FLOAT) if(b.getStatus(self._handle) == types.OK): print("Pipeline has been created succesfully") else: diff --git a/rocAL/rocAL_pybind/rali_pybind.cpp b/rocAL/rocAL_pybind/rali_pybind.cpp index 53b247db64..e0cae9daf0 100644 --- a/rocAL/rocAL_pybind/rali_pybind.cpp +++ b/rocAL/rocAL_pybind/rali_pybind.cpp @@ -175,6 +175,7 @@ namespace rali{ py::arg("affinity"), py::arg("gpu_id") = 0, py::arg("cpu_thread_count") = 1, + py::arg("prefetch_queue_depth") = 3, py::arg("output_data_type") = 0); m.def("raliVerify",&raliVerify); m.def("raliRun",&raliRun);