diff --git a/rocAL/rocAL/include/pipeline/context.h b/rocAL/rocAL/include/pipeline/context.h
index a4849a53ac..a008f7bdbb 100644
--- a/rocAL/rocAL/include/pipeline/context.h
+++ b/rocAL/rocAL/include/pipeline/context.h
@@ -35,7 +35,6 @@ struct Context
     {
         LOG("Processing on " + STR(((affinity == RocalAffinity::CPU)?" CPU": " GPU")))
         master_graph = std::make_shared<MasterGraph>(batch_size, affinity, gpu_id, prefetch_queue_depth, output_tensor_type);
-        _internal_batch_size = master_graph->internal_batch_size();
     }
     ~Context()
     {
@@ -52,10 +51,8 @@ struct Context
         return master_graph->timing();
     }
     size_t user_batch_size() { return _user_batch_size; }
-    size_t internal_batch_size() { return _internal_batch_size; }
 private:
     void clear_errors() { error = "";}
     std::string error;
     size_t _user_batch_size;
-    size_t _internal_batch_size;
-};
\ No newline at end of file
+};
diff --git a/rocAL/rocAL/include/pipeline/master_graph.h b/rocAL/rocAL/include/pipeline/master_graph.h
index 2180b983e6..3288fbc866 100644
--- a/rocAL/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/rocAL/include/pipeline/master_graph.h
@@ -52,27 +52,16 @@ class MasterGraph
     ~MasterGraph();
     Status reset();
     size_t remaining_count();
-    MasterGraph::Status copy_out_tensor(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1, float multiplier2,
-                                        float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
-    Status copy_output(unsigned char* out_ptr, size_t out_size_in_bytes);
-    Status copy_out_tensor_planar(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1, float multiplier2,
-                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
-    size_t output_width();
-    size_t output_height();
-    size_t output_byte_size();
-    size_t output_depth();
+    rocalTensorList *get_output_tensors();
     void sequence_start_frame_number(std::vector<size_t> &sequence_start_framenum); // Returns the starting frame number of the sequences
     void sequence_frame_timestamps(std::vector<std::vector<float>> &sequence_frame_timestamp); // Returns the timestamps of the frames in the sequences
-    size_t augmentation_branch_count();
-    size_t output_sample_size();
-    RocalColorFormat output_color_format();
     Status build();
     Status run();
     Timing timing();
     RocalMemType mem_type();
     void release();
     template <typename T>
-    std::shared_ptr<T> add_node(const std::vector<Image*> &inputs, const std::vector<Image*> &outputs);
+    std::shared_ptr<T> add_node(const std::vector<rocalTensor*> &inputs, const std::vector<rocalTensor*> &outputs);
     template <typename T, typename M>
     std::shared_ptr<M> meta_add_node(std::shared_ptr<T> node);
     rocalTensor *create_tensor(const rocalTensorInfo &info, bool is_output);
     rocalTensor *create_loader_output_tensor(const rocalTensorInfo &info);
@@ -88,24 +77,15 @@ class MasterGraph
     void create_randombboxcrop_reader(RandomBBoxCrop_MetaDataReaderType reader_type, RandomBBoxCrop_MetaDataType label_type, bool all_boxes_overlap, bool no_crop, FloatParam* aspect_ratio, bool has_shape, int crop_width, int crop_height, int num_attempts, FloatParam* scaling, int total_num_attempts, int64_t seed=0);
     const std::pair<ImageNameBatch, pMetaDataBatch>& meta_data();
     void set_loop(bool val) { _loop = val; }
-    void set_output_images(const std::vector<Image*> &output_images, unsigned int num_of_outputs)
-    {
-        _output_images.resize(num_of_outputs);
-        _output_images = output_images;
-    }
-    void set_output(Image* output_image);
+    void set_output(rocalTensor* output_tensor);
     bool empty() { return (remaining_count() < (_is_sequence_reader_output ? _sequence_batch_size : _user_batch_size)); }
-    size_t internal_batch_size() { return _internal_batch_size; }
     size_t sequence_batch_size() { return _sequence_batch_size; }
     std::shared_ptr<MetaDataGraph> meta_data_graph() { return _meta_data_graph; }
     std::shared_ptr<MetaDataReader> meta_data_reader() { return _meta_data_reader; }
     bool is_random_bbox_crop() {return _is_random_bbox_crop; }
-    void set_video_loader_flag() { _is_video_loader = true; }
-    bool is_video_loader() {return _is_video_loader; }
     bool is_sequence_reader_output() {return _is_sequence_reader_output; }
     void set_sequence_reader_output() { _is_sequence_reader_output = true; }
     void set_sequence_batch_size(size_t sequence_length) { _sequence_batch_size = _user_batch_size * sequence_length; }
-    void set_sequence_batch_ratio() { _sequence_batch_ratio = _sequence_batch_size / _internal_batch_size; }
     Status get_bbox_encoded_buffers(float **boxes_buf_ptr, int **labels_buf_ptr, size_t num_encoded_boxes);
     size_t bounding_box_batch_count(int* buf, pMetaDataBatch meta_data_batch);
 #if ENABLE_OPENCL
@@ -113,33 +93,26 @@
 #endif
 private:
     Status update_node_parameters();
-    Status allocate_output_tensor();
-    Status deallocate_output_tensor();
     void create_single_graph();
     void start_processing();
     void stop_processing();
     void output_routine();
-    void output_routine_video();
     void decrease_image_count();
-    bool processing_on_device_ocl() { return _output_image_info.mem_type() == RocalMemType::OCL; };
-    bool processing_on_device_hip() { return _output_image_info.mem_type() == RocalMemType::HIP; };
-    /// notify_user_thread() is called when the internal processing thread is done with processing all available images
+    /// notify_user_thread() is called when the internal processing thread is done with processing all available tensors
    void notify_user_thread();
-    /// no_more_processed_data() is logically linked to the notify_user_thread() and is used to tell the user they've already consumed all the processed images
+    /// no_more_processed_data() is logically linked to notify_user_thread() and is used to tell the user they have already consumed all the processed tensors
     bool no_more_processed_data();
-    RingBuffer _ring_buffer;//!< The queue that keeps the images that have benn processed by the internal thread (_output_thread) asynchronous to the user's thread
+    RingBuffer _ring_buffer;//!< The queue that keeps the tensors that have been processed by the internal thread (_output_thread), asynchronously to the user's thread
     MetaDataBatch* _augmented_meta_data = nullptr;//!< The output of the meta_data_graph,
     CropCordBatch* _random_bbox_crop_cords_data = nullptr;
     std::thread _output_thread;
-    ImageInfo _output_image_info;//!< Keeps the information about ROCAL's output image , it includes all images of a batch stacked on top of each other
     rocalTensorList _internal_tensor_list; //!< Keeps a list of ovx tensors that are used to store the augmented outputs (there is an augmentation output batch per element in the list)
     rocalTensorList _output_tensor_list; //!< Keeps a list of ovx tensors(augmented outputs) that are to be passed to the user (there is an augmentation output batch per element in the list)
-    std::list<Image*> _internal_images;//!< Keeps all the ovx images (virtual/non-virtual) either intermediate images, or input images that feed the graph
+    std::list<rocalTensor*> _internal_tensors; //!< Keeps all the ovx tensors (virtual/non-virtual), either intermediate tensors or input tensors that feed the graph
     std::list<std::shared_ptr<Node>> _nodes;//!< List of all the nodes
     std::list<std::shared_ptr<Node>> _root_nodes;//!< List of all root nodes (image/video loaders)
     std::list<std::shared_ptr<Node>> _meta_data_nodes;//!< List of nodes where meta data has to be updated after augmentation
-    std::map<Image*, std::shared_ptr<Node>> _image_map;//!< key: image, value : Parent node
-    void * _output_tensor;//!< In the GPU processing case , is used to convert the U8 samples to float32 before they are being transfered back to host
+    std::map<rocalTensor*, std::shared_ptr<Node>> _tensor_map;//!< key: tensor, value : Parent node
 #if ENABLE_HIP
     DeviceManagerHip _device;//!< Keeps the device related constructs needed for running on GPU
 #elif ENABLE_OPENCL
@@ -148,10 +121,7 @@ class MasterGraph
     std::shared_ptr<Graph> _graph = nullptr;
     RocalAffinity _affinity;
     const int _gpu_id;//!< Defines the device id used for processing
-    pLoaderModule _loader_module; //!< Keeps the loader module used to feed the input the images of the graph
-#ifdef ROCAL_VIDEO
-    pVideoLoaderModule _video_loader_module; //!< Keeps the video loader module used to feed the input sequences of the graph
-#endif
+    pLoaderModule _loader_module; //!< Keeps the loader module used to feed the input tensors of the graph
     TimingDBG _convert_time, _process_time, _bencode_time;
     const size_t _user_batch_size;//!< Batch size provided by the user
     vx_context _context;
@@ -162,20 +132,15 @@ class MasterGraph
     bool _first_run = true;
     bool _processing;//!< Indicates if internal processing thread should keep processing or not
     const static unsigned SAMPLE_SIZE = sizeof(unsigned char);
-    int _remaining_count;//!< Keeps the count of remaining images yet to be processed for the user,
-    bool _loop;//!< Indicates if user wants to indefinitely loops through images or not
-    static size_t compute_optimum_internal_batch_size(size_t user_batch_size, RocalAffinity affinity);
-    const size_t _internal_batch_size;//!< In the host processing case , internal batch size can be different than _user_batch_size. This batch size used internally throughout.
-    const size_t _user_to_internal_batch_ratio;
+    int _remaining_count;//!< Keeps the count of remaining tensors yet to be processed for the user
+    bool _loop;//!< Indicates if the user wants to loop indefinitely through tensors or not
     size_t _prefetch_queue_depth;
     bool _output_routine_finished_processing = false;
     const RocalTensorDataType _out_data_type;
     bool _is_random_bbox_crop = false;
-    bool _is_video_loader = false; //!< Set to true if Video Loader is invoked.
     std::vector<std::vector<size_t>> _sequence_start_framenum_vec; //!< Stores the starting frame number of the sequences.
     std::vector<std::vector<std::vector<float>>> _sequence_frame_timestamps_vec; //!< Stores the timestamps of the frames in a sequences.
     size_t _sequence_batch_size = 0; //!< Indicates the _user_batch_size when sequence reader outputs are required
-    size_t _sequence_batch_ratio; //!< Indicates the _user_to_internal_batch_ratio when sequence reader outputs are required
     bool _is_sequence_reader_output = false; //!< Set to true if Sequence Reader is invoked.
     // box encoder variables
     bool _is_box_encoder = false; //bool variable to set the box encoder
@@ -192,23 +157,23 @@ class MasterGraph
 };
 
 template <typename T>
-std::shared_ptr<T> MasterGraph::add_node(const std::vector<Image*> &inputs, const std::vector<Image*> &outputs)
+std::shared_ptr<T> MasterGraph::add_node(const std::vector<rocalTensor*> &inputs, const std::vector<rocalTensor*> &outputs)
 {
     auto node = std::make_shared<T>(inputs, outputs);
     _nodes.push_back(node);
 
     for(auto& input: inputs)
     {
-        if (_image_map.find(input) == _image_map.end())
-            THROW("Input image is invalid, cannot be found among output of previously created nodes")
+        if (_tensor_map.find(input) == _tensor_map.end())
+            THROW("Input tensor is invalid, cannot be found among the outputs of previously created nodes")
 
-        auto parent_node = _image_map.find(input)->second;
+        auto parent_node = _tensor_map.find(input)->second;
         parent_node->add_next(node);
         node->add_previous(parent_node);
     }
 
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
@@ -219,7 +184,7 @@ std::shared_ptr<M> MasterGraph::meta_add_node(std::shared_ptr<T> node)
     auto meta_node = std::make_shared<M>();
     _meta_data_graph->_meta_nodes.push_back(meta_node);
     meta_node->_node = node;
-    meta_node->_batch_size = _is_sequence_reader_output ? _sequence_batch_size : _user_batch_size;
+    meta_node->_batch_size = _user_batch_size;
     return meta_node;
 }
 
@@ -240,11 +205,12 @@ template<> inline std::shared_ptr<ImageLoaderNode> MasterGraph::add_node(const s
     _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+
+template<> inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -257,11 +223,12 @@ template<> inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_n
     _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+
+template<> inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -275,12 +242,12 @@ template<> inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const
     _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -294,7 +261,7 @@ template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add
     _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
@@ -302,7 +269,7 @@ template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add
 /*
  * Explicit specialization for Cifar10LoaderNode
  */
-template<> inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+template<> inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -315,47 +282,46 @@ template<> inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const
     _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-#ifdef ROCAL_VIDEO
 /*
  * Explicit specialization for VideoLoaderNode
  */
-template<> inline std::shared_ptr<VideoLoaderNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+template<> inline std::shared_ptr<VideoLoaderNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
-    if(_video_loader_module)
-        THROW("A video loader already exists, cannot have more than one loader")
+    if(_loader_module)
+        THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<VideoLoaderNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<VideoLoaderNode>(outputs[0], nullptr);
-#endif
-    _video_loader_module = node->get_loader_module();
-    _video_loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+#endif
+    _loader_module = node->get_loader_module();
+    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<VideoLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+
+template<> inline std::shared_ptr<VideoLoaderSingleShardNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
-    if(_video_loader_module)
-        THROW("A video loader already exists, cannot have more than one loader")
+    if(_loader_module)
+        THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<VideoLoaderSingleShardNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<VideoLoaderSingleShardNode>(outputs[0], nullptr);
 #endif
-    _video_loader_module = node->get_loader_module();
-    _video_loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_module = node->get_loader_module();
+    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-#endif
diff --git a/rocAL/rocAL/include/pipeline/ring_buffer.h b/rocAL/rocAL/include/pipeline/ring_buffer.h
index 77618aac6d..d0148a5074 100644
--- a/rocAL/rocAL/include/pipeline/ring_buffer.h
+++ b/rocAL/rocAL/include/pipeline/ring_buffer.h
@@ -45,10 +45,10 @@ class RingBuffer
     ///\param dev
     ///\param sub_buffer_size
     ///\param sub_buffer_count
-    void init(RocalMemType mem_type, void *dev, unsigned sub_buffer_size, unsigned sub_buffer_count);
+    void init(RocalMemType mem_type, void *dev, std::vector<size_t> &sub_buffer_size);
     void initBoxEncoderMetaData(RocalMemType mem_type, size_t encoded_bbox_size, size_t encoded_labels_size);
     void release_gpu_res();
-    std::vector<void*> get_read_buffers() ;
+    std::vector<void*> get_read_buffers();
     void* get_host_master_read_buffer();
     std::vector<void*> get_write_buffers();
     std::pair<void*, void*> get_box_encode_write_buffers();
@@ -72,13 +72,12 @@ class RingBuffer
     void increment_write_ptr();
     bool full();
     const unsigned BUFF_DEPTH;
-    unsigned _sub_buffer_size;
+    std::vector<size_t> _sub_buffer_size;
     unsigned _sub_buffer_count;
     std::mutex _lock;
     std::condition_variable _wait_for_load;
     std::condition_variable _wait_for_unload;
     std::vector<std::vector<void*>> _dev_sub_buffer;
-    std::vector<void*> _host_master_buffers;
     std::vector<std::vector<void*>> _host_sub_buffers;
     std::vector<void*> _dev_bbox_buffer;
     std::vector<void*> _dev_labels_buffer;
diff --git a/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp b/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp
index 7f663b068e..e65daae795 100644
--- a/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp
+++ b/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp
@@ -301,7 +301,6 @@ rocalSequenceReader(
     // Set sequence batch size and batch ratio in master graph as it varies according to sequence length
     context->master_graph->set_sequence_reader_output();
     context->master_graph->set_sequence_batch_size(sequence_length);
-    context->master_graph->set_sequence_batch_ratio();
     bool decoder_keep_original = true;
 
     // This has been introduced to support variable width and height video frames in future.
@@ -384,7 +383,6 @@ rocalSequenceReaderSingleShard(
     // Set sequence batch size and batch ratio in master graph as it varies according to sequence length
     context->master_graph->set_sequence_reader_output();
     context->master_graph->set_sequence_batch_size(sequence_length);
-    context->master_graph->set_sequence_batch_ratio();
     bool decoder_keep_original = true;
 
     // This has been introduced to support variable width and height video frames in future.
diff --git a/rocAL/rocAL/source/pipeline/master_graph.cpp b/rocAL/rocAL/source/pipeline/master_graph.cpp
index 8938fdc8cd..9579aed9f7 100644
--- a/rocAL/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/rocAL/source/pipeline/master_graph.cpp
@@ -113,7 +113,6 @@ MasterGraph::~MasterGraph()
 MasterGraph::MasterGraph(size_t batch_size, RocalAffinity affinity, int gpu_id, size_t prefetch_queue_depth, RocalTensorDataType output_tensor_data_type):
         _ring_buffer(prefetch_queue_depth),
-        _output_tensor(nullptr),
         _graph(nullptr),
         _affinity(affinity),
         _gpu_id(gpu_id),
@@ -130,8 +129,6 @@ MasterGraph::MasterGraph(size_t batch_size, RocalAffinity affinity, int gpu_id,
 #endif
         _first_run(true),
         _processing(false),
-        _internal_batch_size(compute_optimum_internal_batch_size(batch_size, affinity)),
-        _user_to_internal_batch_ratio (_user_batch_size/_internal_batch_size),
         _prefetch_queue_depth(prefetch_queue_depth),
         _out_data_type(output_tensor_data_type),
 #if ENABLE_HIP
@@ -262,12 +259,12 @@ MasterGraph::create_single_graph()
     _graph = std::make_shared<Graph>(_context, _affinity, 0, _gpu_id);
     for(auto& node: _nodes)
     {
-        // Any image not yet created can be created as virtual image
-        for(auto& image: node->output())
-            if(image->info().type() == ImageInfo::Type::UNKNOWN)
+        // Any tensor not yet created can be created as a virtual tensor
+        for(auto& tensor: node->output())
+            if(tensor->info().type() == rocalTensorInfo::Type::UNKNOWN)
             {
-                image->create_virtual(_context, _graph->get());
-                _internal_images.push_back(image);
+                tensor->create_virtual(_context, _graph->get());
+                _internal_tensors.push_back(tensor);
             }
         node->create(_graph);
     }
@@ -277,20 +274,13 @@ MasterGraph::create_single_graph()
 MasterGraph::Status
 MasterGraph::build()
 {
-    if(_output_images.empty())
-        THROW("No output images are there, cannot create the pipeline")
+    if(_internal_tensor_list.empty())
+        THROW("No output tensors are there, cannot create the pipeline")
 
-    // Verify all output images have the same dimension, otherwise creating a unified tensor from them is not supported
-    _output_image_info = _output_images.front()->info();
-    for(auto&& output_image : _output_images)
-        if(!(output_image->info() == _output_image_info))
-            THROW("Dimension of the output images do not match")
-
-    allocate_output_tensor();
 #if ENABLE_HIP || ENABLE_OPENCL
-    _ring_buffer.init(_mem_type, (void *)_device.resources(), output_byte_size(), _output_images.size());
+    _ring_buffer.init(_mem_type, (void *)_device.resources(), _internal_tensor_list.data_size());
 #else
-    _ring_buffer.init(_mem_type, nullptr, output_byte_size(), _output_images.size());
+    _ring_buffer.init(_mem_type, nullptr, _internal_tensor_list.data_size());
 #endif
     if (_is_box_encoder) _ring_buffer.initBoxEncoderMetaData(_mem_type, _user_batch_size*_num_anchors*4*sizeof(float), _user_batch_size*_num_anchors*sizeof(int));
     create_single_graph();
@@ -320,30 +310,28 @@ MasterGraph::create_tensor(const rocalTensorInfo &info, bool is_output)
     {
         if (output->create_from_handle(_context) != 0)
             THROW("Cannot create the tensor from handle")
         _internal_tensor_list.push_back(output);
-
-        auto *output_ext = new rocalTensor(info);
-        if (output_ext->create_from_handle(_context) != 0)
-            THROW("Cannot create the tensor from handle")
-        _output_tensor_list.push_back(output_ext);
+        _output_tensor_list.push_back(new rocalTensor(info)); // Creating a replica of the output tensor to be returned to the user
     }
 
     return output;
 }
 
 void
-MasterGraph::set_output(Image* output_image)
+MasterGraph::set_output(rocalTensor* output_tensor)
 {
-    if(output_image->is_handle_set() == false)
+    if(output_tensor->is_handle_set() == false)
     {
-        if (output_image->create_from_handle(_context) != 0)
-            THROW("Cannot create the image from handle")
-        _output_images.push_back(output_image);
+        if (output_tensor->create_from_handle(_context) != 0)
+            THROW("Cannot create the tensor from handle")
+
+        _internal_tensor_list.push_back(output_tensor);
+        _output_tensor_list.push_back(new rocalTensor(output_tensor->info())); // Creating a replica of the output tensor to be returned to the user
     }
     else
     {
         // Decoder case only
-        auto actual_output = create_image(output_image->info(), true);
-        add_node<CopyNode>({output_image}, {actual_output});
+        auto actual_output = create_tensor(output_tensor->info(), true);
+        add_node<CopyNode>({output_tensor}, {actual_output});
     }
 }
 
@@ -353,23 +341,16 @@ void MasterGraph::release()
     stop_processing();
     _nodes.clear();
     _root_nodes.clear();
-    _image_map.clear();
+    _tensor_map.clear();
     _ring_buffer.release_gpu_res();
     //shut_down loader:: required for releasing any allocated resourses
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-        _video_loader_module->shut_down();
-    else
-#endif
-        _loader_module->shut_down();
+    _loader_module->shut_down();
     // release all openvx resources.
     vx_status status;
-    for(auto& image: _internal_images)
-        delete image;// It will call the vxReleaseImage internally in the destructor
-    for(auto& image: _output_images)
-        delete image;// It will call the vxReleaseImage internally in the destructor
-    deallocate_output_tensor();
-
+    for(auto& tensor: _internal_tensors)
+        delete tensor; // It will call vxReleaseTensor internally in the destructor
+    _internal_tensor_list.release(); // It will call vxReleaseTensor internally in the destructor for each tensor in the list
+    _output_tensor_list.release(); // It will call vxReleaseTensor internally in the destructor for each tensor in the list
     if(_graph != nullptr)
         _graph->release();
@@ -394,30 +375,6 @@ MasterGraph::update_node_parameters()
     return Status::OK;
 }
 
-size_t
-MasterGraph::augmentation_branch_count()
-{
-    return _output_images.size();
-}
-
-RocalColorFormat
-MasterGraph::output_color_format()
-{
-    return _output_image_info.color_format();
-}
-
-size_t
-MasterGraph::output_width()
-{
-    return _output_image_info.width();
-}
-
-size_t
-MasterGraph::output_height()
-{
-    return _output_image_info.height_batch() * (_is_sequence_reader_output ? _sequence_batch_ratio : _user_to_internal_batch_ratio);
-}
-
 void
 MasterGraph::sequence_start_frame_number(std::vector<size_t> &sequence_start_framenum)
 {
@@ -432,60 +389,6 @@ MasterGraph::sequence_frame_timestamps(std::vector<std::vector<float>> &sequence
     _sequence_frame_timestamps_vec.pop_back();
 }
 
-MasterGraph::Status
-MasterGraph::allocate_output_tensor()
-{
-#if ENABLE_OPENCL
-    if(processing_on_device_ocl())
-    {
-        // creating a float buffer that can accommodates all output images
-        size_t output_buffer_size = output_byte_size() * _output_images.size();
-        cl_int ret = CL_SUCCESS;
-        _output_tensor = nullptr;
-        size_t size = output_buffer_size*sizeof(cl_float);
-        cl_mem clImgFloat = clCreateBuffer(_device.resources()->context,
-                                           CL_MEM_READ_WRITE,
-                                           size,
-                                           nullptr, &ret);
-
-        if (!clImgFloat || ret != CL_SUCCESS)
-            THROW("clCreateBuffer of size " + TOSTR(size) + " failed " + TOSTR(ret))
-
-        _output_tensor = clImgFloat;
-    }
-#elif ENABLE_HIP
-    if (processing_on_device_hip())
-    {
-        // creating a float buffer that can accommodates all output images
-        size_t output_buffer_size = output_byte_size() * _output_images.size();
-        size_t size = (_out_data_type==RocalTensorDataType::FP32)? output_buffer_size*sizeof(float): output_buffer_size*sizeof(half);
-        hipError_t status = hipMalloc( &_output_tensor, size);
-        if (status != hipSuccess || !_output_tensor )
-            THROW("ROCAL::hipMalloc of size " + TOSTR(size) + " failed " + TOSTR(status))
-    }
-#endif
-    return Status::OK;
-}
-
-MasterGraph::Status
-MasterGraph::deallocate_output_tensor()
-{
-#if ENABLE_OPENCL
-    if(processing_on_device_ocl() && _output_tensor != nullptr)
-        clReleaseMemObject((cl_mem)_output_tensor );
-#elif ENABLE_HIP
-    if(processing_on_device_hip() && _output_tensor != nullptr) {
-        hipError_t err = hipFree(_output_tensor );
-        if (err != hipSuccess) {
-            THROW("MasterGraph::deallocate_output_tensor hipFree failed " + TOSTR(err))
-        }
-        _output_tensor = nullptr;
-    }
-#endif
-
-    return Status::OK;
-}
-
 MasterGraph::Status
 MasterGraph::reset()
 {
@@ -495,24 +398,14 @@ MasterGraph::reset()
     if(_output_thread.joinable())
         _output_thread.join();
     _ring_buffer.reset();
+    _sequence_start_framenum_vec.clear();
+    _sequence_frame_timestamps_vec.clear();
     // clearing meta ring buffer
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-    {
-        _video_loader_module->reset();
-        _sequence_start_framenum_vec.clear();
-        _sequence_frame_timestamps_vec.clear();
-    }
-    else
-#endif
-    {
-        // if random_bbox meta reader is used: read again to get different crops
-        if (_randombboxcrop_meta_data_reader != nullptr)
-            _randombboxcrop_meta_data_reader->release();
-        // resetting loader module to start from the beginning of the media and clear it's internal state/buffers
-        _loader_module->reset();
-    }
-
+    // if random_bbox meta reader is used: read again to get different crops
+    if (_randombboxcrop_meta_data_reader != nullptr)
+        _randombboxcrop_meta_data_reader->release();
+    // resetting the loader module to start from the beginning of the media and clear its internal state/buffers
+    _loader_module->reset();
     // restart processing of the images
     _first_run = true;
     _output_routine_finished_processing = false;
@@ -535,428 +428,29 @@ MasterGraph::mem_type()
 Timing
 MasterGraph::timing()
 {
-    Timing t;
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-    {
-        t = _video_loader_module->timing();
-        t.video_process_time += _process_time.get_timing();
-    }
-    else
-#endif
-    {
-        t = _loader_module->timing();
-        t.image_process_time += _process_time.get_timing();
-    }
+    Timing t = _loader_module->timing();
+    t.image_process_time += _process_time.get_timing();
     t.copy_to_output += _convert_time.get_timing();
     t.bb_process_time += _bencode_time.get_timing();
     return t;
 }
 
-#define CHECK_CL_CALL_RET(x) { cl_int ret; ret = x; if( ret != CL_SUCCESS) THROW("ocl call failed "+STR(#x)+" error "+TOSTR(ret)) }
-
-MasterGraph::Status
-MasterGraph::copy_out_tensor(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1,
-                             float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type)
+rocalTensorList *
+MasterGraph::get_output_tensors()
 {
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    if (output_color_format() == RocalColorFormat::RGB_PLANAR)
-        return MasterGraph::copy_out_tensor_planar(out_ptr,format,multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, output_data_type);
-
-    _convert_time.start();
-    // Copies to the output context given by the user
-    unsigned int n = _user_batch_size;
-    const size_t c = output_depth();
-    const size_t h = _output_image_info.height_single();
-    const size_t w = output_width();
-    const size_t single_output_image_size = output_byte_size();
-
-#if ENABLE_OPENCL
-    if(_output_image_info.mem_type() == RocalMemType::OCL)
-    {
-        if(output_data_type == RocalTensorDataType::FP16)
-            THROW("FP16 tensor output for GPU affinity is not implemented")
-        // OCL device memory
-        cl_int status;
-
-        size_t global_work_size = output_sample_size();
-        size_t local_work_size = 256;
-
-        // TODO: Use the runKernel function instead
-
-        auto kernel_name = (format == RocalTensorFormat::NHWC)? "copyInt8ToNHWC" : "copyInt8ToNCHW";
-        cl_kernel kernel = _device["utility"][kernel_name];
-        auto queue = _device.resources()->cmd_queue;
-        unsigned dest_buf_offset = 0;
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        for( auto&& out_image: output_buffers)
-        {
-            int argIdx = 0;
-            unsigned reverse_chnl = reverse_channels ? 1 : 0;
-            auto img_buffer = out_image;
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_mem), (void*)& (img_buffer)))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_mem), (void*)&_output_tensor ))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& dest_buf_offset))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& w))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& h))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& c))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& multiplier0))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& multiplier1))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& multiplier2))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& offset0))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& offset1))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& offset2))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& reverse_chnl))
-
-            if((status = clEnqueueNDRangeKernel(queue,
-                                                kernel,
-                                                1,
-                                                nullptr,
-                                                &global_work_size,
-                                                &local_work_size,
-                                                0 , nullptr, nullptr)) != CL_SUCCESS)
-                THROW("clEnqueueNDRangeKernel failed on kernel "+STR(kernel_name)+" error " + TOSTR(status))
-            dest_buf_offset += single_output_image_size;
-        }
-
-        int read_size = single_output_image_size*_output_images.size()*sizeof(cl_float);
-        if((status = clEnqueueReadBuffer(queue,
-                                         (cl_mem)_output_tensor,
-                                         CL_TRUE,
-                                         0,
-                                         read_size,
-                                         out_ptr,
-                                         0 , nullptr, nullptr)) != CL_SUCCESS)
-            THROW("clEnqueueReadBuffer failed: " + TOSTR(status))
-    }
-#elif ENABLE_HIP
-    if(_output_image_info.mem_type() == RocalMemType::HIP)
-    {
-        unsigned int fp16 = (output_data_type == RocalTensorDataType::FP16);
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        unsigned dest_buf_offset = 0;
-        // copy hip buffer to out_ptr
-        // todo:: add callback routing to exchange memory pointer to avoid extra copy
-        for( auto&& out_image: output_buffers)
-        {
-            auto img_buffer = out_image;
-            if (format == RocalTensorFormat::NHWC)
-            {
-                HipExecCopyInt8ToNHWC(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
-
-            }else
-            {
-                HipExecCopyInt8ToNCHW(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
-            }
-            dest_buf_offset += single_output_image_size;
-        }
-    }
-#endif
-    if(_output_image_info.mem_type() == RocalMemType::HOST)
-    {
-        float multiplier[3] = {multiplier0, multiplier1, multiplier2 };
-        float offset[3] = {offset0, offset1, offset2 };
-        size_t dest_buf_offset_start = 0;
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        for( auto&& out_image: output_buffers)
-        {
-            unsigned int single_image_size = w * c * h;
-            #pragma omp parallel for num_threads(_internal_batch_size)
-            for(unsigned int batchCount = 0; batchCount < n; batchCount ++)
-            {
-                size_t dest_buf_offset = dest_buf_offset_start + single_image_size*batchCount;
-                auto in_buffer = (unsigned char*)out_image + single_image_size*batchCount;
-
-                if(format == RocalTensorFormat::NHWC)
-                {
-                    if(output_data_type == RocalTensorDataType::FP32)
-                    {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr);
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_32[dest_buf_offset + channel_idx + i * c] =
-                                        offset[channel_idx] + multiplier[channel_idx] *
-                                        (reverse_channels ? (float) (in_buffer[i * c + c - channel_idx - 1])
-                                                          : (float) (in_buffer[i * c + channel_idx]));
-                        }
-                    }
-                    else if(output_data_type == RocalTensorDataType::FP16)
-                    {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr);
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + channel_idx + i * c] =
-                                        offset[channel_idx] + multiplier[channel_idx] *
-                                        (reverse_channels ? (half) (in_buffer[i * c + c - channel_idx - 1])
-                                                          : (half) (in_buffer[i * c + channel_idx]));
-                        }
-                    }
-                }
-                if(format == RocalTensorFormat::NCHW)
-                {
-                    if(output_data_type == RocalTensorDataType::FP32)
-                    {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr);
-                        auto channel_size = w * h;
-                        if(c != 3)
-                        {
-                            for(unsigned i = 0; i < channel_size; i++)
-                                output_tensor_32[dest_buf_offset + i] = offset[0] + multiplier[0]*(float)in_buffer[c*i];
-                        }
-                        else {
-                        #if (ENABLE_SIMD && __AVX2__)
-                            float *B_buf = output_tensor_32 + dest_buf_offset;
-                            float *G_buf = B_buf + channel_size;
-                            float *R_buf = G_buf + channel_size;
-
-                            __m256i mask_B, mask_G, mask_R;
-                            if (reverse_channels) {
-                                mask_B = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_R = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            } else {
-                                mask_R = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_B = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            }
-                            __m256 pmul0 = _mm256_set1_ps(multiplier0);
-                            __m256 pmul1 = _mm256_set1_ps(multiplier1);
-                            __m256 pmul2 = _mm256_set1_ps(multiplier2);
-                            __m256 padd0 = _mm256_set1_ps(offset0);
-                            __m256 padd1 = _mm256_set1_ps(offset1);
-                            __m256 padd2 = _mm256_set1_ps(offset2);
-                            unsigned int alignedLength = (channel_size & ~7); // multiple of 8
-                            unsigned int i = 0;
-
-                            __m256 fR, fG, fB;
-                            for (; i < alignedLength; i += 8) {
-                                __m256i pix0 = _mm256_loadu_si256((const __m256i *) in_buffer);
-                                pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
-                                fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
-                                fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
-                                fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                fB = _mm256_mul_ps(fB, pmul0);
-                                fG = _mm256_mul_ps(fG, pmul1);
-                                fR = _mm256_mul_ps(fR, pmul2);
-                                fB = _mm256_add_ps(fB, padd0);
-                                fG = _mm256_add_ps(fG, padd1);
-                                fR = _mm256_add_ps(fR, padd2);
-                                _mm256_storeu_ps(B_buf, fB);
-                                _mm256_storeu_ps(G_buf, fG);
-                                _mm256_storeu_ps(R_buf, fR);
-                                B_buf += 8;
-                                G_buf += 8;
-                                R_buf += 8;
-                                in_buffer += 24;
-                            }
-                            for (; i < channel_size; i++, in_buffer += 3) {
-                                *B_buf++ = (in_buffer[0] * multiplier0) + offset0;
-                                *G_buf++ = (in_buffer[1] * multiplier1) + offset1;
-                                *R_buf++ = (in_buffer[2] * multiplier2) + offset1;
-                            }
-                        #else
-                            for(unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                                for(unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_32[dest_buf_offset+channel_idx*channel_size + i] =
-                                            offset[channel_idx] + multiplier[channel_idx]*(reverse_channels ? (float)(in_buffer[(c*i+c-channel_idx-1)]) :
-                                                                                                              (float)(in_buffer[(c*i+channel_idx)]));
-                            }
-                        #endif
-                        }
-                    }
-                    else if(output_data_type == RocalTensorDataType::FP16)
-                    {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr);
-                        auto channel_size = w * h;
-                        if(c != 3) {
-                            for(unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + i] = offset[0] + multiplier[0] * (half)in_buffer[c * i];
-                        }
-                        else {
-                        #if (ENABLE_SIMD && __AVX2__)
-                            half *B_buf_16 = output_tensor_16 + dest_buf_offset;
-                            half *G_buf_16 = B_buf_16 + channel_size;
-                            half *R_buf_16 = G_buf_16 + channel_size;
-
-                            __m256i mask_B, mask_G, mask_R;
-                            if (reverse_channels) {
-                                mask_B = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_R = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            } else {
-                                mask_R = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_B = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            }
-                            __m256 pmul0 = _mm256_set1_ps(multiplier0);
-                            __m256 pmul1 = _mm256_set1_ps(multiplier1);
-                            __m256 pmul2 = _mm256_set1_ps(multiplier2);
-                            __m256 padd0 = _mm256_set1_ps(offset0);
-                            __m256 padd1 = _mm256_set1_ps(offset1);
-                            __m256 padd2 = _mm256_set1_ps(offset2);
-                            unsigned int alignedLength = (channel_size & ~7); // multiple of 8
-                            unsigned int i = 0;
-
-                            __m256 fR, fG, fB;
-                            __m128i tempR, tempG, tempB;
-                            for (; i < alignedLength; i += 8) {
-                                __m256i pix0 = _mm256_loadu_si256((const __m256i *) in_buffer);
-                                pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
-                                fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
-                                fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
-                                fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                fB = _mm256_fmadd_ps(fB, pmul0, padd0);
-                                fG = _mm256_fmadd_ps(fG, pmul1, padd1);
-                                fR = _mm256_fmadd_ps(fR, pmul2, padd2);
-                                tempB = _mm256_cvtps_ph(fB, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                tempG = _mm256_cvtps_ph(fG, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                tempR = _mm256_cvtps_ph(fR, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                _mm_storeu_si128((__m128i *)B_buf_16, tempB);
-                                _mm_storeu_si128((__m128i *)G_buf_16, tempG);
-                                _mm_storeu_si128((__m128i *)R_buf_16, tempR);
-                                B_buf_16 += 8;
-                                G_buf_16 += 8;
-                                R_buf_16 += 8;
-                                in_buffer += 24;
-                            }
-                            for (; i < channel_size; i++, in_buffer += 3) {
-                                *B_buf_16++ = (half) (in_buffer[0] * multiplier0) + offset0;
-                                *G_buf_16++ = (half) (in_buffer[1] * multiplier1) + offset1;
-                                *R_buf_16++ = (half) (in_buffer[2] * multiplier2) + offset2;
-                            }
-                        #else
-                            for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                                for (unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_16[dest_buf_offset + channel_idx * channel_size + i] =
-                                            offset[channel_idx] + multiplier[channel_idx] *
-                                            (reverse_channels ? (half) (in_buffer[(c * i + c - channel_idx - 1)])
-                                                              : (half) (in_buffer[(c * i + channel_idx)]));
-                            }
-                        #endif
-                        }
-                    }
-                } // NCHW or NHWC
-            } // for loop batch
-
-            dest_buf_offset_start += single_output_image_size;
-        }
-    }
-    _convert_time.end();
-    return Status::OK;
-}
-
-MasterGraph::Status
-MasterGraph::copy_output(unsigned char *out_ptr, size_t out_size_in_bytes)
-{
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    // Copies to the output context given by the user
-    size_t size = output_byte_size();
-    if (out_size_in_bytes != (size *_output_images.size()))
-        return MasterGraph::Status::INVALID_ARGUMENTS;
-
-    _convert_time.start();
-
-#if ENABLE_OPENCL
-    if(processing_on_device_ocl())
-    {
-        size_t dest_buf_offset = 0;
-        //NOTE: the CL_TRUE flag is only used on the last buffer read call,
-        // to avoid unnecessary sequence of synchronizations
-
-        // get_read_buffers() calls block_if_empty() internally and blocks if buffers are empty until a new batch is processed
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        auto out_image_idx = output_buffers.size();
-        for( auto&& output_handle: output_buffers)
-        {
-            bool sync_flag = (--out_image_idx == 0) ? CL_TRUE : CL_FALSE;
-            cl_int status;
-            if((status = clEnqueueReadBuffer(_device.resources()->cmd_queue,
-                                             (cl_mem) output_handle,
-                                             sync_flag?(CL_TRUE):CL_FALSE,
-                                             0,
-                                             size,
-                                             out_ptr+dest_buf_offset,
-                                             0 , nullptr, nullptr)) != CL_SUCCESS)
-                THROW("clEnqueueReadBuffer failed: " + TOSTR(status))
-            dest_buf_offset += size;
-        }
-    }
-    else {
-#elif ENABLE_HIP
-    if(processing_on_device_hip())
-    {
-        //NOTE: the CL_TRUE flag is only used on the last buffer read call,
-        // to avoid unnecessary sequence of synchronizations
-
-        // get_read_buffers() calls block_if_empty() internally and blocks if buffers are empty until a new batch is processed
-        size_t dest_buf_offset = 0;
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        for( auto&& output_handle: output_buffers)
-        {
-            hipError_t err = hipMemcpyDtoHAsync((void *)(out_ptr+dest_buf_offset), output_handle, size, _device.resources()->hip_stream);
-            if (err) {
-                THROW("hipMemcpyDtoHAsync failed: " + TOSTR(err))
-            }
-            dest_buf_offset += size;
-        }
-        // sync to finish copy
-        if (hipStreamSynchronize(_device.resources()->hip_stream) != hipSuccess)
-            THROW("hipStreamSynchronize failed for hipMemcpy ")
-
-    }
-    else {
-#endif
-        // get_host_master_read_buffer is blocking if _ring_buffer is empty, and blocks this thread till internal processing thread process a new batch and store in the _ring_buffer
-        memcpy(out_ptr, _ring_buffer.get_host_master_read_buffer(), size * _output_images.size());
-#if ENABLE_OPENCL || ENABLE_HIP
-    }
-#endif
-    _convert_time.end();
-    return Status::OK;
+    auto output_ptr = _ring_buffer.get_read_buffers();
+    for(unsigned i = 0; i < _internal_tensor_list.size(); i++)
+        _output_tensor_list[i]->set_mem_handle(output_ptr[i]);
+
+    return &_output_tensor_list;
 }
 
 void MasterGraph::output_routine()
 {
     INFO("Output routine started with "+TOSTR(_remaining_count) + " to load");
-    size_t batch_ratio = _is_sequence_reader_output ? _sequence_batch_ratio : _user_to_internal_batch_ratio;
-    if(!_is_sequence_reader_output) // _sequence_batch_ratio and _user_to_internal_batch_ratio is different. Will be removed in TensorSupport.
-    {
-#if !ENABLE_HIP
-        if(processing_on_device_ocl() && batch_ratio != 1)
-            THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#else
-        if(processing_on_device_hip() && batch_ratio != 1)
-            THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#endif
-    }
     try {
         while (_processing)
         {
-            const size_t each_cycle_size = output_byte_size()/batch_ratio;
-
             ImageNameBatch full_batch_image_names = {};
             pMetaDataBatch full_batch_meta_data = nullptr;
             pMetaDataBatch augmented_batch_meta_data = nullptr;
@@ -975,76 +469,62 @@ void MasterGraph::output_routine()
             auto write_buffers = _ring_buffer.get_write_buffers();
             _rb_block_if_full_time.end();
 
-            _process_time.start();
-            // When executing on CPU the internal batch count can be smaller than the user batch count
-            // In that case the user_batch_size will be an integer multiple of the _internal_batch_size
-            // Multiple cycles worth of internal_batch_size images should be processed to complete a full _user_batch_size
-            for(unsigned cycle_idx = 0; cycle_idx< batch_ratio; cycle_idx++)
-            {
-                // Swap handles on the input image, so that new image is loaded to be processed
-                auto load_ret = _loader_module->load_next();
-                if (load_ret != LoaderModuleStatus::OK)
-                    THROW("Loader module failed to load next batch of images, status " + TOSTR(load_ret))
+            // Swap handles on the input tensor, so that a new tensor is loaded to be processed
+            auto load_ret = _loader_module->load_next();
+            if (load_ret != LoaderModuleStatus::OK)
+                THROW("Loader module failed to load next batch of images, status " + TOSTR(load_ret))
+            if (!_processing)
+                break;
+            auto this_cycle_names = _loader_module->get_id();
+            auto decode_image_info = _loader_module->get_decode_image_info();
+            auto crop_image_info = _loader_module->get_crop_image_info();
 
-                if (!_processing)
-                    break;
-                auto this_cycle_names = _loader_module->get_id();
-                auto decode_image_info = _loader_module->get_decode_image_info();
-                auto crop_image_info = _loader_module->get_crop_image_info();
+            if(this_cycle_names.size() != _user_batch_size)
+                WRN("Internal problem: names count "+ TOSTR(this_cycle_names.size()))
 
-                if(this_cycle_names.size() != _internal_batch_size)
-                    WRN("Internal problem: names count "+ TOSTR(this_cycle_names.size()))
+            // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
+            if (_meta_data_reader)
+                _meta_data_reader->lookup(this_cycle_names);
 
-                // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
-                if (_meta_data_reader)
-                    _meta_data_reader->lookup(this_cycle_names);
+            full_batch_image_names += this_cycle_names;
 
-                full_batch_image_names += this_cycle_names;
+            if (!_processing)
+                break;
 
-                if (!_processing)
-                    break;
+            // Swap handles on the output tensors, so that the newly processed tensor will be written to a new buffer
+            for (size_t idx = 0; idx < _internal_tensor_list.size(); idx++)
+                _internal_tensor_list[idx]->swap_handle(write_buffers[idx]);
 
-                // Swap handles on the output images, so that new processed image will be written to the a new buffer
-                for (size_t idx = 0; idx < _output_images.size(); idx++)
-                {
-                    if(_affinity == RocalAffinity::GPU)
-                        _output_images[idx]->swap_handle(write_buffers[idx]);
-                    else
-                    {
-                        auto this_cycle_buffer_ptr = (unsigned char *) write_buffers[idx] + each_cycle_size * cycle_idx;
-                        _output_images[idx]->swap_handle(this_cycle_buffer_ptr);
-                    }
-                }
-
-                if (!_processing)
-                    break;
+            if (!_processing)
+                break;
 
-                for(auto node: _nodes)
+            for(auto node: _nodes)
+            {
+                if(node->_is_ssd)
                 {
-                    if(node->_is_ssd)
-                    {
-                        node->set_meta_data(_augmented_meta_data);
-                    }
+                    node->set_meta_data(_augmented_meta_data);
                 }
+            }
 
-                update_node_parameters();
-                if(_augmented_meta_data)
+            update_node_parameters();
+            if(_augmented_meta_data)
+            {
+                if (_meta_data_graph)
                 {
-                    if (_meta_data_graph)
+                    if(_is_random_bbox_crop)
                     {
-                        if(_is_random_bbox_crop)
-                        {
-                            _meta_data_graph->update_random_bbox_meta_data(_augmented_meta_data, decode_image_info, crop_image_info);
-                        }
-                        _meta_data_graph->process(_augmented_meta_data);
+                        _meta_data_graph->update_random_bbox_meta_data(_augmented_meta_data, decode_image_info, crop_image_info);
                     }
-                    if (full_batch_meta_data)
-                        full_batch_meta_data->concatenate(_augmented_meta_data);
-                    else
-                        full_batch_meta_data = _augmented_meta_data->clone();
+                    _meta_data_graph->process(_augmented_meta_data);
                 }
-                _graph->process();
+                if (full_batch_meta_data)
+                    full_batch_meta_data->concatenate(_augmented_meta_data);
+                else
+                    full_batch_meta_data = _augmented_meta_data->clone();
             }
+            _process_time.start();
+            _graph->process();
+            _process_time.end();
             _bencode_time.start();
             if(_is_box_encoder )
             {
@@ -1062,126 +542,6 @@
             _ring_buffer.set_meta_data(full_batch_image_names, full_batch_meta_data);
             _ring_buffer.push(); // Image data and metadata is now stored in output the ring_buffer, increases it's level by 1
         }
-        _process_time.end();
-
     }
     catch (const std::exception &e)
     {
         ERR("Exception thrown in the process routine: " + STR(e.what()) + STR("\n"));
         _processing = false;
         _ring_buffer.release_all_blocked_calls();
     }
 }
 
-#ifdef ROCAL_VIDEO
-void MasterGraph::output_routine_video()
-{
-    _process_time.start();
-    INFO("Output routine of video pipeline started with "+TOSTR(_remaining_count) + " to load");
-#if !ENABLE_HIP
-    if(processing_on_device_ocl() && _user_to_internal_batch_ratio != 1)
-        THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#else
-    if(processing_on_device_hip() && _user_to_internal_batch_ratio != 1)
-        THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#endif
-    try {
-        while (_processing)
-        {
-            const size_t each_cycle_size = output_byte_size()/_user_to_internal_batch_ratio;
-
-            ImageNameBatch full_batch_image_names = {};
-            pMetaDataBatch full_batch_meta_data = nullptr;
-            pMetaDataBatch augmented_batch_meta_data = nullptr;
-            if (_video_loader_module->remaining_count() < _user_batch_size)
-            {
-                // If the internal process routine ,output_routine_video(), has finished processing all the images, and last
-                // processed images stored in the _ring_buffer will be consumed by the user when it calls the run() func
-                notify_user_thread();
-                // the following call is required in case the ring buffer is waiting for more data to be loaded and there is no more data to process.
-                _ring_buffer.release_if_empty();
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-                continue;
-            }
-
-            // _ring_buffer.get_write_buffers() is blocking and blocks here until user uses processed image by calling run() and frees space in the ring_buffer
-            _rb_block_if_full_time.start();
-            auto write_buffers = _ring_buffer.get_write_buffers();
-            _rb_block_if_full_time.end();
-
-            // When executing on CPU the internal batch count can be smaller than the user batch count
-            // In that case the user_batch_size will be an integer multiple of the _internal_batch_size
-            // Multiple cycles worth of internal_batch_size images should be processed to complete a full _user_batch_size
-            for(unsigned cycle_idx = 0; cycle_idx< _user_to_internal_batch_ratio; cycle_idx++)
-            {
-                // Swap handles on the input sequence, so that new sequence is loaded to be processed
-                auto load_ret = _video_loader_module->load_next();
-                if (load_ret != VideoLoaderModuleStatus::OK)
-                    THROW("Video Loader module failed to load next batch of images, status " + TOSTR(load_ret))
-
-                if (!_processing)
-                    break;
-                auto this_cycle_names = _video_loader_module->get_id();
-                auto decode_image_info = _video_loader_module->get_decode_image_info();
-                _sequence_start_framenum_vec.insert(_sequence_start_framenum_vec.begin(), _video_loader_module->get_sequence_start_frame_number());
-                _sequence_frame_timestamps_vec.insert(_sequence_frame_timestamps_vec.begin(), _video_loader_module->get_sequence_frame_timestamps());
-
-                if(this_cycle_names.size() != _internal_batch_size)
-                    WRN("Internal problem: names count "+ TOSTR(this_cycle_names.size()))
-
-                // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
-                if (_meta_data_reader)
-                    _meta_data_reader->lookup(this_cycle_names);
-
-                full_batch_image_names += this_cycle_names;
-
-                if (!_processing)
-                    break;
-
-                // Swap handles on the output images, so that new processed image will be written to the a new buffer
-                for (size_t idx = 0; idx < _output_images.size(); idx++)
-                {
-                    if(_affinity == RocalAffinity::GPU)
-                        _output_images[idx]->swap_handle(write_buffers[idx]);
-                    else
-                    {
-                        auto this_cycle_buffer_ptr = (unsigned char *) write_buffers[idx] + each_cycle_size * cycle_idx;
-                        _output_images[idx]->swap_handle(this_cycle_buffer_ptr);
-                    }
-                }
-
-                if (!_processing)
-                    break;
-
-                for(auto node: _nodes)
-                {
-                    if(node->_is_ssd)
-                    {
-                        node->set_meta_data(_augmented_meta_data);
-                    }
-                }
-
-                update_node_parameters();
-                if(_augmented_meta_data)
-                {
-                    if (_meta_data_graph)
-                    {
-                        _meta_data_graph->process(_augmented_meta_data);
-                    }
-                    if (full_batch_meta_data)
-                        full_batch_meta_data->concatenate(_augmented_meta_data);
-                    else
-                        full_batch_meta_data = _augmented_meta_data->clone();
-                }
-                _graph->process();
-            }
-            if(_is_box_encoder )
-            {
-                _meta_data_graph->update_box_encoder_meta_data(&_anchors, full_batch_meta_data, _criteria, _offset, _scale, _means, _stds);
-            }
-            _ring_buffer.set_meta_data(full_batch_image_names, full_batch_meta_data);
-            _ring_buffer.push(); // Image data and metadata is now stored in output the ring_buffer, increases it's level by 1
-        }
-    }
     catch (const std::exception &e)
     {
@@ -1189,25 +549,13 @@ void MasterGraph::output_routine_video()
         ERR("Exception thrown in the process routine: " + STR(e.what()) + STR("\n"));
         _processing = false;
         _ring_buffer.release_all_blocked_calls();
     }
-    _process_time.end();
 }
-#endif
 
 void MasterGraph::start_processing()
 {
     _processing = true;
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-    {
-        _remaining_count = _video_loader_module->remaining_count();
-        _output_thread = std::thread(&MasterGraph::output_routine_video, this);
-    }
-    else
-#endif
-    {
-        _remaining_count = _loader_module->remaining_count();
-        _output_thread = std::thread(&MasterGraph::output_routine, this);
-    }
+    _remaining_count = _loader_module->remaining_count();
+    _output_thread = std::thread(&MasterGraph::output_routine, this);
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
 #else
     // Changing thread scheduling policy and it's priority does not help on latest Ubuntu builds
@@ -1422,64 +770,6 @@ size_t MasterGraph::bounding_box_batch_count(int *buf, pMetaDataBatch meta_data_
     return size;
 }
 
-size_t MasterGraph::compute_optimum_internal_batch_size(size_t user_batch_size, RocalAffinity affinity)
-{
-    const unsigned MINIMUM_CPU_THREAD_COUNT = 2;
-    const unsigned DEFAULT_SMT_COUNT = 2;
-
-
-    if(affinity == RocalAffinity::GPU)
-        return user_batch_size;
-
-    unsigned THREAD_COUNT = std::thread::hardware_concurrency();
-    if(THREAD_COUNT >= MINIMUM_CPU_THREAD_COUNT)
-    {
-        INFO("Can run " + TOSTR(THREAD_COUNT) + " threads simultaneously on this machine")
-    }
-    else
-    {
-        THREAD_COUNT = MINIMUM_CPU_THREAD_COUNT;
-        WRN("hardware_concurrency() call failed assuming can run " + TOSTR(THREAD_COUNT) + " threads")
-    }
-    size_t ret = user_batch_size;
-    size_t CORE_COUNT = THREAD_COUNT / DEFAULT_SMT_COUNT;
-
-    if(CORE_COUNT <= 0)
-        THROW("Wrong core count detected less than 0")
-
-    for( size_t i = CORE_COUNT; i <= THREAD_COUNT; i++)
-        if(user_batch_size % i == 0)
-        {
-            ret = i;
-            break;
-        }
-
-    for(size_t i = CORE_COUNT; i > 1; i--)
-        if(user_batch_size % i == 0)
-        {
-            ret = i;
-            break;
-        }
-    INFO("User batch size "+ TOSTR(user_batch_size)+" Internal batch size set to "+ TOSTR(ret))
-    return ret;
-}
-
-size_t MasterGraph::output_sample_size()
-{
-    return output_height() * output_width() * output_depth();
-}
-
-size_t MasterGraph::output_byte_size()
-{
-    return output_height() * output_width() * output_depth() * SAMPLE_SIZE;
-}
-
-size_t MasterGraph::output_depth()
-{
-    return _output_image_info.color_plane_count();
-}
-
 void MasterGraph::notify_user_thread()
 {
     if(_output_routine_finished_processing)
@@ -1493,163 +783,6 @@ bool MasterGraph::no_more_processed_data()
     return (_output_routine_finished_processing && _ring_buffer.empty());
 }
 
-MasterGraph::Status
-MasterGraph::copy_out_tensor_planar(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1,
-                                    float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type)
-{
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    _convert_time.start();
-    // Copies to the output context given by the user, each image is copied separate for planar
-    const size_t w = output_width();
-    const size_t h = _output_image_info.height_single();
-    const size_t c = output_depth();
-    const size_t n = _output_image_info.batch_size();
-
-    const size_t single_output_image_size = output_byte_size();
-
-
-    if(_output_image_info.mem_type() == RocalMemType::OCL || _output_image_info.mem_type() == RocalMemType::HIP)
-    {
-        THROW("copy_out_tensor_planar for GPU affinity is not implemented")
-    }
-    if(_output_image_info.mem_type() == RocalMemType::HOST)
-    {
-        float multiplier[3] = {multiplier0, multiplier1, multiplier2 };
-        float offset[3] = {offset0, offset1, offset2 };
-        size_t dest_buf_offset = 0;
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-
-        for( auto&& out_image: output_buffers)
-        {
-            for (unsigned batch = 0; batch < n ; batch++) {
-
-MasterGraph::Status
-MasterGraph::copy_out_tensor_planar(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1,
-                                    float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type)
-{
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    _convert_time.start();
-    // Copies to the output context given by the user, each image is copied separate for planar
-    const size_t w = output_width();
-    const size_t h = _output_image_info.height_single();
-    const size_t c = output_depth();
-    const size_t n = _output_image_info.batch_size();
-
-    const size_t single_output_image_size = output_byte_size();
-
-
-    if(_output_image_info.mem_type() == RocalMemType::OCL || _output_image_info.mem_type() == RocalMemType::HIP)
-    {
-        THROW("copy_out_tensor_planar for GPU affinity is not implemented")
-    }
-    if(_output_image_info.mem_type() == RocalMemType::HOST)
-    {
-        float multiplier[3] = {multiplier0, multiplier1, multiplier2 };
-        float offset[3] = {offset0, offset1, offset2 };
-        size_t dest_buf_offset = 0;
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-
-        for( auto&& out_image: output_buffers)
-        {
-            for (unsigned batch = 0; batch < n ; batch++) {
-                const size_t batch_offset = w*h*c*batch;
-                auto in_buffer = (unsigned char *) out_image + batch_offset;
-                if (format == RocalTensorFormat::NHWC) {
-                    if (output_data_type == RocalTensorDataType::FP32) {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr) + batch_offset;
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_32[dest_buf_offset + channel_idx + i * c] =
-                                    offset[channel_idx] + multiplier[channel_idx] *
-                                    (reverse_channels ? (float) (in_buffer[i + (c - channel_idx - 1) * channel_size])
-                                                      : (float) (in_buffer[i + channel_idx * channel_size]));
-                    } else if (output_data_type == RocalTensorDataType::FP16) {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr) + batch_offset;
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + channel_idx + i * c] =
-                                    offset[channel_idx] + multiplier[channel_idx] *
-                                    (reverse_channels ? (half) (in_buffer[(c - channel_idx - 1) * channel_size + i])
-                                                      : (half) (in_buffer[channel_idx * channel_size + i]));
-                    }
-                }
-                if (format == RocalTensorFormat::NCHW) {
-                    if (output_data_type == RocalTensorDataType::FP32) {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr) + batch_offset;
-                        //output_tensor_32 += batch_offset;
-                        auto channel_size = w * h;
-                        if (c != 3) {
-                            for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                                for (unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_32[dest_buf_offset + channel_idx * channel_size + i] =
-                                        offset[channel_idx] + multiplier[channel_idx] *
-                                        (reverse_channels ? (float) (in_buffer[(c - channel_idx - 1) * channel_size + i])
-                                                          : (float) (in_buffer[channel_idx * channel_size + i]));
-                        } else {
-#if (ENABLE_SIMD && __AVX2__)
-
-                            float *B_buf = output_tensor_32 + dest_buf_offset;
-                            float *G_buf = B_buf + channel_size;
-                            float *R_buf = G_buf + channel_size;
-                            unsigned char *in_buffer_R = in_buffer;
-                            unsigned char *in_buffer_G = in_buffer + channel_size;
-                            unsigned char *in_buffer_B = in_buffer_G + channel_size;
-
-                            __m256 pmul0 = _mm256_set1_ps(multiplier0);
-                            __m256 pmul1 = _mm256_set1_ps(multiplier1);
-                            __m256 pmul2 = _mm256_set1_ps(multiplier2);
-                            __m256 padd0 = _mm256_set1_ps(offset0);
-                            __m256 padd1 = _mm256_set1_ps(offset1);
-                            __m256 padd2 = _mm256_set1_ps(offset2);
-                            unsigned int alignedLength = (channel_size & ~7);    // multiple of 8
-                            unsigned int i = 0;
-
-                            __m256 fR, fG, fB;
-                            for (; i < alignedLength; i += 8) {
-                                __m128i pixR, pixG, pixB;
-                                if (reverse_channels) {
-                                    pixB = _mm_loadl_epi64((const __m128i *) in_buffer_R);
-                                    pixG = _mm_loadl_epi64((const __m128i *) in_buffer_G);
-                                    pixR = _mm_loadl_epi64((const __m128i *) in_buffer_B);
-                                } else {
-                                    pixR = _mm_loadl_epi64((const __m128i *) in_buffer_R);
-                                    pixG = _mm_loadl_epi64((const __m128i *) in_buffer_G);
-                                    pixB = _mm_loadl_epi64((const __m128i *) in_buffer_B);
-                                }
-                                fB = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pixR));
-                                fG = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pixG));
-                                fR = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pixB));
-                                fB = _mm256_mul_ps(fB, pmul0);
-                                fG = _mm256_mul_ps(fG, pmul1);
-                                fR = _mm256_mul_ps(fR, pmul2);
-                                fB = _mm256_add_ps(fB, padd0);
-                                fG = _mm256_add_ps(fG, padd1);
-                                fR = _mm256_add_ps(fR, padd2);
-                                _mm256_storeu_ps(B_buf, fB);
-                                _mm256_storeu_ps(G_buf, fG);
-                                _mm256_storeu_ps(R_buf, fR);
-                                B_buf += 8;
-                                G_buf += 8;
-                                R_buf += 8;
-                                in_buffer_R += 8, in_buffer_G += 8, in_buffer_B += 8;
-                            }
-                            for (; i < channel_size; i++) {
-                                *B_buf++ = (*in_buffer_R++ * multiplier0) + offset0;
-                                *G_buf++ = (*in_buffer_G++ * multiplier1) + offset1;
-                                *R_buf++ = (*in_buffer_B++ * multiplier2) + offset1;
-                            }
-
-#else
-                            for(unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                                for(unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_32[dest_buf_offset+channel_idx*channel_size + i] =
-                                        offset[channel_idx] + multiplier[channel_idx]*(reverse_channels ? (float)(in_buffer[i+(c-channel_idx-1)*channel_size]) : (float)(in_buffer[i+channel_idx*channel_size]));
-#endif
-                        }
-                    } else if (output_data_type == RocalTensorDataType::FP16) {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr) + batch_offset;
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + channel_idx * channel_size + i] =
-                                    offset[channel_idx] + multiplier[channel_idx] *
-                                    (reverse_channels ? (half) (in_buffer[i + (c - channel_idx - 1) * channel_size])
-                                                      : (half) (in_buffer[i + channel_idx * channel_size]));
-                    }
-                }
-            }
-            dest_buf_offset += single_output_image_size;
-        }
-    }
-    _convert_time.end();
-    return Status::OK;
-}
 
 MasterGraph::Status
 MasterGraph::get_bbox_encoded_buffers(float **boxes_buf_ptr, int **labels_buf_ptr, size_t num_encoded_boxes)
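The AVX2 block removed above is a planar uint8-to-float normalize: load 8 bytes per colour plane, widen u8 to i32 to f32 with _mm256_cvtepu8_epi32 and _mm256_cvtepi32_ps, apply out = in * multiplier + offset, and finish the remainder in scalar code. (The deleted scalar tail added offset1 instead of offset2 to the third plane, which looks like a copy-paste slip; it disappears along with the function.) A single-plane sketch of the same technique, with an illustrative function name, built with -mavx2:

    #include <immintrin.h>
    #include <cstddef>

    void u8_plane_to_f32(const unsigned char* in, float* out, size_t n,
                         float mul, float off) {
        const __m256 vmul = _mm256_set1_ps(mul);
        const __m256 voff = _mm256_set1_ps(off);
        size_t i = 0;
        for (; i + 8 <= n; i += 8) {
            // Load 8 bytes, widen u8 -> i32 -> f32, then scale and offset.
            __m128i pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(in + i));
            __m256 f = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pix));
            _mm256_storeu_ps(out + i, _mm256_add_ps(_mm256_mul_ps(f, vmul), voff));
        }
        for (; i < n; i++)               // scalar tail for the last n % 8 pixels
            out[i] = in[i] * mul + off;
    }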
of size " + TOSTR(sub_buffer_size) + " index " + TOSTR(sub_idx) + + THROW("clCreateBuffer of size " + TOSTR(_sub_buffer_size[sub_idx]) + " index " + TOSTR(sub_idx) + " failed " + TOSTR(err)); } @@ -159,7 +151,7 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, unsigned sub_buffer_s } } - else + else { #elif ENABLE_HIP DeviceResourcesHip *dev_hip = static_cast(_dev); @@ -172,33 +164,30 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, unsigned sub_buffer_s for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) { - _dev_sub_buffer[buffIdx].resize(_sub_buffer_count); - for(unsigned sub_idx = 0; sub_idx < _sub_buffer_count; sub_idx++) + _dev_sub_buffer[buffIdx].resize(sub_buffer_count); + for(unsigned sub_idx = 0; sub_idx < sub_buffer_count; sub_idx++) { - hipError_t err = hipMalloc(&_dev_sub_buffer[buffIdx][sub_idx], sub_buffer_size); - //printf("allocated HIP device buffer <%d, %d, %d, %p>\n", buffIdx, sub_idx, sub_buffer_size, _dev_sub_buffer[buffIdx][sub_idx]); + hipError_t err = hipMalloc(&_dev_sub_buffer[buffIdx][sub_idx], _sub_buffer_size[sub_idx]); + //printf("allocated HIP device buffer <%d, %d, %d, %p>\n", buffIdx, sub_idx, _sub_buffer_size[sub_idx], _dev_sub_buffer[buffIdx][sub_idx]); if(err != hipSuccess) { _dev_sub_buffer.clear(); - THROW("hipMalloc of size " + TOSTR(sub_buffer_size) + " index " + TOSTR(sub_idx) + + THROW("hipMalloc of size " + TOSTR(_sub_buffer_size[sub_idx]) + " index " + TOSTR(sub_idx) + " failed " + TOSTR(err)); } } } } - else + else { #endif - _host_sub_buffers.resize(BUFF_DEPTH); for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) { - const size_t master_buffer_size = sub_buffer_size * sub_buffer_count; // a minimum of extra MEM_ALIGNMENT is allocated - _host_master_buffers[buffIdx] = aligned_alloc(MEM_ALIGNMENT, MEM_ALIGNMENT * (master_buffer_size / MEM_ALIGNMENT + 1)); - _host_sub_buffers[buffIdx].resize(_sub_buffer_count); - for(size_t sub_buff_idx = 0; sub_buff_idx < _sub_buffer_count; sub_buff_idx++) - _host_sub_buffers[buffIdx][sub_buff_idx] = (unsigned char*)_host_master_buffers[buffIdx] + _sub_buffer_size * sub_buff_idx; + _host_sub_buffers[buffIdx].resize(sub_buffer_count); + for(size_t sub_buff_idx = 0; sub_buff_idx < sub_buffer_count; sub_buff_idx++) + _host_sub_buffers[buffIdx][sub_buff_idx] = aligned_alloc(MEM_ALIGNMENT, MEM_ALIGNMENT * (_sub_buffer_size[sub_buff_idx] / MEM_ALIGNMENT + 1)); } #if ENABLE_OPENCL || ENABLE_HIP } @@ -319,12 +308,10 @@ void RingBuffer::release_gpu_res() RingBuffer::~RingBuffer() { if (_mem_type == RocalMemType::HOST) { - for (unsigned idx = 0; idx < _host_master_buffers.size(); idx++) - if (_host_master_buffers[idx]) { - free(_host_master_buffers[idx]); - } - - _host_master_buffers.clear(); + for (unsigned buffIdx = 0; buffIdx < _host_sub_buffers.size(); buffIdx++) + for (unsigned sub_buf_idx = 0; sub_buf_idx < _host_sub_buffers[buffIdx].size(); sub_buf_idx++) + if (_host_sub_buffers[buffIdx][sub_buf_idx]) + free(_host_sub_buffers[buffIdx][sub_buf_idx]); _host_sub_buffers.clear(); } } diff --git a/rocAL/rocAL/source/pipeline/tensor.cpp b/rocAL/rocAL/source/pipeline/tensor.cpp index 86838653ac..95e5e25122 100644 --- a/rocAL/rocAL/source/pipeline/tensor.cpp +++ b/rocAL/rocAL/source/pipeline/tensor.cpp @@ -316,7 +316,7 @@ unsigned rocalTensor::copy_data(hipStream_t stream, void *host_memory, bool sync #endif unsigned rocalTensor::copy_data(void *user_buffer) { - if (_info._type != rocalTensorInfo::Type::HANDLE) return 0; + if (_mem_handle == nullptr) return 0; #if ENABLE_HIP if 
diff --git a/rocAL/rocAL/source/pipeline/tensor.cpp b/rocAL/rocAL/source/pipeline/tensor.cpp
index 86838653ac..95e5e25122 100644
--- a/rocAL/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/rocAL/source/pipeline/tensor.cpp
@@ -316,7 +316,7 @@ unsigned rocalTensor::copy_data(hipStream_t stream, void *host_memory, bool sync
 #endif
 
 unsigned rocalTensor::copy_data(void *user_buffer) {
-    if (_info._type != rocalTensorInfo::Type::HANDLE) return 0;
+    if (_mem_handle == nullptr) return 0;
 
 #if ENABLE_HIP
     if (_info._mem_type == RocalMemType::HIP) {