diff --git a/rocAL/rocAL/include/pipeline/context.h b/rocAL/rocAL/include/pipeline/context.h
index a4849a53ac..a008f7bdbb 100644
--- a/rocAL/rocAL/include/pipeline/context.h
+++ b/rocAL/rocAL/include/pipeline/context.h
@@ -35,7 +35,6 @@ struct Context
     {
         LOG("Processing on " + STR(((affinity == RocalAffinity::CPU)?" CPU": " GPU")))
         master_graph = std::make_shared<MasterGraph>(batch_size, affinity, gpu_id, prefetch_queue_depth, output_tensor_type);
-        _internal_batch_size = master_graph->internal_batch_size();
     }
     ~Context()
     {
@@ -52,10 +51,8 @@ struct Context
         return master_graph->timing();
     }
     size_t user_batch_size() { return _user_batch_size; }
-    size_t internal_batch_size() { return _internal_batch_size; }
 private:
     void clear_errors() { error = "";}
     std::string error;
     size_t _user_batch_size;
-    size_t _internal_batch_size;
-};
\ No newline at end of file
+};
diff --git a/rocAL/rocAL/include/pipeline/master_graph.h b/rocAL/rocAL/include/pipeline/master_graph.h
index 2180b983e6..3288fbc866 100644
--- a/rocAL/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/rocAL/include/pipeline/master_graph.h
@@ -52,27 +52,16 @@ class MasterGraph
     ~MasterGraph();
     Status reset();
     size_t remaining_count();
-    MasterGraph::Status copy_out_tensor(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1, float multiplier2,
-                                        float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
-    Status copy_output(unsigned char* out_ptr, size_t out_size_in_bytes);
-    Status copy_out_tensor_planar(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1, float multiplier2,
-                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
-    size_t output_width();
-    size_t output_height();
-    size_t output_byte_size();
-    size_t output_depth();
+    rocalTensorList *get_output_tensors();
     void sequence_start_frame_number(std::vector<size_t> &sequence_start_framenum); // Returns the starting frame number of the sequences
     void sequence_frame_timestamps(std::vector<std::vector<float>> &sequence_frame_timestamp); // Returns the timestamps of the frames in the sequences
-    size_t augmentation_branch_count();
-    size_t output_sample_size();
-    RocalColorFormat output_color_format();
     Status build();
     Status run();
     Timing timing();
     RocalMemType mem_type();
     void release();
     template <typename T>
-    std::shared_ptr<T> add_node(const std::vector<Image*> &inputs, const std::vector<Image*> &outputs);
+    std::shared_ptr<T> add_node(const std::vector<rocalTensor*> &inputs, const std::vector<rocalTensor*> &outputs);
     template <typename T, typename M>
     std::shared_ptr<M> meta_add_node(std::shared_ptr<T> node);
     rocalTensor *create_tensor(const rocalTensorInfo &info, bool is_output);
     rocalTensor *create_loader_output_tensor(const rocalTensorInfo &info);
@@ -88,24 +77,15 @@ class MasterGraph
     void create_randombboxcrop_reader(RandomBBoxCrop_MetaDataReaderType reader_type, RandomBBoxCrop_MetaDataType label_type, bool all_boxes_overlap, bool no_crop, FloatParam* aspect_ratio, bool has_shape, int crop_width, int crop_height, int num_attempts, FloatParam* scaling, int total_num_attempts, int64_t seed=0);
     const std::pair<ImageNameBatch, pMetaDataBatch>& meta_data();
     void set_loop(bool val) { _loop = val; }
-    void set_output_images(const std::vector<Image*> &output_images, unsigned int num_of_outputs)
-    {
-        _output_images.resize(num_of_outputs);
-        _output_images = output_images;
-    }
-    void set_output(Image* output_image);
+    void set_output(rocalTensor* output_tensor);
     bool empty() { return (remaining_count() < (_is_sequence_reader_output ? _sequence_batch_size : _user_batch_size)); }
-    size_t internal_batch_size() { return _internal_batch_size; }
     size_t sequence_batch_size() { return _sequence_batch_size; }
     std::shared_ptr<MetaDataGraph> meta_data_graph() { return _meta_data_graph; }
     std::shared_ptr<MetaDataReader> meta_data_reader() { return _meta_data_reader; }
     bool is_random_bbox_crop() {return _is_random_bbox_crop; }
-    void set_video_loader_flag() { _is_video_loader = true; }
-    bool is_video_loader() {return _is_video_loader; }
     bool is_sequence_reader_output() {return _is_sequence_reader_output; }
     void set_sequence_reader_output() { _is_sequence_reader_output = true; }
     void set_sequence_batch_size(size_t sequence_length) { _sequence_batch_size = _user_batch_size * sequence_length; }
-    void set_sequence_batch_ratio() { _sequence_batch_ratio = _sequence_batch_size / _internal_batch_size; }
     Status get_bbox_encoded_buffers(float **boxes_buf_ptr, int **labels_buf_ptr, size_t num_encoded_boxes);
     size_t bounding_box_batch_count(int* buf, pMetaDataBatch meta_data_batch);
 #if ENABLE_OPENCL
@@ -113,33 +93,26 @@
 #endif
 private:
     Status update_node_parameters();
-    Status allocate_output_tensor();
-    Status deallocate_output_tensor();
     void create_single_graph();
     void start_processing();
     void stop_processing();
     void output_routine();
-    void output_routine_video();
     void decrease_image_count();
-    bool processing_on_device_ocl() { return _output_image_info.mem_type() == RocalMemType::OCL; };
-    bool processing_on_device_hip() { return _output_image_info.mem_type() == RocalMemType::HIP; };
-    /// notify_user_thread() is called when the internal processing thread is done with processing all available images
+    /// notify_user_thread() is called when the internal processing thread is done with processing all available tensors
    void notify_user_thread();
-    /// no_more_processed_data() is logically linked to the notify_user_thread() and is used to tell the user they've already consumed all the processed images
+    /// no_more_processed_data() is logically linked to notify_user_thread() and is used to tell the user they have already consumed all the processed tensors
     bool no_more_processed_data();
-    RingBuffer _ring_buffer;//!< The queue that keeps the images that have benn processed by the internal thread (_output_thread) asynchronous to the user's thread
+    RingBuffer _ring_buffer;//!< The queue that keeps the tensors that have been processed by the internal thread (_output_thread), asynchronously to the user's thread
     MetaDataBatch* _augmented_meta_data = nullptr;//!< The output of the meta_data_graph,
     CropCordBatch* _random_bbox_crop_cords_data = nullptr;
     std::thread _output_thread;
-    ImageInfo _output_image_info;//!< Keeps the information about ROCAL's output image , it includes all images of a batch stacked on top of each other
     rocalTensorList _internal_tensor_list; //!< Keeps a list of ovx tensors that are used to store the augmented outputs (there is an augmentation output batch per element in the list)
     rocalTensorList _output_tensor_list; //!< Keeps a list of ovx tensors(augmented outputs) that are to be passed to the user (there is an augmentation output batch per element in the list)
-    std::list<Image*> _internal_images;//!< Keeps all the ovx images (virtual/non-virtual) either intermediate images, or input images that feed the graph
+    std::list<rocalTensor*> _internal_tensors; //!< Keeps all the ovx tensors (virtual/non-virtual), either intermediate tensors or input tensors that feed the graph
     std::list<std::shared_ptr<Node>> _nodes;//!< List of all the nodes
     std::list<std::shared_ptr<Node>> _root_nodes;//!< List of all root nodes (image/video loaders)
     std::list<std::shared_ptr<Node>> _meta_data_nodes;//!< List of nodes where meta data has to be updated after augmentation
-    std::map<Image*, std::shared_ptr<Node>> _image_map;//!< key: image, value : Parent node
-    void * _output_tensor;//!< In the GPU processing case , is used to convert the U8 samples to float32 before they are being transfered back to host
+    std::map<rocalTensor*, std::shared_ptr<Node>> _tensor_map;//!< key: tensor, value : Parent node
 #if ENABLE_HIP
     DeviceManagerHip _device;//!< Keeps the device related constructs needed for running on GPU
 #elif ENABLE_OPENCL
@@ -148,10 +121,7 @@ class MasterGraph
     std::shared_ptr<Graph> _graph = nullptr;
     RocalAffinity _affinity;
     const int _gpu_id;//!< Defines the device id used for processing
-    pLoaderModule _loader_module; //!< Keeps the loader module used to feed the input the images of the graph
-#ifdef ROCAL_VIDEO
-    pVideoLoaderModule _video_loader_module; //!< Keeps the video loader module used to feed the input sequences of the graph
-#endif
+    pLoaderModule _loader_module; //!< Keeps the loader module used to feed the input tensors of the graph
     TimingDBG _convert_time, _process_time, _bencode_time;
     const size_t _user_batch_size;//!< Batch size provided by the user
     vx_context _context;
@@ -162,20 +132,15 @@ class MasterGraph
     bool _first_run = true;
     bool _processing;//!< Indicates if internal processing thread should keep processing or not
     const static unsigned SAMPLE_SIZE = sizeof(unsigned char);
-    int _remaining_count;//!< Keeps the count of remaining images yet to be processed for the user,
-    bool _loop;//!< Indicates if user wants to indefinitely loops through images or not
-    static size_t compute_optimum_internal_batch_size(size_t user_batch_size, RocalAffinity affinity);
-    const size_t _internal_batch_size;//!< In the host processing case , internal batch size can be different than _user_batch_size. This batch size used internally throughout.
-    const size_t _user_to_internal_batch_ratio;
+    int _remaining_count;//!< Keeps the count of remaining tensors yet to be processed for the user
+    bool _loop;//!< Indicates if the user wants to loop indefinitely through tensors or not
     size_t _prefetch_queue_depth;
     bool _output_routine_finished_processing = false;
     const RocalTensorDataType _out_data_type;
     bool _is_random_bbox_crop = false;
-    bool _is_video_loader = false; //!< Set to true if Video Loader is invoked.
     std::vector<std::vector<size_t>> _sequence_start_framenum_vec; //!< Stores the starting frame number of the sequences.
     std::vector<std::vector<std::vector<float>>> _sequence_frame_timestamps_vec; //!< Stores the timestamps of the frames in a sequences.
     size_t _sequence_batch_size = 0; //!< Indicates the _user_batch_size when sequence reader outputs are required
-    size_t _sequence_batch_ratio; //!< Indicates the _user_to_internal_batch_ratio when sequence reader outputs are required
     bool _is_sequence_reader_output = false; //!< Set to true if Sequence Reader is invoked.
     // box encoder variables
     bool _is_box_encoder = false; //bool variable to set the box encoder
@@ -192,23 +157,23 @@ class MasterGraph
 };
 
 template <typename T>
-std::shared_ptr<T> MasterGraph::add_node(const std::vector<Image*> &inputs, const std::vector<Image*> &outputs)
+std::shared_ptr<T> MasterGraph::add_node(const std::vector<rocalTensor*> &inputs, const std::vector<rocalTensor*> &outputs)
 {
     auto node = std::make_shared<T>(inputs, outputs);
     _nodes.push_back(node);
 
     for(auto& input: inputs)
     {
-        if (_image_map.find(input) == _image_map.end())
-            THROW("Input image is invalid, cannot be found among output of previously created nodes")
+        if (_tensor_map.find(input) == _tensor_map.end())
+            THROW("Input tensor is invalid, cannot be found among the outputs of previously created nodes")
 
-        auto parent_node = _image_map.find(input)->second;
+        auto parent_node = _tensor_map.find(input)->second;
         parent_node->add_next(node);
         node->add_previous(parent_node);
     }
 
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
@@ -219,7 +184,7 @@ std::shared_ptr<M> MasterGraph::meta_add_node(std::shared_ptr<T> node)
     auto meta_node = std::make_shared<M>();
     _meta_data_graph->_meta_nodes.push_back(meta_node);
     meta_node->_node = node;
-    meta_node->_batch_size = _is_sequence_reader_output ? _sequence_batch_size : _user_batch_size;
+    meta_node->_batch_size = _user_batch_size;
     return meta_node;
 }
 
@@ -240,11 +205,12 @@ template<> inline std::shared_ptr<ImageLoaderNode> MasterGraph::add_node(const s
     _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+
+template<> inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -257,11 +223,12 @@ template<> inline std::shared_ptr<ImageLoaderSingleShardNode> MasterGraph::add_n
     _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+
+template<> inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -275,12 +242,12 @@ template<> inline std::shared_ptr<FusedJpegCropNode> MasterGraph::add_node(const
     _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -294,7 +261,7 @@ template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add
     _loader_module->set_random_bbox_data_reader(_randombboxcrop_meta_data_reader);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
@@ -302,7 +269,7 @@ template<> inline std::shared_ptr<FusedJpegCropSingleShardNode> MasterGraph::add
 /*
  * Explicit specialization for Cifar10LoaderNode
  */
-template<> inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+template<> inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
     if(_loader_module)
         THROW("A loader already exists, cannot have more than one loader")
@@ -315,47 +282,46 @@ template<> inline std::shared_ptr<Cifar10LoaderNode> MasterGraph::add_node(const
     _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-#ifdef ROCAL_VIDEO
 /*
  * Explicit specialization for VideoLoaderNode
  */
-template<> inline std::shared_ptr<VideoLoaderNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+template<> inline std::shared_ptr<VideoLoaderNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
-    if(_video_loader_module)
-        THROW("A video loader already exists, cannot have more than one loader")
+    if(_loader_module)
+        THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<VideoLoaderNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<VideoLoaderNode>(outputs[0], nullptr);
-#endif
-    _video_loader_module = node->get_loader_module();
-    _video_loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+#endif
+    _loader_module = node->get_loader_module();
+    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-template<> inline std::shared_ptr<VideoLoaderSingleShardNode> MasterGraph::add_node(const std::vector<Image*>& inputs, const std::vector<Image*>& outputs)
+
+template<> inline std::shared_ptr<VideoLoaderSingleShardNode> MasterGraph::add_node(const std::vector<rocalTensor*>& inputs, const std::vector<rocalTensor*>& outputs)
 {
-    if(_video_loader_module)
-        THROW("A video loader already exists, cannot have more than one loader")
+    if(_loader_module)
+        THROW("A loader already exists, cannot have more than one loader")
 #if ENABLE_HIP || ENABLE_OPENCL
     auto node = std::make_shared<VideoLoaderSingleShardNode>(outputs[0], (void *)_device.resources());
 #else
     auto node = std::make_shared<VideoLoaderSingleShardNode>(outputs[0], nullptr);
 #endif
-    _video_loader_module = node->get_loader_module();
-    _video_loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
+    _loader_module = node->get_loader_module();
+    _loader_module->set_prefetch_queue_depth(_prefetch_queue_depth);
     _root_nodes.push_back(node);
     for(auto& output: outputs)
-        _image_map.insert(std::make_pair(output, node));
+        _tensor_map.insert(std::make_pair(output, node));
 
     return node;
 }
-#endif
diff --git a/rocAL/rocAL/include/pipeline/ring_buffer.h b/rocAL/rocAL/include/pipeline/ring_buffer.h
index 77618aac6d..d0148a5074 100644
--- a/rocAL/rocAL/include/pipeline/ring_buffer.h
+++ b/rocAL/rocAL/include/pipeline/ring_buffer.h
@@ -45,10 +45,10 @@ class RingBuffer
     ///\param dev
     ///\param sub_buffer_size
     ///\param sub_buffer_count
-    void init(RocalMemType mem_type, void *dev, unsigned sub_buffer_size, unsigned sub_buffer_count);
+    void init(RocalMemType mem_type, void *dev, std::vector<size_t> &sub_buffer_size);
     void initBoxEncoderMetaData(RocalMemType mem_type, size_t encoded_bbox_size, size_t encoded_labels_size);
     void release_gpu_res();
-    std::vector<void*> get_read_buffers() ;
+    std::vector<void*> get_read_buffers();
     void* get_host_master_read_buffer();
     std::vector<void*> get_write_buffers();
     std::pair<void*, void*> get_box_encode_write_buffers();
@@ -72,13 +72,12 @@ class RingBuffer
     void increment_write_ptr();
     bool full();
     const unsigned BUFF_DEPTH;
-    unsigned _sub_buffer_size;
+    std::vector<size_t> _sub_buffer_size;
     unsigned _sub_buffer_count;
     std::mutex _lock;
     std::condition_variable _wait_for_load;
     std::condition_variable _wait_for_unload;
     std::vector<std::vector<void*>> _dev_sub_buffer;
-    std::vector<void*> _host_master_buffers;
     std::vector<std::vector<void*>> _host_sub_buffers;
     std::vector<void*> _dev_bbox_buffer;
     std::vector<void*> _dev_labels_buffer;
diff --git a/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp b/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp
index 7f663b068e..e65daae795 100644
--- a/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp
+++ b/rocAL/rocAL/source/api/rocal_api_data_loaders.cpp
@@ -301,7 +301,6 @@ rocalSequenceReader(
     // Set sequence batch size and batch ratio in master graph as it varies according to sequence length
     context->master_graph->set_sequence_reader_output();
     context->master_graph->set_sequence_batch_size(sequence_length);
-    context->master_graph->set_sequence_batch_ratio();
     bool decoder_keep_original = true;
 
     // This has been introduced to support variable width and height video frames in future.
@@ -384,7 +383,6 @@ rocalSequenceReaderSingleShard(
     // Set sequence batch size and batch ratio in master graph as it varies according to sequence length
     context->master_graph->set_sequence_reader_output();
     context->master_graph->set_sequence_batch_size(sequence_length);
-    context->master_graph->set_sequence_batch_ratio();
     bool decoder_keep_original = true;
 
     // This has been introduced to support variable width and height video frames in future.
diff --git a/rocAL/rocAL/source/pipeline/master_graph.cpp b/rocAL/rocAL/source/pipeline/master_graph.cpp
index 8938fdc8cd..9579aed9f7 100644
--- a/rocAL/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/rocAL/source/pipeline/master_graph.cpp
@@ -113,7 +113,6 @@ MasterGraph::~MasterGraph()
 MasterGraph::MasterGraph(size_t batch_size, RocalAffinity affinity, int gpu_id, size_t prefetch_queue_depth, RocalTensorDataType output_tensor_data_type):
         _ring_buffer(prefetch_queue_depth),
-        _output_tensor(nullptr),
         _graph(nullptr),
         _affinity(affinity),
         _gpu_id(gpu_id),
@@ -130,8 +129,6 @@ MasterGraph::MasterGraph(size_t batch_size, RocalAffinity affinity, int gpu_id,
 #endif
         _first_run(true),
         _processing(false),
-        _internal_batch_size(compute_optimum_internal_batch_size(batch_size, affinity)),
-        _user_to_internal_batch_ratio (_user_batch_size/_internal_batch_size),
         _prefetch_queue_depth(prefetch_queue_depth),
         _out_data_type(output_tensor_data_type),
 #if ENABLE_HIP
@@ -262,12 +259,12 @@ MasterGraph::create_single_graph()
     _graph = std::make_shared<Graph>(_context, _affinity, 0, _gpu_id);
     for(auto& node: _nodes)
     {
-        // Any image not yet created can be created as virtual image
-        for(auto& image: node->output())
-            if(image->info().type() == ImageInfo::Type::UNKNOWN)
+        // Any tensor not yet created can be created as a virtual tensor
+        for(auto& tensor: node->output())
+            if(tensor->info().type() == rocalTensorInfo::Type::UNKNOWN)
             {
-                image->create_virtual(_context, _graph->get());
-                _internal_images.push_back(image);
+                tensor->create_virtual(_context, _graph->get());
+                _internal_tensors.push_back(tensor);
             }
         node->create(_graph);
     }
@@ -277,20 +274,13 @@ MasterGraph::create_single_graph()
 MasterGraph::Status
 MasterGraph::build()
 {
-    if(_output_images.empty())
-        THROW("No output images are there, cannot create the pipeline")
+    if(_internal_tensor_list.empty())
+        THROW("No output tensors are there, cannot create the pipeline")
 
-    // Verify all output images have the same dimension, otherwise creating a unified tensor from them is not supported
-    _output_image_info = _output_images.front()->info();
-    for(auto&& output_image : _output_images)
-        if(!(output_image->info() == _output_image_info))
-            THROW("Dimension of the output images do not match")
-
-    allocate_output_tensor();
 #if ENABLE_HIP || ENABLE_OPENCL
-    _ring_buffer.init(_mem_type, (void *)_device.resources(), output_byte_size(), _output_images.size());
+    _ring_buffer.init(_mem_type, (void *)_device.resources(), _internal_tensor_list.data_size());
 #else
-    _ring_buffer.init(_mem_type, nullptr, output_byte_size(), _output_images.size());
+    _ring_buffer.init(_mem_type, nullptr, _internal_tensor_list.data_size());
 #endif
     if (_is_box_encoder) _ring_buffer.initBoxEncoderMetaData(_mem_type, _user_batch_size*_num_anchors*4*sizeof(float), _user_batch_size*_num_anchors*sizeof(int));
     create_single_graph();
@@ -320,30 +310,28 @@ MasterGraph::create_tensor(const rocalTensorInfo &info, bool is_output)
     {
         if (output->create_from_handle(_context) != 0)
             THROW("Cannot create the tensor from handle")
         _internal_tensor_list.push_back(output);
-
-        auto *output_ext = new rocalTensor(info);
-        if (output_ext->create_from_handle(_context) != 0)
-            THROW("Cannot create the tensor from handle")
-        _output_tensor_list.push_back(output_ext);
+        _output_tensor_list.push_back(new rocalTensor(info)); // Creating a replica of the output tensor to be returned to the user
     }
 
     return output;
 }
 
 void
-MasterGraph::set_output(Image* output_image)
+MasterGraph::set_output(rocalTensor* output_tensor)
 {
-    if(output_image->is_handle_set() == false)
+    if(output_tensor->is_handle_set() == false)
     {
-        if (output_image->create_from_handle(_context) != 0)
-            THROW("Cannot create the image from handle")
-        _output_images.push_back(output_image);
+        if (output_tensor->create_from_handle(_context) != 0)
+            THROW("Cannot create the tensor from handle")
+
+        _internal_tensor_list.push_back(output_tensor);
+        _output_tensor_list.push_back(new rocalTensor(output_tensor->info())); // Creating a replica of the output tensor to be returned to the user
     }
     else
     {
         // Decoder case only
-        auto actual_output = create_image(output_image->info(), true);
-        add_node<CopyNode>({output_image}, {actual_output});
+        auto actual_output = create_tensor(output_tensor->info(), true);
+        add_node<CopyNode>({output_tensor}, {actual_output});
     }
 }
 
@@ -353,23 +341,16 @@ void MasterGraph::release()
     stop_processing();
     _nodes.clear();
     _root_nodes.clear();
-    _image_map.clear();
+    _tensor_map.clear();
     _ring_buffer.release_gpu_res();
     //shut_down loader:: required for releasing any allocated resourses
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-        _video_loader_module->shut_down();
-    else
-#endif
-        _loader_module->shut_down();
+    _loader_module->shut_down();
     // release all openvx resources.
     vx_status status;
-    for(auto& image: _internal_images)
-        delete image;// It will call the vxReleaseImage internally in the destructor
-    for(auto& image: _output_images)
-        delete image;// It will call the vxReleaseImage internally in the destructor
-    deallocate_output_tensor();
-
+    for(auto& tensor: _internal_tensors)
+        delete tensor; // It will call vxReleaseTensor internally in the destructor
+    _internal_tensor_list.release(); // It will call vxReleaseTensor internally in the destructor for each tensor in the list
+    _output_tensor_list.release(); // It will call vxReleaseTensor internally in the destructor for each tensor in the list
     if(_graph != nullptr)
         _graph->release();
@@ -394,30 +375,6 @@ MasterGraph::update_node_parameters()
     return Status::OK;
 }
 
-size_t
-MasterGraph::augmentation_branch_count()
-{
-    return _output_images.size();
-}
-
-RocalColorFormat
-MasterGraph::output_color_format()
-{
-    return _output_image_info.color_format();
-}
-
-size_t
-MasterGraph::output_width()
-{
-    return _output_image_info.width();
-}
-
-size_t
-MasterGraph::output_height()
-{
-    return _output_image_info.height_batch() * (_is_sequence_reader_output ? _sequence_batch_ratio : _user_to_internal_batch_ratio);
-}
-
 void
 MasterGraph::sequence_start_frame_number(std::vector<size_t> &sequence_start_framenum)
 {
@@ -432,60 +389,6 @@ MasterGraph::sequence_frame_timestamps(std::vector<std::vector<float>> &sequence
     _sequence_frame_timestamps_vec.pop_back();
 }
 
-MasterGraph::Status
-MasterGraph::allocate_output_tensor()
-{
-#if ENABLE_OPENCL
-    if(processing_on_device_ocl())
-    {
-        // creating a float buffer that can accommodates all output images
-        size_t output_buffer_size = output_byte_size() * _output_images.size();
-        cl_int ret = CL_SUCCESS;
-        _output_tensor = nullptr;
-        size_t size = output_buffer_size*sizeof(cl_float);
-        cl_mem clImgFloat = clCreateBuffer(_device.resources()->context,
-                                           CL_MEM_READ_WRITE,
-                                           size,
-                                           nullptr, &ret);
-
-        if (!clImgFloat || ret != CL_SUCCESS)
-            THROW("clCreateBuffer of size " + TOSTR(size) + " failed " + TOSTR(ret))
-
-        _output_tensor = clImgFloat;
-    }
-#elif ENABLE_HIP
-    if (processing_on_device_hip())
-    {
-        // creating a float buffer that can accommodates all output images
-        size_t output_buffer_size = output_byte_size() * _output_images.size();
-        size_t size = (_out_data_type==RocalTensorDataType::FP32)? output_buffer_size*sizeof(float): output_buffer_size*sizeof(half);
-        hipError_t status = hipMalloc( &_output_tensor, size);
-        if (status != hipSuccess || !_output_tensor )
-            THROW("ROCAL::hipMalloc of size " + TOSTR(size) + " failed " + TOSTR(status))
-    }
-#endif
-    return Status::OK;
-}
-
-MasterGraph::Status
-MasterGraph::deallocate_output_tensor()
-{
-#if ENABLE_OPENCL
-    if(processing_on_device_ocl() && _output_tensor != nullptr)
-        clReleaseMemObject((cl_mem)_output_tensor );
-#elif ENABLE_HIP
-    if(processing_on_device_hip() && _output_tensor != nullptr) {
-        hipError_t err = hipFree(_output_tensor );
-        if (err != hipSuccess) {
-            THROW("MasterGraph::deallocate_output_tensor hipFree failed " + TOSTR(err))
-        }
-        _output_tensor = nullptr;
-    }
-#endif
-
-    return Status::OK;
-}
-
 MasterGraph::Status
 MasterGraph::reset()
 {
@@ -495,24 +398,14 @@ MasterGraph::reset()
     if(_output_thread.joinable())
         _output_thread.join();
     _ring_buffer.reset();
+    _sequence_start_framenum_vec.clear();
+    _sequence_frame_timestamps_vec.clear();
     // clearing meta ring buffer
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-    {
-        _video_loader_module->reset();
-        _sequence_start_framenum_vec.clear();
-        _sequence_frame_timestamps_vec.clear();
-    }
-    else
-#endif
-    {
-        // if random_bbox meta reader is used: read again to get different crops
-        if (_randombboxcrop_meta_data_reader != nullptr)
-            _randombboxcrop_meta_data_reader->release();
-        // resetting loader module to start from the beginning of the media and clear it's internal state/buffers
-        _loader_module->reset();
-    }
-
+    // if random_bbox meta reader is used: read again to get different crops
+    if (_randombboxcrop_meta_data_reader != nullptr)
+        _randombboxcrop_meta_data_reader->release();
+    // resetting the loader module to start from the beginning of the media and clear its internal state/buffers
+    _loader_module->reset();
     // restart processing of the images
     _first_run = true;
     _output_routine_finished_processing = false;
@@ -535,428 +428,29 @@ MasterGraph::mem_type()
 Timing
 MasterGraph::timing()
 {
-    Timing t;
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-    {
-        t = _video_loader_module->timing();
-        t.video_process_time += _process_time.get_timing();
-    }
-    else
-#endif
-    {
-        t = _loader_module->timing();
-        t.image_process_time += _process_time.get_timing();
-    }
+    Timing t = _loader_module->timing();
+    t.image_process_time += _process_time.get_timing();
     t.copy_to_output += _convert_time.get_timing();
     t.bb_process_time += _bencode_time.get_timing();
     return t;
 }
 
-#define CHECK_CL_CALL_RET(x) { cl_int ret; ret = x; if( ret != CL_SUCCESS) THROW("ocl call failed "+STR(#x)+" error "+TOSTR(ret)) }
-
-MasterGraph::Status
-MasterGraph::copy_out_tensor(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1,
-                             float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type)
+rocalTensorList *
+MasterGraph::get_output_tensors()
 {
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    if (output_color_format() == RocalColorFormat::RGB_PLANAR)
-        return MasterGraph::copy_out_tensor_planar(out_ptr,format,multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, output_data_type);
-
-    _convert_time.start();
-    // Copies to the output context given by the user
-    unsigned int n = _user_batch_size;
-    const size_t c = output_depth();
-    const size_t h = _output_image_info.height_single();
-    const size_t w = output_width();
-    const size_t single_output_image_size = output_byte_size();
-
-#if ENABLE_OPENCL
-    if(_output_image_info.mem_type() == RocalMemType::OCL)
-    {
-        if(output_data_type == RocalTensorDataType::FP16)
-            THROW("FP16 tensor output for GPU affinity is not implemented")
-        // OCL device memory
-        cl_int status;
-
-        size_t global_work_size = output_sample_size();
-        size_t local_work_size = 256;
-
-        // TODO: Use the runKernel function instead
-
-        auto kernel_name = (format == RocalTensorFormat::NHWC)? "copyInt8ToNHWC" : "copyInt8ToNCHW";
-        cl_kernel kernel = _device["utility"][kernel_name];
-        auto queue = _device.resources()->cmd_queue;
-        unsigned dest_buf_offset = 0;
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        for( auto&& out_image: output_buffers)
-        {
-            int argIdx = 0;
-            unsigned reverse_chnl = reverse_channels ? 1 : 0;
-            auto img_buffer = out_image;
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_mem), (void*)& (img_buffer)))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_mem), (void*)&_output_tensor ))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& dest_buf_offset))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& w))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& h))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& c))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& multiplier0))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& multiplier1))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& multiplier2))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& offset0))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& offset1))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_float), (void*)& offset2))
-            CHECK_CL_CALL_RET(clSetKernelArg( kernel, argIdx++, sizeof(cl_uint), (void*)& reverse_chnl))
-
-            if((status = clEnqueueNDRangeKernel(queue,
-                                                kernel,
-                                                1,
-                                                nullptr,
-                                                &global_work_size,
-                                                &local_work_size,
-                                                0 , nullptr, nullptr)) != CL_SUCCESS)
-                THROW("clEnqueueNDRangeKernel failed on kernel "+STR(kernel_name)+" error " + TOSTR(status))
-            dest_buf_offset += single_output_image_size;
-        }
-
-        int read_size = single_output_image_size*_output_images.size()*sizeof(cl_float);
-        if((status = clEnqueueReadBuffer(queue,
-                                         (cl_mem)_output_tensor,
-                                         CL_TRUE,
-                                         0,
-                                         read_size,
-                                         out_ptr,
-                                         0 , nullptr, nullptr)) != CL_SUCCESS)
-            THROW("clEnqueueReadBuffer failed: " + TOSTR(status))
-    }
-#elif ENABLE_HIP
-    if(_output_image_info.mem_type() == RocalMemType::HIP)
-    {
-        unsigned int fp16 = (output_data_type == RocalTensorDataType::FP16);
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        unsigned dest_buf_offset = 0;
-        // copy hip buffer to out_ptr
-        // todo:: add callback routing to exchange memory pointer to avoid extra copy
-        for( auto&& out_image: output_buffers)
-        {
-            auto img_buffer = out_image;
-            if (format == RocalTensorFormat::NHWC)
-            {
-                HipExecCopyInt8ToNHWC(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
-
-            }else
-            {
-                HipExecCopyInt8ToNCHW(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
-            }
-            dest_buf_offset += single_output_image_size;
-        }
-    }
-#endif
-    if(_output_image_info.mem_type() == RocalMemType::HOST)
-    {
-        float multiplier[3] = {multiplier0, multiplier1, multiplier2 };
-        float offset[3] = {offset0, offset1, offset2 };
-        size_t dest_buf_offset_start = 0;
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        for( auto&& out_image: output_buffers)
-        {
-            unsigned int single_image_size = w * c * h;
-            #pragma omp parallel for num_threads(_internal_batch_size)
-            for(unsigned int batchCount = 0; batchCount < n; batchCount ++)
-            {
-                size_t dest_buf_offset = dest_buf_offset_start + single_image_size*batchCount;
-                auto in_buffer = (unsigned char*)out_image + single_image_size*batchCount;
-
-                if(format == RocalTensorFormat::NHWC)
-                {
-                    if(output_data_type == RocalTensorDataType::FP32)
-                    {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr);
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_32[dest_buf_offset + channel_idx + i * c] =
-                                        offset[channel_idx] + multiplier[channel_idx] *
-                                        (reverse_channels ? (float) (in_buffer[i * c + c - channel_idx - 1])
-                                                          : (float) (in_buffer[i * c + channel_idx]));
-                        }
-                    }
-                    else if(output_data_type == RocalTensorDataType::FP16)
-                    {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr);
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + channel_idx + i * c] =
-                                        offset[channel_idx] + multiplier[channel_idx] *
-                                        (reverse_channels ? (half) (in_buffer[i * c + c - channel_idx - 1])
-                                                          : (half) (in_buffer[i * c + channel_idx]));
-                        }
-                    }
-                }
-                if(format == RocalTensorFormat::NCHW)
-                {
-                    if(output_data_type == RocalTensorDataType::FP32)
-                    {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr);
-                        auto channel_size = w * h;
-                        if(c != 3)
-                        {
-                            for(unsigned i = 0; i < channel_size; i++)
-                                output_tensor_32[dest_buf_offset + i] = offset[0] + multiplier[0]*(float)in_buffer[c*i];
-                        }
-                        else {
-                        #if (ENABLE_SIMD && __AVX2__)
-                            float *B_buf = output_tensor_32 + dest_buf_offset;
-                            float *G_buf = B_buf + channel_size;
-                            float *R_buf = G_buf + channel_size;
-
-                            __m256i mask_B, mask_G, mask_R;
-                            if (reverse_channels) {
-                                mask_B = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_R = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            } else {
-                                mask_R = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_B = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            }
-                            __m256 pmul0 = _mm256_set1_ps(multiplier0);
-                            __m256 pmul1 = _mm256_set1_ps(multiplier1);
-                            __m256 pmul2 = _mm256_set1_ps(multiplier2);
-                            __m256 padd0 = _mm256_set1_ps(offset0);
-                            __m256 padd1 = _mm256_set1_ps(offset1);
-                            __m256 padd2 = _mm256_set1_ps(offset2);
-                            unsigned int alignedLength = (channel_size & ~7); // multiple of 8
-                            unsigned int i = 0;
-
-                            __m256 fR, fG, fB;
-                            for (; i < alignedLength; i += 8) {
-                                __m256i pix0 = _mm256_loadu_si256((const __m256i *) in_buffer);
-                                pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
-                                fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
-                                fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
-                                fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                fB = _mm256_mul_ps(fB, pmul0);
-                                fG = _mm256_mul_ps(fG, pmul1);
-                                fR = _mm256_mul_ps(fR, pmul2);
-                                fB = _mm256_add_ps(fB, padd0);
-                                fG = _mm256_add_ps(fG, padd1);
-                                fR = _mm256_add_ps(fR, padd2);
-                                _mm256_storeu_ps(B_buf, fB);
-                                _mm256_storeu_ps(G_buf, fG);
-                                _mm256_storeu_ps(R_buf, fR);
-                                B_buf += 8;
-                                G_buf += 8;
-                                R_buf += 8;
-                                in_buffer += 24;
-                            }
-                            for (; i < channel_size; i++, in_buffer += 3) {
-                                *B_buf++ = (in_buffer[0] * multiplier0) + offset0;
-                                *G_buf++ = (in_buffer[1] * multiplier1) + offset1;
-                                *R_buf++ = (in_buffer[2] * multiplier2) + offset1;
-                            }
-                        #else
-                            for(unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                                for(unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_32[dest_buf_offset+channel_idx*channel_size + i] =
-                                            offset[channel_idx] + multiplier[channel_idx]*(reverse_channels ? (float)(in_buffer[(c*i+c-channel_idx-1)]) :
-                                                                                                              (float)(in_buffer[(c*i+channel_idx)]));
-                            }
-                        #endif
-                        }
-                    }
-                    else if(output_data_type == RocalTensorDataType::FP16)
-                    {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr);
-                        auto channel_size = w * h;
-                        if(c != 3) {
-                            for(unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + i] = offset[0] + multiplier[0] * (half)in_buffer[c * i];
-                        }
-                        else {
-                        #if (ENABLE_SIMD && __AVX2__)
-                            half *B_buf_16 = output_tensor_16 + dest_buf_offset;
-                            half *G_buf_16 = B_buf_16 + channel_size;
-                            half *R_buf_16 = G_buf_16 + channel_size;
-
-                            __m256i mask_B, mask_G, mask_R;
-                            if (reverse_channels) {
-                                mask_B = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_R = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            } else {
-                                mask_R = _mm256_setr_epi32(0x80808000, 0x80808003, 0x80808006, 0x80808009, 0x80808000,
-                                                           0x80808003, 0x80808006, 0x80808009);
-                                mask_G = _mm256_setr_epi32(0x80808001, 0x80808004, 0x80808007, 0x8080800A, 0x80808001,
-                                                           0x80808004, 0x80808007, 0x8080800A);
-                                mask_B = _mm256_setr_epi32(0x80808002, 0x80808005, 0x80808008, 0x8080800B, 0x80808002,
-                                                           0x80808005, 0x80808008, 0x8080800B);
-                            }
-                            __m256 pmul0 = _mm256_set1_ps(multiplier0);
-                            __m256 pmul1 = _mm256_set1_ps(multiplier1);
-                            __m256 pmul2 = _mm256_set1_ps(multiplier2);
-                            __m256 padd0 = _mm256_set1_ps(offset0);
-                            __m256 padd1 = _mm256_set1_ps(offset1);
-                            __m256 padd2 = _mm256_set1_ps(offset2);
-                            unsigned int alignedLength = (channel_size & ~7); // multiple of 8
-                            unsigned int i = 0;
-
-                            __m256 fR, fG, fB;
-                            __m128i tempR, tempG, tempB;
-                            for (; i < alignedLength; i += 8) {
-                                __m256i pix0 = _mm256_loadu_si256((const __m256i *) in_buffer);
-                                pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
-                                fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
-                                fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
-                                fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                fB = _mm256_fmadd_ps(fB, pmul0, padd0);
-                                fG = _mm256_fmadd_ps(fG, pmul1, padd1);
-                                fR = _mm256_fmadd_ps(fR, pmul2, padd2);
-                                tempB = _mm256_cvtps_ph(fB, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                tempG = _mm256_cvtps_ph(fG, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                tempR = _mm256_cvtps_ph(fR, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                _mm_storeu_si128((__m128i *)B_buf_16, tempB);
-                                _mm_storeu_si128((__m128i *)G_buf_16, tempG);
-                                _mm_storeu_si128((__m128i *)R_buf_16, tempR);
-                                B_buf_16 += 8;
-                                G_buf_16 += 8;
-                                R_buf_16 += 8;
-                                in_buffer += 24;
-                            }
-                            for (; i < channel_size; i++, in_buffer += 3) {
-                                *B_buf_16++ = (half) (in_buffer[0] * multiplier0) + offset0;
-                                *G_buf_16++ = (half) (in_buffer[1] * multiplier1) + offset1;
-                                *R_buf_16++ = (half) (in_buffer[2] * multiplier2) + offset2;
-                            }
-                        #else
-                            for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
-                                for (unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_16[dest_buf_offset + channel_idx * channel_size + i] =
-                                            offset[channel_idx] + multiplier[channel_idx] *
-                                            (reverse_channels ? (half) (in_buffer[(c * i + c - channel_idx - 1)])
-                                                              : (half) (in_buffer[(c * i + channel_idx)]));
-                            }
-                        #endif
-                        }
-                    }
-                } // NCHW or NHWC
-            } // for loop batch
-
-            dest_buf_offset_start += single_output_image_size;
-        }
-    }
-    _convert_time.end();
-    return Status::OK;
-}
-
-MasterGraph::Status
-MasterGraph::copy_output(unsigned char *out_ptr, size_t out_size_in_bytes)
-{
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    // Copies to the output context given by the user
-    size_t size = output_byte_size();
-    if (out_size_in_bytes != (size *_output_images.size()))
-        return MasterGraph::Status::INVALID_ARGUMENTS;
-
-    _convert_time.start();
-
-#if ENABLE_OPENCL
-    if(processing_on_device_ocl())
-    {
-        size_t dest_buf_offset = 0;
-        //NOTE: the CL_TRUE flag is only used on the last buffer read call,
-        // to avoid unnecessary sequence of synchronizations
-
-        // get_read_buffers() calls block_if_empty() internally and blocks if buffers are empty until a new batch is processed
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        auto out_image_idx = output_buffers.size();
-        for( auto&& output_handle: output_buffers)
-        {
-            bool sync_flag = (--out_image_idx == 0) ? CL_TRUE : CL_FALSE;
-            cl_int status;
-            if((status = clEnqueueReadBuffer(_device.resources()->cmd_queue,
-                                             (cl_mem) output_handle,
-                                             sync_flag?(CL_TRUE):CL_FALSE,
-                                             0,
-                                             size,
-                                             out_ptr+dest_buf_offset,
-                                             0 , nullptr, nullptr)) != CL_SUCCESS)
-                THROW("clEnqueueReadBuffer failed: " + TOSTR(status))
-            dest_buf_offset += size;
-        }
-    }
-    else {
-#elif ENABLE_HIP
-    if(processing_on_device_hip())
-    {
-        //NOTE: the CL_TRUE flag is only used on the last buffer read call,
-        // to avoid unnecessary sequence of synchronizations
-
-        // get_read_buffers() calls block_if_empty() internally and blocks if buffers are empty until a new batch is processed
-        size_t dest_buf_offset = 0;
-        auto output_buffers =_ring_buffer.get_read_buffers();
-        for( auto&& output_handle: output_buffers)
-        {
-            hipError_t err = hipMemcpyDtoHAsync((void *)(out_ptr+dest_buf_offset), output_handle, size, _device.resources()->hip_stream);
-            if (err) {
-                THROW("hipMemcpyDtoHAsync failed: " + TOSTR(err))
-            }
-            dest_buf_offset += size;
-        }
-        // sync to finish copy
-        if (hipStreamSynchronize(_device.resources()->hip_stream) != hipSuccess)
-            THROW("hipStreamSynchronize failed for hipMemcpy ")
-
-    }
-    else {
-#endif
-        // get_host_master_read_buffer is blocking if _ring_buffer is empty, and blocks this thread till internal processing thread process a new batch and store in the _ring_buffer
-        memcpy(out_ptr, _ring_buffer.get_host_master_read_buffer(), size * _output_images.size());
-#if ENABLE_OPENCL || ENABLE_HIP
-    }
-#endif
-    _convert_time.end();
-    return Status::OK;
+    auto output_ptr = _ring_buffer.get_read_buffers();
+    for(unsigned i = 0; i < _internal_tensor_list.size(); i++)
+        _output_tensor_list[i]->set_mem_handle(output_ptr[i]);
+
+    return &_output_tensor_list;
 }
 
 void MasterGraph::output_routine()
 {
     INFO("Output routine started with "+TOSTR(_remaining_count) + " to load");
-    size_t batch_ratio = _is_sequence_reader_output ? _sequence_batch_ratio : _user_to_internal_batch_ratio;
-    if(!_is_sequence_reader_output) // _sequence_batch_ratio and _user_to_internal_batch_ratio is different. Will be removed in TensorSupport.
-    {
-#if !ENABLE_HIP
-        if(processing_on_device_ocl() && batch_ratio != 1)
-            THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#else
-        if(processing_on_device_hip() && batch_ratio != 1)
-            THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#endif
-    }
     try {
         while (_processing)
         {
-            const size_t each_cycle_size = output_byte_size()/batch_ratio;
-
             ImageNameBatch full_batch_image_names = {};
             pMetaDataBatch full_batch_meta_data = nullptr;
             pMetaDataBatch augmented_batch_meta_data = nullptr;
@@ -975,76 +469,62 @@ void MasterGraph::output_routine()
             auto write_buffers = _ring_buffer.get_write_buffers();
             _rb_block_if_full_time.end();
 
-            _process_time.start();
-            // When executing on CPU the internal batch count can be smaller than the user batch count
-            // In that case the user_batch_size will be an integer multiple of the _internal_batch_size
-            // Multiple cycles worth of internal_batch_size images should be processed to complete a full _user_batch_size
-            for(unsigned cycle_idx = 0; cycle_idx< batch_ratio; cycle_idx++)
-            {
-                // Swap handles on the input image, so that new image is loaded to be processed
-                auto load_ret = _loader_module->load_next();
-                if (load_ret != LoaderModuleStatus::OK)
-                    THROW("Loader module failed to load next batch of images, status " + TOSTR(load_ret))
+            // Swap handles on the input tensor, so that a new tensor is loaded to be processed
+            auto load_ret = _loader_module->load_next();
+            if (load_ret != LoaderModuleStatus::OK)
+                THROW("Loader module failed to load next batch of images, status " + TOSTR(load_ret))
+            if (!_processing)
+                break;
+            auto this_cycle_names = _loader_module->get_id();
+            auto decode_image_info = _loader_module->get_decode_image_info();
+            auto crop_image_info = _loader_module->get_crop_image_info();
 
-                if (!_processing)
-                    break;
-                auto this_cycle_names = _loader_module->get_id();
-                auto decode_image_info = _loader_module->get_decode_image_info();
-                auto crop_image_info = _loader_module->get_crop_image_info();
+            if(this_cycle_names.size() != _user_batch_size)
+                WRN("Internal problem: names count "+ TOSTR(this_cycle_names.size()))
 
-                if(this_cycle_names.size() != _internal_batch_size)
-                    WRN("Internal problem: names count "+ TOSTR(this_cycle_names.size()))
+            // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
+            if (_meta_data_reader)
+                _meta_data_reader->lookup(this_cycle_names);
 
-                // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
-                if (_meta_data_reader)
-                    _meta_data_reader->lookup(this_cycle_names);
+            full_batch_image_names += this_cycle_names;
 
-                full_batch_image_names += this_cycle_names;
+            if (!_processing)
+                break;
 
-                if (!_processing)
-                    break;
+            // Swap handles on the output tensors, so that the newly processed tensor will be written to a new buffer
+            for (size_t idx = 0; idx < _internal_tensor_list.size(); idx++)
+                _internal_tensor_list[idx]->swap_handle(write_buffers[idx]);
 
-                // Swap handles on the output images, so that new processed image will be written to the a new buffer
-                for (size_t idx = 0; idx < _output_images.size(); idx++)
-                {
-                    if(_affinity == RocalAffinity::GPU)
-                        _output_images[idx]->swap_handle(write_buffers[idx]);
-                    else
-                    {
-                        auto this_cycle_buffer_ptr = (unsigned char *) write_buffers[idx] + each_cycle_size * cycle_idx;
-                        _output_images[idx]->swap_handle(this_cycle_buffer_ptr);
-                    }
-                }
-
-                if (!_processing)
-                    break;
+            if (!_processing)
+                break;
 
-                for(auto node: _nodes)
+            for(auto node: _nodes)
+            {
+                if(node->_is_ssd)
                 {
-                    if(node->_is_ssd)
-                    {
-                        node->set_meta_data(_augmented_meta_data);
-                    }
+                    node->set_meta_data(_augmented_meta_data);
                 }
+            }
 
-                update_node_parameters();
-                if(_augmented_meta_data)
+            update_node_parameters();
+            if(_augmented_meta_data)
+            {
+                if (_meta_data_graph)
                 {
-                    if (_meta_data_graph)
+                    if(_is_random_bbox_crop)
                     {
-                        if(_is_random_bbox_crop)
-                        {
-                            _meta_data_graph->update_random_bbox_meta_data(_augmented_meta_data, decode_image_info, crop_image_info);
-                        }
-                        _meta_data_graph->process(_augmented_meta_data);
+                        _meta_data_graph->update_random_bbox_meta_data(_augmented_meta_data, decode_image_info, crop_image_info);
                     }
-                    if (full_batch_meta_data)
-                        full_batch_meta_data->concatenate(_augmented_meta_data);
-                    else
-                        full_batch_meta_data = _augmented_meta_data->clone();
+                    _meta_data_graph->process(_augmented_meta_data);
                 }
-                _graph->process();
+                if (full_batch_meta_data)
+                    full_batch_meta_data->concatenate(_augmented_meta_data);
+                else
+                    full_batch_meta_data = _augmented_meta_data->clone();
             }
+            _process_time.start();
+            _graph->process();
+            _process_time.end();
             _bencode_time.start();
             if(_is_box_encoder )
             {
@@ -1062,126 +542,6 @@
             _ring_buffer.set_meta_data(full_batch_image_names, full_batch_meta_data);
             _ring_buffer.push(); // Image data and metadata is now stored in output the ring_buffer, increases it's level by 1
         }
-        _process_time.end();
-
     }
     catch (const std::exception &e)
     {
         ERR("Exception thrown in the process routine: " + STR(e.what()) + STR("\n"));
         _processing = false;
         _ring_buffer.release_all_blocked_calls();
     }
 }
 
-#ifdef ROCAL_VIDEO
-void MasterGraph::output_routine_video()
-{
-    _process_time.start();
-    INFO("Output routine of video pipeline started with "+TOSTR(_remaining_count) + " to load");
-#if !ENABLE_HIP
-    if(processing_on_device_ocl() && _user_to_internal_batch_ratio != 1)
-        THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#else
-    if(processing_on_device_hip() && _user_to_internal_batch_ratio != 1)
-        THROW("Internal failure, in the GPU processing case, user and input batch size must be equal")
-#endif
-    try {
-        while (_processing)
-        {
-            const size_t each_cycle_size = output_byte_size()/_user_to_internal_batch_ratio;
-
-            ImageNameBatch full_batch_image_names = {};
-            pMetaDataBatch full_batch_meta_data = nullptr;
-            pMetaDataBatch augmented_batch_meta_data = nullptr;
-            if (_video_loader_module->remaining_count() < _user_batch_size)
-            {
-                // If the internal process routine ,output_routine_video(), has finished processing all the images, and last
-                // processed images stored in the _ring_buffer will be consumed by the user when it calls the run() func
-                notify_user_thread();
-                // the following call is required in case the ring buffer is waiting for more data to be loaded and there is no more data to process.
-                _ring_buffer.release_if_empty();
-                std::this_thread::sleep_for(std::chrono::milliseconds(100));
-                continue;
-            }
-
-            // _ring_buffer.get_write_buffers() is blocking and blocks here until user uses processed image by calling run() and frees space in the ring_buffer
-            _rb_block_if_full_time.start();
-            auto write_buffers = _ring_buffer.get_write_buffers();
-            _rb_block_if_full_time.end();
-
-            // When executing on CPU the internal batch count can be smaller than the user batch count
-            // In that case the user_batch_size will be an integer multiple of the _internal_batch_size
-            // Multiple cycles worth of internal_batch_size images should be processed to complete a full _user_batch_size
-            for(unsigned cycle_idx = 0; cycle_idx< _user_to_internal_batch_ratio; cycle_idx++)
-            {
-                // Swap handles on the input sequence, so that new sequence is loaded to be processed
-                auto load_ret = _video_loader_module->load_next();
-                if (load_ret != VideoLoaderModuleStatus::OK)
-                    THROW("Video Loader module failed to load next batch of images, status " + TOSTR(load_ret))
-
-                if (!_processing)
-                    break;
-                auto this_cycle_names = _video_loader_module->get_id();
-                auto decode_image_info = _video_loader_module->get_decode_image_info();
-                _sequence_start_framenum_vec.insert(_sequence_start_framenum_vec.begin(), _video_loader_module->get_sequence_start_frame_number());
-                _sequence_frame_timestamps_vec.insert(_sequence_frame_timestamps_vec.begin(), _video_loader_module->get_sequence_frame_timestamps());
-
-                if(this_cycle_names.size() != _internal_batch_size)
-                    WRN("Internal problem: names count "+ TOSTR(this_cycle_names.size()))
-
-                // meta_data lookup is done before _meta_data_graph->process() is called to have the new meta_data ready for processing
-                if (_meta_data_reader)
-                    _meta_data_reader->lookup(this_cycle_names);
-
-                full_batch_image_names += this_cycle_names;
-
-                if (!_processing)
-                    break;
-
-                // Swap handles on the output images, so that new processed image will be written to the a new buffer
-                for (size_t idx = 0; idx < _output_images.size(); idx++)
-                {
-                    if(_affinity == RocalAffinity::GPU)
-                        _output_images[idx]->swap_handle(write_buffers[idx]);
-                    else
-                    {
-                        auto this_cycle_buffer_ptr = (unsigned char *) write_buffers[idx] + each_cycle_size * cycle_idx;
-                        _output_images[idx]->swap_handle(this_cycle_buffer_ptr);
-                    }
-                }
-
-                if (!_processing)
-                    break;
-
-                for(auto node: _nodes)
-                {
-                    if(node->_is_ssd)
-                    {
-                        node->set_meta_data(_augmented_meta_data);
-                    }
-                }
-
-                update_node_parameters();
-                if(_augmented_meta_data)
-                {
-                    if (_meta_data_graph)
-                    {
-                        _meta_data_graph->process(_augmented_meta_data);
-                    }
-                    if (full_batch_meta_data)
-                        full_batch_meta_data->concatenate(_augmented_meta_data);
-                    else
-                        full_batch_meta_data = _augmented_meta_data->clone();
-                }
-                _graph->process();
-            }
-            if(_is_box_encoder )
-            {
-                _meta_data_graph->update_box_encoder_meta_data(&_anchors, full_batch_meta_data, _criteria, _offset, _scale, _means, _stds);
-            }
-            _ring_buffer.set_meta_data(full_batch_image_names, full_batch_meta_data);
-            _ring_buffer.push(); // Image data and metadata is now stored in output the ring_buffer, increases it's level by 1
-        }
-    }
     catch (const std::exception &e)
     {
@@ -1189,25 +549,13 @@ void MasterGraph::output_routine_video()
         ERR("Exception thrown in the process routine: " + STR(e.what()) + STR("\n"));
         _processing = false;
         _ring_buffer.release_all_blocked_calls();
     }
-    _process_time.end();
 }
-#endif
 
 void MasterGraph::start_processing()
 {
     _processing = true;
-#ifdef ROCAL_VIDEO
-    if(_is_video_loader)
-    {
-        _remaining_count = _video_loader_module->remaining_count();
-        _output_thread = std::thread(&MasterGraph::output_routine_video, this);
-    }
-    else
-#endif
-    {
-        _remaining_count = _loader_module->remaining_count();
-        _output_thread = std::thread(&MasterGraph::output_routine, this);
-    }
+    _remaining_count = _loader_module->remaining_count();
+    _output_thread = std::thread(&MasterGraph::output_routine, this);
 #if defined(WIN32) || defined(_WIN32) || defined(__WIN32) && !defined(__CYGWIN__)
 #else
     // Changing thread scheduling policy and it's priority does not help on latest Ubuntu builds
@@ -1422,64 +770,6 @@ size_t MasterGraph::bounding_box_batch_count(int *buf, pMetaDataBatch meta_data_
     return size;
 }
 
-size_t MasterGraph::compute_optimum_internal_batch_size(size_t user_batch_size, RocalAffinity affinity)
-{
-    const unsigned MINIMUM_CPU_THREAD_COUNT = 2;
-    const unsigned DEFAULT_SMT_COUNT = 2;
-
-
-    if(affinity == RocalAffinity::GPU)
-        return user_batch_size;
-
-    unsigned THREAD_COUNT = std::thread::hardware_concurrency();
-    if(THREAD_COUNT >= MINIMUM_CPU_THREAD_COUNT)
-    {
-        INFO("Can run " + TOSTR(THREAD_COUNT) + " threads simultaneously on this machine")
-    }
-    else
-    {
-        THREAD_COUNT = MINIMUM_CPU_THREAD_COUNT;
-        WRN("hardware_concurrency() call failed assuming can run " + TOSTR(THREAD_COUNT) + " threads")
-    }
-    size_t ret = user_batch_size;
-    size_t CORE_COUNT = THREAD_COUNT / DEFAULT_SMT_COUNT;
-
-    if(CORE_COUNT <= 0)
-        THROW("Wrong core count detected less than 0")
-
-    for( size_t i = CORE_COUNT; i <= THREAD_COUNT; i++)
-        if(user_batch_size % i == 0)
-        {
-            ret = i;
-            break;
-        }
-
-    for(size_t i = CORE_COUNT; i > 1; i--)
-        if(user_batch_size % i == 0)
-        {
-            ret = i;
-            break;
-        }
-    INFO("User batch size "+ TOSTR(user_batch_size)+" Internal batch size set to "+ TOSTR(ret))
-    return ret;
-}
-
-size_t MasterGraph::output_sample_size()
-{
-    return output_height() * output_width() * output_depth();
-}
-
-size_t MasterGraph::output_byte_size()
-{
-    return output_height() * output_width() * output_depth() * SAMPLE_SIZE;
-}
-
-size_t MasterGraph::output_depth()
-{
-    return _output_image_info.color_plane_count();
-}
-
 void MasterGraph::notify_user_thread()
 {
     if(_output_routine_finished_processing)
@@ -1493,163 +783,6 @@ bool MasterGraph::no_more_processed_data()
     return (_output_routine_finished_processing && _ring_buffer.empty());
 }
 
-MasterGraph::Status
-MasterGraph::copy_out_tensor_planar(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1,
-                                    float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type)
-{
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    _convert_time.start();
-    // Copies to the output context given by the user, each image is copied separate for planar
-    const size_t w = output_width();
-    const size_t h = _output_image_info.height_single();
-    const size_t c = output_depth();
-    const size_t n = _output_image_info.batch_size();
-
-    const size_t single_output_image_size = output_byte_size();
-
-
-    if(_output_image_info.mem_type() == RocalMemType::OCL || _output_image_info.mem_type() == RocalMemType::HIP)
-    {
-        THROW("copy_out_tensor_planar for GPU affinity is not implemented")
-    }
-    if(_output_image_info.mem_type() == RocalMemType::HOST)
-    {
-        float multiplier[3] = {multiplier0, multiplier1, multiplier2 };
-        float offset[3] = {offset0, offset1, offset2 };
-        size_t dest_buf_offset = 0;
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-
-        for( auto&& out_image: output_buffers)
-        {
-            for (unsigned batch = 0; batch < n ; batch++) {
-
-MasterGraph::Status
-MasterGraph::copy_out_tensor_planar(void *out_ptr, RocalTensorFormat format, float multiplier0, float multiplier1,
-                                    float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type)
-{
-    if(no_more_processed_data())
-        return MasterGraph::Status::NO_MORE_DATA;
-
-    _convert_time.start();
-    // Copies to the output context given by the user, each image is copied separate for planar
-    const size_t w = output_width();
-    const size_t h = _output_image_info.height_single();
-    const size_t c = output_depth();
-    const size_t n = _output_image_info.batch_size();
-
-    const size_t single_output_image_size = output_byte_size();
-
-
-    if(_output_image_info.mem_type() == RocalMemType::OCL || _output_image_info.mem_type() == RocalMemType::HIP)
-    {
-        THROW("copy_out_tensor_planar for GPU affinity is not implemented")
-    }
-    if(_output_image_info.mem_type() == RocalMemType::HOST)
-    {
-        float multiplier[3] = {multiplier0, multiplier1, multiplier2 };
-        float offset[3] = {offset0, offset1, offset2 };
-        size_t dest_buf_offset = 0;
-
-        auto output_buffers =_ring_buffer.get_read_buffers();
-
-        for( auto&& out_image: output_buffers)
-        {
-            for (unsigned batch = 0; batch < n ; batch++) {
-                const size_t batch_offset = w*h*c*batch;
-                auto in_buffer = (unsigned char *) out_image + batch_offset;
-                if (format == RocalTensorFormat::NHWC) {
-                    if (output_data_type == RocalTensorDataType::FP32) {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr) + batch_offset;
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_32[dest_buf_offset + channel_idx + i * c] =
-                                    offset[channel_idx] + multiplier[channel_idx] *
-                                    (reverse_channels ? (float) (in_buffer[i + (c - channel_idx - 1) * channel_size])
-                                                      : (float) (in_buffer[i + channel_idx * channel_size]));
-                    } else if (output_data_type == RocalTensorDataType::FP16) {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr) + batch_offset;
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + channel_idx + i * c] =
-                                    offset[channel_idx] + multiplier[channel_idx] *
-                                    (reverse_channels ? (half) (in_buffer[(c - channel_idx - 1) * channel_size + i])
-                                                      : (half) (in_buffer[channel_idx * channel_size + i]));
-                    }
-                }
-                if (format == RocalTensorFormat::NCHW) {
-                    if (output_data_type == RocalTensorDataType::FP32) {
-                        float *output_tensor_32 = static_cast<float *>(out_ptr) + batch_offset;
-                        //output_tensor_32 += batch_offset;
-                        auto channel_size = w * h;
-                        if (c != 3) {
-                            for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                                for (unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_32[dest_buf_offset + channel_idx * channel_size + i] =
-                                        offset[channel_idx] + multiplier[channel_idx] *
-                                        (reverse_channels ? (float) (in_buffer[(c - channel_idx - 1) * channel_size + i])
-                                                          : (float) (in_buffer[channel_idx * channel_size + i]));
-                        } else {
-#if (ENABLE_SIMD && __AVX2__)
-
-                            float *B_buf = output_tensor_32 + dest_buf_offset;
-                            float *G_buf = B_buf + channel_size;
-                            float *R_buf = G_buf + channel_size;
-                            unsigned char *in_buffer_R = in_buffer;
-                            unsigned char *in_buffer_G = in_buffer + channel_size;
-                            unsigned char *in_buffer_B = in_buffer_G + channel_size;
-
-                            __m256 pmul0 = _mm256_set1_ps(multiplier0);
-                            __m256 pmul1 = _mm256_set1_ps(multiplier1);
-                            __m256 pmul2 = _mm256_set1_ps(multiplier2);
-                            __m256 padd0 = _mm256_set1_ps(offset0);
-                            __m256 padd1 = _mm256_set1_ps(offset1);
-                            __m256 padd2 = _mm256_set1_ps(offset2);
-                            unsigned int alignedLength = (channel_size & ~7);    // multiple of 8
-                            unsigned int i = 0;
-
-                            __m256 fR, fG, fB;
-                            for (; i < alignedLength; i += 8) {
-                                __m128i pixR, pixG, pixB;
-                                if (reverse_channels) {
-                                    pixB = _mm_loadl_epi64((const __m128i *) in_buffer_R);
-                                    pixG = _mm_loadl_epi64((const __m128i *) in_buffer_G);
-                                    pixR = _mm_loadl_epi64((const __m128i *) in_buffer_B);
-                                } else {
-                                    pixR = _mm_loadl_epi64((const __m128i *) in_buffer_R);
-                                    pixG = _mm_loadl_epi64((const __m128i *) in_buffer_G);
-                                    pixB = _mm_loadl_epi64((const __m128i *) in_buffer_B);
-                                }
-                                fB = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pixR));
-                                fG = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pixG));
-                                fR = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pixB));
-                                fB = _mm256_mul_ps(fB, pmul0);
-                                fG = _mm256_mul_ps(fG, pmul1);
-                                fR = _mm256_mul_ps(fR, pmul2);
-                                fB = _mm256_add_ps(fB, padd0);
-                                fG = _mm256_add_ps(fG, padd1);
-                                fR = _mm256_add_ps(fR, padd2);
-                                _mm256_storeu_ps(B_buf, fB);
-                                _mm256_storeu_ps(G_buf, fG);
-                                _mm256_storeu_ps(R_buf, fR);
-                                B_buf += 8;
-                                G_buf += 8;
-                                R_buf += 8;
-                                in_buffer_R += 8, in_buffer_G += 8, in_buffer_B += 8;
-                            }
-                            for (; i < channel_size; i++) {
-                                *B_buf++ = (*in_buffer_R++ * multiplier0) + offset0;
-                                *G_buf++ = (*in_buffer_G++ * multiplier1) + offset1;
-                                *R_buf++ = (*in_buffer_B++ * multiplier2) + offset1;
-                            }
-
-#else
-                            for(unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                                for(unsigned i = 0; i < channel_size; i++)
-                                    output_tensor_32[dest_buf_offset+channel_idx*channel_size + i] =
-                                        offset[channel_idx] + multiplier[channel_idx]*(reverse_channels ? (float)(in_buffer[i+(c-channel_idx-1)*channel_size]) : (float)(in_buffer[i+channel_idx*channel_size]));
-#endif
-                        }
-                    } else if (output_data_type == RocalTensorDataType::FP16) {
-                        half *output_tensor_16 = static_cast<half *>(out_ptr) + batch_offset;
-                        auto channel_size = w * h;
-                        for (unsigned channel_idx = 0; channel_idx < c; channel_idx++)
-                            for (unsigned i = 0; i < channel_size; i++)
-                                output_tensor_16[dest_buf_offset + channel_idx * channel_size + i] =
-                                    offset[channel_idx] + multiplier[channel_idx] *
-                                    (reverse_channels ? (half) (in_buffer[i + (c - channel_idx - 1) * channel_size])
-                                                      : (half) (in_buffer[i + channel_idx * channel_size]));
-                    }
-                }
-            }
-            dest_buf_offset += single_output_image_size;
-        }
-    }
-    _convert_time.end();
-    return Status::OK;
-}
 
 MasterGraph::Status
 MasterGraph::get_bbox_encoded_buffers(float **boxes_buf_ptr, int **labels_buf_ptr, size_t num_encoded_boxes)
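The AVX2 block removed above is a planar uint8-to-float normalize: load 8 bytes per colour plane, widen u8 to i32 to f32 with _mm256_cvtepu8_epi32 and _mm256_cvtepi32_ps, apply out = in * multiplier + offset, and finish the remainder in scalar code. (The deleted scalar tail added offset1 instead of offset2 to the third plane, which looks like a copy-paste slip; it disappears along with the function.) A single-plane sketch of the same technique, with an illustrative function name, built with -mavx2:

    #include <immintrin.h>
    #include <cstddef>

    void u8_plane_to_f32(const unsigned char* in, float* out, size_t n,
                         float mul, float off) {
        const __m256 vmul = _mm256_set1_ps(mul);
        const __m256 voff = _mm256_set1_ps(off);
        size_t i = 0;
        for (; i + 8 <= n; i += 8) {
            // Load 8 bytes, widen u8 -> i32 -> f32, then scale and offset.
            __m128i pix = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(in + i));
            __m256 f = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(pix));
            _mm256_storeu_ps(out + i, _mm256_add_ps(_mm256_mul_ps(f, vmul), voff));
        }
        for (; i < n; i++)               // scalar tail for the last n % 8 pixels
            out[i] = in[i] * mul + off;
    }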
of size " + TOSTR(sub_buffer_size) + " index " + TOSTR(sub_idx) + + THROW("clCreateBuffer of size " + TOSTR(_sub_buffer_size[sub_idx]) + " index " + TOSTR(sub_idx) + " failed " + TOSTR(err)); } @@ -159,7 +151,7 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, unsigned sub_buffer_s } } - else + else { #elif ENABLE_HIP DeviceResourcesHip *dev_hip = static_cast(_dev); @@ -172,33 +164,30 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, unsigned sub_buffer_s for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) { - _dev_sub_buffer[buffIdx].resize(_sub_buffer_count); - for(unsigned sub_idx = 0; sub_idx < _sub_buffer_count; sub_idx++) + _dev_sub_buffer[buffIdx].resize(sub_buffer_count); + for(unsigned sub_idx = 0; sub_idx < sub_buffer_count; sub_idx++) { - hipError_t err = hipMalloc(&_dev_sub_buffer[buffIdx][sub_idx], sub_buffer_size); - //printf("allocated HIP device buffer <%d, %d, %d, %p>\n", buffIdx, sub_idx, sub_buffer_size, _dev_sub_buffer[buffIdx][sub_idx]); + hipError_t err = hipMalloc(&_dev_sub_buffer[buffIdx][sub_idx], _sub_buffer_size[sub_idx]); + //printf("allocated HIP device buffer <%d, %d, %d, %p>\n", buffIdx, sub_idx, _sub_buffer_size[sub_idx], _dev_sub_buffer[buffIdx][sub_idx]); if(err != hipSuccess) { _dev_sub_buffer.clear(); - THROW("hipMalloc of size " + TOSTR(sub_buffer_size) + " index " + TOSTR(sub_idx) + + THROW("hipMalloc of size " + TOSTR(_sub_buffer_size[sub_idx]) + " index " + TOSTR(sub_idx) + " failed " + TOSTR(err)); } } } } - else + else { #endif - _host_sub_buffers.resize(BUFF_DEPTH); for(size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) { - const size_t master_buffer_size = sub_buffer_size * sub_buffer_count; // a minimum of extra MEM_ALIGNMENT is allocated - _host_master_buffers[buffIdx] = aligned_alloc(MEM_ALIGNMENT, MEM_ALIGNMENT * (master_buffer_size / MEM_ALIGNMENT + 1)); - _host_sub_buffers[buffIdx].resize(_sub_buffer_count); - for(size_t sub_buff_idx = 0; sub_buff_idx < _sub_buffer_count; sub_buff_idx++) - _host_sub_buffers[buffIdx][sub_buff_idx] = (unsigned char*)_host_master_buffers[buffIdx] + _sub_buffer_size * sub_buff_idx; + _host_sub_buffers[buffIdx].resize(sub_buffer_count); + for(size_t sub_buff_idx = 0; sub_buff_idx < sub_buffer_count; sub_buff_idx++) + _host_sub_buffers[buffIdx][sub_buff_idx] = aligned_alloc(MEM_ALIGNMENT, MEM_ALIGNMENT * (_sub_buffer_size[sub_buff_idx] / MEM_ALIGNMENT + 1)); } #if ENABLE_OPENCL || ENABLE_HIP } @@ -319,12 +308,10 @@ void RingBuffer::release_gpu_res() RingBuffer::~RingBuffer() { if (_mem_type == RocalMemType::HOST) { - for (unsigned idx = 0; idx < _host_master_buffers.size(); idx++) - if (_host_master_buffers[idx]) { - free(_host_master_buffers[idx]); - } - - _host_master_buffers.clear(); + for (unsigned buffIdx = 0; buffIdx < _host_sub_buffers.size(); buffIdx++) + for (unsigned sub_buf_idx = 0; sub_buf_idx < _host_sub_buffers[buffIdx].size(); sub_buf_idx++) + if (_host_sub_buffers[buffIdx][sub_buf_idx]) + free(_host_sub_buffers[buffIdx][sub_buf_idx]); _host_sub_buffers.clear(); } } diff --git a/rocAL/rocAL/source/pipeline/tensor.cpp b/rocAL/rocAL/source/pipeline/tensor.cpp index 86838653ac..95e5e25122 100644 --- a/rocAL/rocAL/source/pipeline/tensor.cpp +++ b/rocAL/rocAL/source/pipeline/tensor.cpp @@ -316,7 +316,7 @@ unsigned rocalTensor::copy_data(hipStream_t stream, void *host_memory, bool sync #endif unsigned rocalTensor::copy_data(void *user_buffer) { - if (_info._type != rocalTensorInfo::Type::HANDLE) return 0; + if (_mem_handle == nullptr) return 0; #if ENABLE_HIP if 
diff --git a/rocAL/rocAL/source/pipeline/tensor.cpp b/rocAL/rocAL/source/pipeline/tensor.cpp
index 86838653ac..95e5e25122 100644
--- a/rocAL/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/rocAL/source/pipeline/tensor.cpp
@@ -316,7 +316,7 @@ unsigned rocalTensor::copy_data(hipStream_t stream, void *host_memory, bool sync
 #endif
 
 unsigned rocalTensor::copy_data(void *user_buffer) {
-    if (_info._type != rocalTensorInfo::Type::HANDLE) return 0;
+    if (_mem_handle == nullptr) return 0;
 
 #if ENABLE_HIP
     if (_info._mem_type == RocalMemType::HIP) {