From d444fb9c20737bdc3471b2bc643ca6d91dd306e7 Mon Sep 17 00:00:00 2001
From: SundarRajan28 <sundarrajan@multicorewareinc.com>
Date: Fri, 6 Oct 2023 07:56:26 +0000
Subject: [PATCH 01/33] Adding mask pipeline support for rocAL

---
 rocAL/include/api/rocal_api_data_transfer.h   |   3 +-
 rocAL/include/api/rocal_api_meta_data.h       |  14 +-
 .../include/meta_data/coco_meta_data_reader.h |   6 +-
 rocAL/include/meta_data/meta_data.h           |  11 +-
 rocAL/include/meta_data/meta_data_reader.h    |  16 +-
 rocAL/include/pipeline/master_graph.h         |   5 +-
 rocAL/include/pipeline/ring_buffer.h          |   8 +-
 rocAL/include/pipeline/tensor.h               |  18 +-
 .../readers/image/coco_file_source_reader.h   |   3 +-
 rocAL/rocAL_hip/rocal_hip_kernels.cpp         |  83 ++++++---
 rocAL/rocAL_hip/rocal_hip_kernels.h           |   8 +-
 rocAL/source/api/rocal_api_augmentation.cpp   |  16 +-
 rocAL/source/api/rocal_api_data_transfer.cpp  |   4 +-
 rocAL/source/api/rocal_api_meta_data.cpp      |  41 ++++-
 .../meta_data/coco_meta_data_reader.cpp       |  30 ++-
 .../meta_node_resize_mirror_normalize.cpp     |   5 +
 rocAL/source/pipeline/master_graph.cpp        | 174 ++++++++++--------
 rocAL/source/pipeline/ring_buffer.cpp         |  40 +++-
 rocAL/source/pipeline/tensor.cpp              |  51 +----
 .../readers/image/coco_file_source_reader.cpp |  71 ++++++-
 rocAL_pybind/amd/rocal/pipeline.py            |  14 ++
 rocAL_pybind/amd/rocal/readers.py             |   7 +-
 rocAL_pybind/amd/rocal/types.py               |   2 +
 rocAL_pybind/rocal_pybind.cpp                 |  54 ++++++
 24 files changed, 477 insertions(+), 207 deletions(-)
diff --git a/rocAL/include/api/rocal_api_data_transfer.h b/rocAL/include/api/rocal_api_data_transfer.h
index 0d60e0b94..f94819273 100644
--- a/rocAL/include/api/rocal_api_data_transfer.h
+++ b/rocAL/include/api/rocal_api_data_transfer.h
@@ -63,7 +63,7 @@ extern "C" RocalStatus ROCAL_API_CALL rocalToTensor(RocalContext rocal_context,
                                                     RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type,
                                                     float multiplier0, float multiplier1, float multiplier2, float offset0,
                                                     float offset1, float offset2,
-                                                    bool reverse_channels, RocalOutputMemType output_mem_type);
+                                                    bool reverse_channels, RocalOutputMemType output_mem_type, int max_height = 0, int max_width = 0);
 
 /*!
  * \brief Sets the output images in the RocalContext
@@ -74,7 +74,6 @@ extern "C" RocalStatus ROCAL_API_CALL rocalToTensor(RocalContext rocal_context,
  */
 extern "C" void ROCAL_API_CALL rocalSetOutputs(RocalContext p_context, unsigned int num_of_outputs, std::vector<RocalTensor> &output_images);
 
-
 /*!
  * \brief gives the list of output tensors from rocal context
  * \ingroup group_rocal_data_transfer
diff --git a/rocAL/include/api/rocal_api_meta_data.h b/rocAL/include/api/rocal_api_meta_data.h
index dfe961acd..5c5a305dd 100644
--- a/rocAL/include/api/rocal_api_meta_data.h
+++ b/rocAL/include/api/rocal_api_meta_data.h
@@ -76,9 +76,14 @@ extern "C" RocalMetaData ROCAL_API_CALL rocalCreateTFReaderDetection(RocalContex
  * \ingroup group_rocal_meta_data
  * \param [in] rocal_context rocal context
  * \param [in] source_path path to the coco json file
+ * \param [in] mask enable polygon masks
+ * \param [in] ltrb If set to True, bboxes are returned as [left, top, right, bottom]. If set to False, the bboxes are returned as [x, y, width, height]
+ * \param [in] is_box_encoder If set to True, bboxes are returned as encoded bboxes using the anchors
+ * \param [in] avoid_class_remapping If set to True, classes are returned directly. Otherwise, classes are mapped to consecutive values
+ * \param [in] aspect_ratio_grouping If set to True, images are sorted by their aspect ratio and returned
  * \return RocalMetaData object, can be used to inquire about the rocal's output (processed) tensors
  */
-extern "C" RocalMetaData ROCAL_API_CALL rocalCreateCOCOReader(RocalContext rocal_context, const char* source_path, bool is_output, bool mask = false, bool ltrb = true, bool is_box_encoder = false);
+extern "C" RocalMetaData ROCAL_API_CALL rocalCreateCOCOReader(RocalContext rocal_context, const char* source_path, bool is_output, bool mask = false, bool ltrb = true, bool is_box_encoder = false, bool avoid_class_remapping = false, bool aspect_ratio_grouping = false);
 
 /*! \brief create coco reader key points
  * \ingroup group_rocal_meta_data
@@ -209,6 +214,13 @@ extern "C" RocalTensorList ROCAL_API_CALL rocalGetBoundingBoxCords(RocalContext
  */
 extern "C" void ROCAL_API_CALL rocalGetImageSizes(RocalContext rocal_context, int* buf);
 
+/*! \brief get ROI image sizes
+ * \ingroup group_rocal_meta_data
+ * \param [in] rocal_context rocal context
+ * \param [out] buf The user's buffer that will be filled with ROI image size info for the images in the output batch
+ */
+extern "C" void ROCAL_API_CALL rocalGetROIImageSizes(RocalContext rocal_context, int* buf);
+
 /*! \brief create text cifar10 label reader
  * \ingroup group_rocal_meta_data
  * \param [in] rocal_context rocal context
diff --git a/rocAL/include/meta_data/coco_meta_data_reader.h b/rocAL/include/meta_data/coco_meta_data_reader.h
index aec539bed..5e5efc67b 100644
--- a/rocAL/include/meta_data/coco_meta_data_reader.h
+++ b/rocAL/include/meta_data/coco_meta_data_reader.h
@@ -32,12 +32,12 @@ class COCOMetaDataReader : public MetaDataReader {
    public:
     void init(const MetaDataConfig& cfg, pMetaDataBatch meta_data_batch) override;
     void lookup(const std::vector<std::string>& image_names) override;
+    ImgSize lookup_image_size(const std::string& image_name) override;
     void read_all(const std::string& path) override;
     void release(std::string image_name);
     void release() override;
     void print_map_contents();
     bool set_timestamp_mode() override { return false; }
-
     const std::map<std::string, std::shared_ptr<MetaData>>& get_map_content() override { return _map_content; }
     COCOMetaDataReader();
 
@@ -45,12 +45,14 @@ class COCOMetaDataReader : public MetaDataReader {
     pMetaDataBatch _output;
     std::string _path;
     int meta_data_reader_type;
+    bool _avoid_class_remapping;
     void add(std::string image_name, BoundingBoxCords bbox, Labels labels, ImgSize image_size, int image_id = 0);
-    void add(std::string image_name, BoundingBoxCords bbox, Labels labels, ImgSize image_size, MaskCords mask_cords, std::vector<int> polygon_count, std::vector<std::vector<int>> vertices_count);  // To add Mask coordinates to Metadata struct
+    void add(std::string image_name, BoundingBoxCords bbox, Labels labels, ImgSize image_size, MaskCords mask_cords, std::vector<int> polygon_count, std::vector<std::vector<int>> vertices_count, int image_id = 0);  // To add Mask coordinates to Metadata struct
     bool exists(const std::string& image_name) override;
     std::map<std::string, std::shared_ptr<MetaData>> _map_content;
     std::map<std::string, std::shared_ptr<MetaData>>::iterator _itr;
     std::map<std::string, ImgSize> _map_img_sizes;
+    std::map<int, std::string> _map_img_names;
     std::map<std::string, ImgSize>::iterator itr;
     std::map<int, int> _label_info;
     std::map<int, int>::iterator _it_label;
diff --git a/rocAL/include/meta_data/meta_data.h b/rocAL/include/meta_data/meta_data.h
index 2d3ba9a26..cf0b9a458 100644
--- a/rocAL/include/meta_data/meta_data.h
+++ b/rocAL/include/meta_data/meta_data.h
@@ -104,6 +104,7 @@ typedef class MetaDataInfo {
     int img_id = -1;
     std::string img_name = "";
     ImgSize img_size = {};
+    ImgSize img_roi_size = {};
 } MetaDataInfo;
 
 class MetaData {
@@ -121,9 +122,11 @@ class MetaData {
     virtual JointsData& get_joints_data() = 0;
     virtual void set_joints_data(JointsData* joints_data) = 0;
     ImgSize& get_img_size() { return _info.img_size; }
+    ImgSize& get_img_roi_size() { return _info.img_roi_size; }
     std::string& get_image_name() { return _info.img_name; }
     int& get_image_id() { return _info.img_id; }
     void set_img_size(ImgSize img_size) { _info.img_size = std::move(img_size); }
+    void set_img_roi_size(ImgSize img_roi_size) { _info.img_roi_size = std::move(img_roi_size); }
     void set_img_id(int img_id) { _info.img_id = img_id; }
     void set_img_name(std::string img_name) { _info.img_name = img_name; }
     void set_metadata_info(MetaDataInfo info) { _info = std::move(info); }
@@ -167,13 +170,14 @@ class BoundingBox : public Label {
 
 struct PolygonMask : public BoundingBox {
    public:
-    PolygonMask(BoundingBoxCords bb_cords, Labels bb_label_ids, ImgSize img_size, MaskCords mask_cords, std::vector<int> polygon_count, std::vector<std::vector<int>> vertices_count) {
+    PolygonMask(BoundingBoxCords bb_cords, Labels bb_label_ids, ImgSize img_size, MaskCords mask_cords, std::vector<int> polygon_count, std::vector<std::vector<int>> vertices_count, int img_id = 0) {
         _bb_cords = std::move(bb_cords);
         _label_ids = std::move(bb_label_ids);
         _info.img_size = std::move(img_size);
         _mask_cords = std::move(mask_cords);
         _polygon_count = std::move(polygon_count);
         _vertices_count = std::move(vertices_count);
+        _info.img_id = img_id;
     }
     std::vector<int>& get_polygon_count() override { return _polygon_count; }
     std::vector<std::vector<int>>& get_vertices_count() override { return _vertices_count; }
@@ -207,20 +211,24 @@ class MetaDataInfoBatch {
     std::vector<int> img_ids = {};
     std::vector<std::string> img_names = {};
     std::vector<ImgSize> img_sizes = {};
+    std::vector<ImgSize> img_roi_sizes = {};
     void clear() {
         img_ids.clear();
         img_names.clear();
         img_sizes.clear();
+        img_roi_sizes.clear();
     }
     void resize(int batch_size) {
         img_ids.resize(batch_size);
         img_names.resize(batch_size);
         img_sizes.resize(batch_size);
+        img_roi_sizes.resize(batch_size);
     }
     void insert(MetaDataInfoBatch& other) {
         img_sizes.insert(img_sizes.end(), other.img_sizes.begin(), other.img_sizes.end());
         img_ids.insert(img_ids.end(), other.img_ids.begin(), other.img_ids.end());
         img_names.insert(img_names.end(), other.img_names.begin(), other.img_names.end());
+        img_roi_sizes.insert(img_roi_sizes.end(), other.img_roi_sizes.begin(), other.img_roi_sizes.end());
     }
 };
 
@@ -249,6 +257,7 @@ class MetaDataBatch {
     std::vector<int>& get_image_id_batch() { return _info_batch.img_ids; }
     std::vector<std::string>& get_image_names_batch() { return _info_batch.img_names; }
     ImgSizes& get_img_sizes_batch() { return _info_batch.img_sizes; }
+    ImgSizes& get_img_roi_sizes_batch() { return _info_batch.img_roi_sizes; }
     MetaDataInfoBatch& get_info_batch() { return _info_batch; }
     void set_metadata_type(MetaDataType metadata_type) { _type = metadata_type; }
     MetaDataType get_metadata_type() { return _type; }
diff --git a/rocAL/include/meta_data/meta_data_reader.h b/rocAL/include/meta_data/meta_data_reader.h
index e0a334ade..b16722c4e 100644
--- a/rocAL/include/meta_data/meta_data_reader.h
+++ b/rocAL/include/meta_data/meta_data_reader.h
@@ -56,16 +56,20 @@ struct MetaDataConfig {
     unsigned _frame_stride;
     unsigned _out_img_width;
     unsigned _out_img_height;
+    bool _avoid_class_remapping;
+    bool _aspect_ratio_grouping;
 
    public:
-    MetaDataConfig(const MetaDataType& type, const MetaDataReaderType& reader_type, const std::string& path, const std::map<std::string, std::string>& feature_key_map = std::map<std::string, std::string>(), const std::string file_prefix = std::string(), const unsigned& sequence_length = 3, const unsigned& frame_step = 3, const unsigned& frame_stride = 1)
-        : _type(type), _reader_type(reader_type), _path(path), _feature_key_map(feature_key_map), _file_prefix(file_prefix), _sequence_length(sequence_length), _frame_step(frame_step), _frame_stride(frame_stride) {}
+    MetaDataConfig(const MetaDataType& type, const MetaDataReaderType& reader_type, const std::string& path, const std::map<std::string, std::string>& feature_key_map = std::map<std::string, std::string>(), const std::string file_prefix = std::string(), const unsigned& sequence_length = 3, const unsigned& frame_step = 3, const unsigned& frame_stride = 1, bool avoid_class_remapping = false)
+        : _type(type), _reader_type(reader_type), _path(path), _feature_key_map(feature_key_map), _file_prefix(file_prefix), _sequence_length(sequence_length), _frame_step(frame_step), _frame_stride(frame_stride), _avoid_class_remapping(avoid_class_remapping), _aspect_ratio_grouping(false) {}  // grouping stays off until set_aspect_ratio_grouping() is called, so aspect_ratio_grouping() never reads an uninitialised bool
     MetaDataConfig() = delete;
     MetaDataType type() const { return _type; }
     MetaDataReaderType reader_type() const { return _reader_type; }
     std::string path() const { return _path; }
     std::map<std::string, std::string> feature_key_map() const { return _feature_key_map; }
     std::string file_prefix() const { return _file_prefix; }
+    bool class_remapping() const { return _avoid_class_remapping; }  // NOTE: returns the avoid_class_remapping flag as-is (true = avoid remapping), despite the accessor's name
+    bool aspect_ratio_grouping() const { return _aspect_ratio_grouping; }
     unsigned sequence_length() const { return _sequence_length; }
     unsigned frame_step() const { return _frame_step; }
     unsigned frame_stride() const { return _frame_stride; }
@@ -73,9 +77,14 @@ struct MetaDataConfig {
     unsigned out_img_height() const { return _out_img_height; }
     void set_out_img_width(unsigned out_img_width) { _out_img_width = out_img_width; }
     void set_out_img_height(unsigned out_img_height) { _out_img_height = out_img_height; }
+    void set_avoid_class_remapping(bool avoid_class_remapping) { _avoid_class_remapping = avoid_class_remapping; }
+    void set_aspect_ratio_grouping(bool aspect_ratio_grouping) { _aspect_ratio_grouping = aspect_ratio_grouping; }
 };
 
 class MetaDataReader {
+   private:
+    bool _aspect_ratio_grouping = false;
+
    public:
     enum class Status {
         OK = 0
@@ -88,4 +97,7 @@ class MetaDataReader {
     virtual const std::map<std::string, std::shared_ptr<MetaData>>& get_map_content() = 0;
     virtual bool exists(const std::string& image_name) = 0;
     virtual bool set_timestamp_mode() = 0;
+    virtual ImgSize lookup_image_size(const std::string& image_name) { return {}; }
+    void set_aspect_ratio_grouping(bool aspect_ratio_grouping) { _aspect_ratio_grouping = aspect_ratio_grouping; }
+    bool aspect_ratio_grouping() const { return _aspect_ratio_grouping; }
 };
diff --git a/rocAL/include/pipeline/master_graph.h b/rocAL/include/pipeline/master_graph.h
index 98349ae86..dd5662c93 100644
--- a/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/include/pipeline/master_graph.h
@@ -82,7 +82,7 @@ class MasterGraph {
     Status reset();
     size_t remaining_count();
     MasterGraph::Status to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1, float multiplier2,
-                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type);
+                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, int max_height = 0, int max_width = 0);
     Status copy_output(unsigned char *out_ptr, size_t out_size_in_bytes);
     Status copy_out_tensor_planar(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1, float multiplier2,
                                   float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
@@ -106,7 +106,8 @@ class MasterGraph {
     Tensor *create_loader_output_tensor(const TensorInfo &info);
     std::vector<rocalTensorList *> create_label_reader(const char *source_path, MetaDataReaderType reader_type);
     std::vector<rocalTensorList *> create_video_label_reader(const char *source_path, MetaDataReaderType reader_type, unsigned sequence_length, unsigned frame_step, unsigned frame_stride, bool file_list_frame_num = true);
-    std::vector<rocalTensorList *> create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType label_type, bool ltrb_bbox = true, bool is_box_encoder = false, float sigma = 0.0, unsigned pose_output_width = 0, unsigned pose_output_height = 0);
+    std::vector<rocalTensorList *> create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType label_type, bool ltrb_bbox = true, bool is_box_encoder = false,
+                                                                bool avoid_class_remapping = false, bool aspect_ratio_grouping = false, float sigma = 0.0, unsigned pose_output_width = 0, unsigned pose_output_height = 0);
     std::vector<rocalTensorList *> create_tf_record_meta_data_reader(const char *source_path, MetaDataReaderType reader_type, MetaDataType label_type, const std::map<std::string, std::string> feature_key_map);
     std::vector<rocalTensorList *> create_caffe_lmdb_record_meta_data_reader(const char *source_path, MetaDataReaderType reader_type, MetaDataType label_type);
     std::vector<rocalTensorList *> create_caffe2_lmdb_record_meta_data_reader(const char *source_path, MetaDataReaderType reader_type, MetaDataType label_type);
diff --git a/rocAL/include/pipeline/ring_buffer.h b/rocAL/include/pipeline/ring_buffer.h
index fc6ba9e0d..a5fc53183 100644
--- a/rocAL/include/pipeline/ring_buffer.h
+++ b/rocAL/include/pipeline/ring_buffer.h
@@ -46,12 +46,12 @@ class RingBuffer {
     ///\param dev
     ///\param sub_buffer_size
     ///\param sub_buffer_count
-    void init(RocalMemType mem_type, void *dev, std::vector<size_t> &sub_buffer_size);
+    void init(RocalMemType mem_type, void *dev, std::vector<size_t> &sub_buffer_size, size_t roi_buffer_size);
     void initBoxEncoderMetaData(RocalMemType mem_type, size_t encoded_bbox_size, size_t encoded_labels_size);
     void init_metadata(RocalMemType mem_type, std::vector<size_t> &sub_buffer_size);
     void release_gpu_res();
-    std::vector<void *> get_read_buffers();
-    std::vector<void *> get_write_buffers();
+    std::pair<std::vector<void *>, std::vector<unsigned *>> get_read_buffers();
+    std::pair<std::vector<void *>, std::vector<unsigned *>> get_write_buffers();
     std::pair<void *, void *> get_box_encode_write_buffers();
     std::pair<void *, void *> get_box_encode_read_buffers();
     MetaDataNamePair &get_meta_data();
@@ -86,6 +86,8 @@ class RingBuffer {
     std::condition_variable _wait_for_unload;
     std::vector<std::vector<void *>> _dev_sub_buffer;
     std::vector<std::vector<void *>> _host_sub_buffers;
+    std::vector<std::vector<unsigned *>> _dev_roi_buffers;
+    std::vector<std::vector<unsigned *>> _host_roi_buffers;
     std::vector<std::vector<void *>> _host_meta_data_buffers;
     std::vector<void *> _dev_bbox_buffer;
     std::vector<void *> _dev_labels_buffer;
diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index dd7868e88..54b0bf524 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -84,9 +84,6 @@ class TensorInfo {
                RocalTensorDataType data_type, RocalTensorlayout layout,
                RocalColorFormat color_format);
 
-    //! Copy constructor
-    TensorInfo(const TensorInfo& info);
-    ~TensorInfo();
     // Setting properties required for Image / Video
     void set_roi_type(RocalROIType roi_type) { _roi_type = roi_type; }
     void set_data_type(RocalTensorDataType data_type) {
@@ -211,7 +208,7 @@ class TensorInfo {
     RocalROIType roi_type() const { return _roi_type; }
     RocalTensorDataType data_type() const { return _data_type; }
     RocalTensorlayout layout() const { return _layout; }
-    RocalROI* get_roi() const { return (RocalROI*)_roi_buf; }
+    RocalROI* get_roi() const { return (RocalROI*)_roi.get(); }
     RocalColorFormat color_format() const { return _color_format; }
     Type type() const { return _type; }
     uint64_t data_type_size() {
@@ -221,6 +218,14 @@ class TensorInfo {
     bool is_image() const { return _is_image; }
     void set_metadata() { _is_metadata = true; }
     bool is_metadata() const { return _is_metadata; }
+    void set_roi_ptr(unsigned* roi_ptr) {
+        auto deleter = [&](unsigned* ptr) {};  // No-op deleter (not a destructor): the ROI memory is handled by the pipeline, so releasing _roi must not free it
+        _roi.reset(roi_ptr, deleter);
+    }
+    void copy_roi(void* roi_buffer) {  // Copies _batch_size RocalROI entries from _roi into roi_buffer
+        if (_roi != nullptr && roi_buffer != nullptr)  // silently does nothing when either side is null
+            memcpy((void*)roi_buffer, (const void*)_roi.get(), _batch_size * sizeof(RocalROI));
+    }
 
    private:
     Type _type = Type::UNKNOWN;                                  //!< tensor type, whether is virtual tensor, created from handle or is a regular tensor
@@ -233,7 +238,8 @@ class TensorInfo {
     RocalTensorDataType _data_type = RocalTensorDataType::FP32;  //!< tensor data type
     RocalTensorlayout _layout = RocalTensorlayout::NONE;         //!< layout of the tensor
     RocalColorFormat _color_format;                              //!< color format of the image
-    void* _roi_buf = nullptr;
+    unsigned* _roi_buf = nullptr;
+    std::shared_ptr<unsigned> _roi;
     uint64_t _data_type_size = tensor_data_size(_data_type);
     uint64_t _data_size = 0;
     std::vector<size_t> _max_shape;  //!< stores the the width and height dimensions in the tensor
@@ -275,6 +281,8 @@ class Tensor : public rocalTensor {
     void create_roi_tensor_from_handle(void** handle);
     void update_tensor_roi(const std::vector<uint32_t>& width, const std::vector<uint32_t>& height);
     void reset_tensor_roi() { _info.reset_tensor_roi_buffers(); }
+    void set_roi(unsigned* roi_ptr) { _info.set_roi_ptr(roi_ptr); }
+    void copy_roi(void* roi_buffer) { _info.copy_roi(roi_buffer); }
     vx_tensor get_roi_tensor() { return _vx_roi_handle; }
     // create_from_handle() no internal memory allocation is done here since
     // tensor's handle should be swapped with external buffers before usage
diff --git a/rocAL/include/readers/image/coco_file_source_reader.h b/rocAL/include/readers/image/coco_file_source_reader.h
index 928fea12f..0e3a11bb8 100644
--- a/rocAL/include/readers/image/coco_file_source_reader.h
+++ b/rocAL/include/readers/image/coco_file_source_reader.h
@@ -76,7 +76,8 @@ class COCOFileSourceReader : public Reader {
     DIR *_src_dir;
     DIR *_sub_dir;
     struct dirent *_entity;
-    std::vector<std::string> _file_names;
+    std::vector<std::string> _file_names, _sorted_file_names;
+    std::vector<float> _aspect_ratios;
     std::vector<std::string> _files;
     unsigned _curr_file_idx;
     FILE *_current_fPtr;
diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.cpp b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
index d238d0366..449a8672c 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.cpp
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
@@ -32,6 +32,7 @@ Hip_CopyInt8ToNHWC_fp32(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
+    uint2 outDims,
     float3 multiplier,
     float3 offset,
     unsigned int reverse_channels) {
@@ -40,19 +41,21 @@ Hip_CopyInt8ToNHWC_fp32(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
+    const int maxOutH = outDims.x;
+    const int maxOutW = outDims.y;
     const int img_offset = C * W * H;
+    const int out_img_offset = C * maxOutW * maxOutH;
 
-    if ((x >= W) || (y >= H))
+    if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
         unsigned int srcIdx = (y * W + x) * C;  // src is RGB
-        unsigned int dstIdx = (y * W + x) * C;
+        unsigned int dstIdx = (y * maxOutW + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
             float3 dst;
-
             const uchar *inp_img = &inp_image_u8[n * img_offset];
-            float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * img_offset);
+            float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * out_img_offset);
             if (reverse_channels)
                 dst = make_float3((float)inp_img[srcIdx + 2], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx]) * multiplier + offset;
             else
@@ -62,7 +65,7 @@ Hip_CopyInt8ToNHWC_fp32(
             out_tensor[dstIdx + 2] = dst.z;
         } else {
             const uchar *inp_img = &inp_image_u8[n * img_offset + dst_buf_offset];
-            float *out_tensor = (float *)output_tensor + dst_buf_offset + n * img_offset;
+            float *out_tensor = (float *)output_tensor + dst_buf_offset + n * out_img_offset;
             out_tensor[dstIdx] = (float)inp_img[srcIdx] * multiplier.x + offset.x;
         }
     }
@@ -74,6 +77,7 @@ Hip_CopyInt8ToNHWC_fp16(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
+    uint2 outDims,
     float3 multiplier,
     float3 offset,
     const unsigned int reverse_channels) {
@@ -82,16 +86,19 @@ Hip_CopyInt8ToNHWC_fp16(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
+    const int maxOutH = outDims.x;
+    const int maxOutW = outDims.y;
     const int img_offset = C * W * H;
+    const int out_img_offset = C * maxOutW * maxOutH;
 
-    if ((x >= W) || (y >= H))
+    if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * img_offset;
+        __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * out_img_offset;
         unsigned int srcIdx = (y * W + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
-            unsigned int dstIdx = y * W + x * 3;
+            unsigned int dstIdx = (y * maxOutW + x) * 3;  // interleaved output: 3 values per pixel, row pitch maxOutW pixels (same form as the fp32 NHWC kernel)
             const uchar *inp_img = &inp_image_u8[n * img_offset];
             float3 dst;
             if (reverse_channels)
@@ -102,9 +109,9 @@ Hip_CopyInt8ToNHWC_fp16(
             out_tensor[dstIdx + 1] = __float2half(dst.y);
             out_tensor[dstIdx + 2] = __float2half(dst.z);
         } else {
-            unsigned int dstIdx = y * W + x;
+            unsigned int dstIdx = y * maxOutW + x;
             const uchar *inp_img = &inp_image_u8[n * img_offset];
-            float *out_tensor = (float *)output_tensor + n * img_offset;
+            __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * out_img_offset;  // fp16 kernel: index the output as __half (a float* here would write 4-byte elements) and keep dst_buf_offset like the C == 3 path
             out_tensor[dstIdx] = __float2half((float)inp_img[srcIdx] * multiplier.x + offset.x);
         }
     }
@@ -116,6 +123,7 @@ Hip_CopyInt8ToNCHW_fp32(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
+    uint2 outDims,
     float3 multiplier,
     float3 offset,
     unsigned int reverse_channels) {
@@ -124,16 +132,20 @@ Hip_CopyInt8ToNCHW_fp32(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
+    const int maxOutH = outDims.x;
+    const int maxOutW = outDims.y;
     const int img_offset = C * W * H;
+    const int out_img_offset = C * maxOutW * maxOutH;
 
-    if ((x >= W) || (y >= H))
+    if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
         unsigned int srcIdx = (y * W + x) * C;
-        unsigned int dstIdx = y * W + x;
+        unsigned int dstIdx = y * maxOutW + x;
         // copy float3  pixels to dst
         const uchar *inp_img = &inp_image_u8[n * img_offset];
-        float *out_tensor = (float *)output_tensor + n * img_offset + dst_buf_offset;
+        float *out_tensor = (float *)output_tensor + n * out_img_offset + dst_buf_offset;
+        unsigned int stride = maxOutW * maxOutH;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -141,8 +153,8 @@ Hip_CopyInt8ToNCHW_fp32(
             else
                 dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
             out_tensor[dstIdx] = dst.x;
-            out_tensor[dstIdx + W * H] = dst.y;
-            out_tensor[dstIdx + W * H * 2] = dst.z;
+            out_tensor[dstIdx + stride] = dst.y;
+            out_tensor[dstIdx + stride * 2] = dst.z;
         } else {
             out_tensor[dstIdx] = (float)inp_img[srcIdx] * multiplier.x + offset.x;
         }
@@ -155,6 +167,7 @@ Hip_CopyInt8ToNCHW_fp16(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
+    uint2 outDims,
     float3 multiplier,
     float3 offset,
     const unsigned int reverse_channels) {
@@ -163,16 +176,20 @@ Hip_CopyInt8ToNCHW_fp16(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
+    const int maxOutH = outDims.x;
+    const int maxOutW = outDims.y;
     const int img_offset = C * W * H;
+    const int out_img_offset = C * maxOutW * maxOutH;
 
-    if ((x >= W) || (y >= H))
+    if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        __half *out_tensor = (__half *)output_tensor + n * img_offset + dst_buf_offset;
+        __half *out_tensor = (__half *)output_tensor + n * out_img_offset + dst_buf_offset;
         const uchar *inp_img = &inp_image_u8[n * img_offset];
         unsigned int srcIdx = (y * W + x) * C;
         // copy float3  pixels to dst
-        unsigned int dstIdx = y * W + x;
+        unsigned int dstIdx = y * maxOutW + x;
+        unsigned int stride = maxOutW * maxOutH;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -180,8 +197,8 @@ Hip_CopyInt8ToNCHW_fp16(
             else
                 dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
             out_tensor[dstIdx] = __float2half(dst.x);
-            out_tensor[dstIdx + W * H] = __float2half(dst.y);
-            out_tensor[dstIdx + W * H * 2] = __float2half(dst.z);
+            out_tensor[dstIdx + stride] = __float2half(dst.y);
+            out_tensor[dstIdx + stride * 2] = __float2half(dst.z);
         } else {
             out_tensor[dstIdx] = __float2half((float)inp_img[srcIdx] * multiplier.x + offset.x);
         }
@@ -204,15 +221,22 @@ int HipExecCopyInt8ToNHWC(
     float offset1,
     float offset2,
     unsigned int reverse_channels,
-    unsigned int fp16) {
+    unsigned int fp16,
+    const unsigned max_output_height,
+    const unsigned max_output_width) {
     int localThreads_x = 16, localThreads_y = 16;
+    uint2 outDims;
+    if ((max_output_height == 0) || (max_output_width == 0))
+        outDims = make_uint2(h, w);
+    else
+        outDims = make_uint2(max_output_height, max_output_width);
     int globalThreads_x = w, globalThreads_y = h;
     if (!fp16) {
         hipLaunchKernelGGL(Hip_CopyInt8ToNHWC_fp32,
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w),
+                           make_uint4(n, c, h, w), outDims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     } else {
@@ -220,7 +244,7 @@ int HipExecCopyInt8ToNHWC(
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w),
+                           make_uint4(n, c, h, w), outDims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     }
@@ -243,15 +267,22 @@ int HipExecCopyInt8ToNCHW(
     float offset1,
     float offset2,
     unsigned int reverse_channels,
-    unsigned int fp16) {
+    unsigned int fp16,
+    const unsigned max_output_height,
+    const unsigned max_output_width) {
     int localThreads_x = 16, localThreads_y = 16;
+    uint2 outDims;
+    if ((max_output_height == 0) || (max_output_width == 0))
+        outDims = make_uint2(h, w);
+    else
+        outDims = make_uint2(max_output_height, max_output_width);
     int globalThreads_x = w, globalThreads_y = h;
     if (!fp16) {
         hipLaunchKernelGGL(Hip_CopyInt8ToNCHW_fp32,
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w),
+                           make_uint4(n, c, h, w), outDims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     } else {
@@ -259,7 +290,7 @@ int HipExecCopyInt8ToNCHW(
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w),
+                           make_uint4(n, c, h, w), outDims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     }
diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.h b/rocAL/rocAL_hip/rocal_hip_kernels.h
index 9c6d81884..0db801f59 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.h
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.h
@@ -38,7 +38,9 @@ int HipExecCopyInt8ToNHWC(
     float offset1,
     float offset2,
     unsigned int reverse_channels,
-    unsigned int fp16);
+    unsigned int fp16,
+    const unsigned max_output_height = 0,
+    const unsigned max_output_width = 0);
 
 int HipExecCopyInt8ToNCHW(
     hipStream_t stream,
@@ -56,4 +58,6 @@ int HipExecCopyInt8ToNCHW(
     float offset1,
     float offset2,
     unsigned int reverse_channels,
-    unsigned int fp16);
+    unsigned int fp16,
+    const unsigned max_output_height = 0,
+    const unsigned max_output_width = 0);
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index b4ca8b42e..4137c50b6 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -554,16 +554,20 @@ RocalTensor ROCAL_API_CALL
     try {
         if ((dest_width | dest_height | resize_longer | resize_shorter) == 0)
             THROW("Atleast one size 'dest_width' or 'dest_height' or 'resize_shorter' or 'resize_longer' must be specified")
-        if ((dest_width | dest_height) && (resize_longer | resize_shorter))
+        if ((dest_width | dest_height) && (resize_longer | resize_shorter) && (scaling_mode != RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX))
             THROW("Only one method of specifying size can be used \ndest_width and/or dest_height\nresize_shorter\nresize_longer")
-        if (resize_longer && resize_shorter)
-            THROW("'resize_longer' and 'resize_shorter' cannot be passed together. They are mutually exclusive.")
+        if (resize_longer && resize_shorter && scaling_mode != RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX)
+            THROW("'resize_longer' and 'resize_shorter' can only be passed together for min max scaling mode")
 
         unsigned out_width, out_height;
         RocalResizeScalingMode resize_scaling_mode;
 
         // Change the scaling mode if resize_shorter or resize_longer is specified
-        if (resize_shorter) {
+        if (scaling_mode == RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX) {
+            resize_scaling_mode = scaling_mode;
+            out_width = dest_width;
+            out_height = dest_height;
+        } else if (resize_shorter) {
             resize_scaling_mode = RocalResizeScalingMode::ROCAL_SCALING_MODE_NOT_SMALLER;
             out_width = out_height = resize_shorter;
         } else if (resize_longer) {
@@ -609,6 +613,10 @@ RocalTensor ROCAL_API_CALL
                 max_out_height = maximum_size[1] ? maximum_size[1] : max_out_height;
             }
         }
+        if (scaling_mode == RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX) {
+            // For Min Max scaling mode, both min size and max size are passed as resize_shorter and resize_longer values
+            maximum_size = {resize_shorter, resize_longer};
+        }
 
         RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
         RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
diff --git a/rocAL/source/api/rocal_api_data_transfer.cpp b/rocAL/source/api/rocal_api_data_transfer.cpp
index 189328f6e..a3e3088cf 100644
--- a/rocAL/source/api/rocal_api_data_transfer.cpp
+++ b/rocAL/source/api/rocal_api_data_transfer.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 RocalStatus ROCAL_API_CALL
 rocalToTensor(RocalContext p_context, void* out_ptr, RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type, float multiplier0,
               float multiplier1, float multiplier2, float offset0, float offset1, float offset2,
-              bool reverse_channels, RocalOutputMemType output_mem_type) {
+              bool reverse_channels, RocalOutputMemType output_mem_type, int max_height, int max_width) {
     auto context = static_cast<Context*>(p_context);
     try {
         if (tensor_format != ROCAL_NHWC && tensor_format != ROCAL_NCHW)
@@ -42,7 +42,7 @@ rocalToTensor(RocalContext p_context, void* out_ptr, RocalTensorLayout tensor_fo
         auto tensor_layout = (tensor_format == ROCAL_NHWC) ? RocalTensorlayout::NHWC : RocalTensorlayout::NCHW;
         auto tensor_output_data_type = (tensor_output_type == ROCAL_FP32) ? RocalTensorDataType::FP32 : RocalTensorDataType::FP16;
         context->master_graph->to_tensor(out_ptr, tensor_layout, multiplier0, multiplier1, multiplier2,
-                                         offset0, offset1, offset2, reverse_channels, tensor_output_data_type, output_mem_type);
+                                         offset0, offset1, offset2, reverse_channels, tensor_output_data_type, output_mem_type, max_height, max_width);
     } catch (const std::exception& e) {
         context->capture_error(e.what());
         ERR(e.what())
diff --git a/rocAL/source/api/rocal_api_meta_data.cpp b/rocAL/source/api/rocal_api_meta_data.cpp
index 1fa4768a7..0eaf89958 100644
--- a/rocAL/source/api/rocal_api_meta_data.cpp
+++ b/rocAL/source/api/rocal_api_meta_data.cpp
@@ -71,14 +71,14 @@ RocalMetaData
 
 RocalMetaData
     ROCAL_API_CALL
-    rocalCreateCOCOReader(RocalContext p_context, const char* source_path, bool is_output, bool mask, bool ltrb, bool is_box_encoder) {
+    rocalCreateCOCOReader(RocalContext p_context, const char* source_path, bool is_output, bool mask, bool ltrb, bool is_box_encoder, bool avoid_class_remapping, bool aspect_ratio_grouping) {
     if (!p_context)
         THROW("Invalid rocal context passed to rocalCreateCOCOReader")
     auto context = static_cast<Context*>(p_context);
     if (mask) {
-        return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::PolygonMask, ltrb, is_box_encoder);
+        return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::PolygonMask, ltrb, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping);
     }
-    return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::BoundingBox, ltrb, is_box_encoder);
+    return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::BoundingBox, ltrb, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping);
 }
 
 RocalMetaData
@@ -200,8 +200,7 @@ void
     if (context->user_batch_size() != meta_data_batch_size)
         THROW("meta data batch size is wrong " + TOSTR(meta_data_batch_size) + " != " + TOSTR(context->user_batch_size()))
     for (unsigned int i = 0; i < meta_data_batch_size; i++) {
-        std::string str_id = meta_data.first[i].erase(0, meta_data.first[i].find_first_not_of('0'));
-        buf[i] = stoi(str_id);
+        buf[i] = meta_data.second->get_image_id_batch()[i];
     }
 }
 
@@ -360,6 +359,32 @@ void
     }
 }
 
+// Copies each entry of the metadata batch's image ROI sizes into buf, advancing buf by two ints per entry.
+// NOTE(review): buf presumably needs room for 2 ints per image; assumes sizeof(ImgSize) == 2 * sizeof(int) - confirm.
+void ROCAL_API_CALL rocalGetROIImageSizes(RocalContext p_context, int* buf) {
+    if (!p_context) {
+        THROW("Invalid rocal context passed to rocalGetROIImageSizes")
+        return;
+    }
+    auto context = static_cast<Context*>(p_context);
+    try {
+        auto meta_data = context->master_graph->meta_data();
+        // Check the batch pointer before dereferencing it to query the ROI sizes.
+        if (!meta_data.second) {
+            WRN("No label has been loaded for this output image")
+            return;
+        }
+        const auto& roi_sizes = meta_data.second->get_img_roi_sizes_batch();
+        for (unsigned i = 0; i < roi_sizes.size(); i++) {
+            memcpy(buf, &roi_sizes[i], sizeof(ImgSize));
+            buf += 2;
+        }
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        std::cerr << e.what() << '\n';
+    }
+}
+
 RocalMetaData
     ROCAL_API_CALL
     rocalCreateTextCifar10LabelReader(RocalContext p_context, const char* source_path, const char* file_prefix) {
@@ -396,10 +421,8 @@ void
     }
 }
 
-void
-    ROCAL_API_CALL
-    rocalBoxEncoder(RocalContext p_context, std::vector<float>& anchors, float criteria,
-                    std::vector<float>& means, std::vector<float>& stds, bool offset, float scale) {
+void ROCAL_API_CALL rocalBoxEncoder(RocalContext p_context, std::vector<float>& anchors, float criteria,
+                                    std::vector<float>& means, std::vector<float>& stds, bool offset, float scale) {
     if (!p_context)
         THROW("Invalid rocal context passed to rocalBoxEncoder")
     auto context = static_cast<Context*>(p_context);
diff --git a/rocAL/source/meta_data/coco_meta_data_reader.cpp b/rocAL/source/meta_data/coco_meta_data_reader.cpp
index 66b34157f..d0ddea904 100644
--- a/rocAL/source/meta_data/coco_meta_data_reader.cpp
+++ b/rocAL/source/meta_data/coco_meta_data_reader.cpp
@@ -33,6 +33,8 @@ using namespace std;
 
 void COCOMetaDataReader::init(const MetaDataConfig &cfg, pMetaDataBatch meta_data_batch) {
     _path = cfg.path();
+    _avoid_class_remapping = cfg.class_remapping();
+    this->set_aspect_ratio_grouping(cfg.aspect_ratio_grouping());
     _output = meta_data_batch;
     _output->set_metadata_type(cfg.type());
 }
@@ -41,6 +43,13 @@ bool COCOMetaDataReader::exists(const std::string &image_name) {
     return _map_content.find(image_name) != _map_content.end();
 }
 
+// Looks up image_name in _map_content and returns that entry's image size; an unknown name is reported via THROW.
+ImgSize COCOMetaDataReader::lookup_image_size(const std::string &image_name) {
+    const auto entry = _map_content.find(image_name);
+    if (entry == _map_content.end()) THROW("ERROR: Given name not present in the map " + image_name)
+    return entry->second->get_img_size();
+}
+
 void COCOMetaDataReader::lookup(const std::vector<std::string> &image_names) {
     if (image_names.empty()) {
         WRN("No image names passed")
@@ -67,7 +76,7 @@ void COCOMetaDataReader::lookup(const std::vector<std::string> &image_names) {
     }
 }
 
-void COCOMetaDataReader::add(std::string image_name, BoundingBoxCords bb_coords, Labels bb_labels, ImgSize image_size, MaskCords mask_cords, std::vector<int> polygon_count, std::vector<std::vector<int>> vertices_count) {
+void COCOMetaDataReader::add(std::string image_name, BoundingBoxCords bb_coords, Labels bb_labels, ImgSize image_size, MaskCords mask_cords, std::vector<int> polygon_count, std::vector<std::vector<int>> vertices_count, int image_id) {
     if (exists(image_name)) {
         auto it = _map_content.find(image_name);
         it->second->get_bb_cords().push_back(bb_coords[0]);
@@ -77,7 +86,7 @@ void COCOMetaDataReader::add(std::string image_name, BoundingBoxCords bb_coords,
         it->second->get_vertices_count().push_back(vertices_count[0]);
         return;
     }
-    pMetaDataPolygonMask info = std::make_shared<PolygonMask>(bb_coords, bb_labels, image_size, mask_cords, polygon_count, vertices_count);
+    pMetaDataPolygonMask info = std::make_shared<PolygonMask>(bb_coords, bb_labels, image_size, mask_cords, polygon_count, vertices_count, image_id);
     _map_content.insert(pair<std::string, std::shared_ptr<PolygonMask>>(image_name, info));
 }
 
@@ -163,6 +172,7 @@ void COCOMetaDataReader::read_all(const std::string &path) {
     parser.EnterObject();
     while (const char *key = parser.NextObjectKey()) {
         if (0 == std::strcmp(key, "images")) {
+            int image_id;
             RAPIDJSON_ASSERT(parser.PeekType() == kArrayType);
             parser.EnterArray();
             while (parser.NextArrayValue()) {
@@ -178,10 +188,13 @@ void COCOMetaDataReader::read_all(const std::string &path) {
                         img_size.h = parser.GetInt();
                     } else if (0 == std::strcmp(internal_key, "file_name")) {
                         image_name = parser.GetString();
+                    } else if (0 == std::strcmp(internal_key, "id")) {
+                        image_id = parser.GetInt();
                     } else {
                         parser.SkipValue();
                     }
                 }
+                _map_img_names.insert(pair<int, std::string>(image_id, image_name));
                 _map_img_sizes.insert(pair<std::string, ImgSize>(image_name, img_size));
                 img_size = {};
             }
@@ -256,12 +269,9 @@ void COCOMetaDataReader::read_all(const std::string &path) {
                         parser.SkipValue();
                     }
                 }
-                char buffer[13];
-                sprintf(buffer, "%012d", id);
-                string str(buffer);
-                std::string file_name = str + ".jpg";
 
-                auto it = _map_img_sizes.find(file_name);
+                auto itr = _map_img_names.find(id);
+                auto it = _map_img_sizes.find(itr->second);
                 ImgSize image_size = it->second;  // Convert to "ltrb" format
                 if ((_output->get_metadata_type() == MetaDataType::PolygonMask) && iscrowd == 0) {
                     box.l = bbox[0];
@@ -272,7 +282,7 @@ void COCOMetaDataReader::read_all(const std::string &path) {
                     bb_labels.push_back(label);
                     polygon_count.push_back(polygon_size);
                     vertices_count.push_back(vertices_array);
-                    add(file_name, bb_coords, bb_labels, image_size, mask, polygon_count, vertices_count);
+                    add(itr->second, bb_coords, bb_labels, image_size, mask, polygon_count, vertices_count, id);
                     mask.clear();
                     polygon_size = 0;
                     polygon_count.clear();
@@ -287,7 +297,7 @@ void COCOMetaDataReader::read_all(const std::string &path) {
                     box.b = (bbox[1] + bbox[3]);
                     bb_coords.push_back(box);
                     bb_labels.push_back(label);
-                    add(file_name, bb_coords, bb_labels, image_size, id);
+                    add(itr->second, bb_coords, bb_labels, image_size, id);
                     bb_coords.clear();
                     bb_labels.clear();
                 }
@@ -303,7 +313,7 @@ void COCOMetaDataReader::read_all(const std::string &path) {
         Labels continuous_label_id;
         for (unsigned int i = 0; i < bb_coords.size(); i++) {
             auto _it_label = _label_info.find(bb_labels[i]);
-            int cnt_idx = _it_label->second;
+            int cnt_idx = _avoid_class_remapping ? _it_label->first : _it_label->second;
             continuous_label_id.push_back(cnt_idx);
         }
         elem.second->set_labels(continuous_label_id);
diff --git a/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp b/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
index a2f2db643..b3deb4199 100644
--- a/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
+++ b/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
@@ -75,6 +75,11 @@ void ResizeMirrorNormalizeMetaNode::update_parameters(pMetaDataBatch input_meta_
             bb_coords.push_back(coords_buf[j]);
             bb_labels.push_back(labels_buf[j]);
         }
+        // get roi width and height of output image
+        auto img_roi_size = input_meta_data->get_img_roi_sizes_batch()[i];
+        img_roi_size.w = output_roi[i].x2;
+        img_roi_size.h = output_roi[i].y2;
+        output_meta_data->get_img_roi_sizes_batch()[i] = img_roi_size;
         output_meta_data->get_bb_cords_batch()[i] = bb_coords;
         output_meta_data->get_labels_batch()[i] = bb_labels;
     }
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index f36139122..a11618074 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -263,9 +263,9 @@ MasterGraph::build() {
         THROW("No output tensors are there, cannot create the pipeline")
 
 #if ENABLE_HIP || ENABLE_OPENCL
-    _ring_buffer.init(_mem_type, (void *)_device.resources(), _internal_tensor_list.data_size());
+    _ring_buffer.init(_mem_type, (void *)_device.resources(), _internal_tensor_list.data_size(), _user_batch_size * sizeof(RocalROI));
 #else
-    _ring_buffer.init(_mem_type, nullptr, _internal_tensor_list.data_size());
+    _ring_buffer.init(_mem_type, nullptr, _internal_tensor_list.data_size(), _user_batch_size * sizeof(RocalROI));
 #endif
     if (_is_box_encoder) _ring_buffer.initBoxEncoderMetaData(_mem_type, _user_batch_size * _num_anchors * 4 * sizeof(float), _user_batch_size * _num_anchors * sizeof(int));
     create_single_graph();
@@ -452,7 +452,7 @@ MasterGraph::timing() {
 
 MasterGraph::Status
 MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1,
-                       float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type) {
+                       float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, int max_height, int max_width) {
     if (no_more_processed_data())
         return MasterGraph::Status::NO_MORE_DATA;
 
@@ -474,6 +474,10 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
     const size_t h = dims[1];
     const size_t w = dims[2];
     const size_t single_output_tensor_size = output_tensor_info.data_size();
+    if ((max_height == 0) || (max_width == 0)) {
+        max_height = h;
+        max_width = w;
+    }
 
 #if ENABLE_OPENCL
     if (output_tensor_info.mem_type() == RocalMemType::OCL) {
@@ -491,7 +495,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
         cl_kernel kernel = _device["utility"][kernel_name];
         auto queue = _device.resources()->cmd_queue;
         unsigned dest_buf_offset = 0;
-        auto output_buffers = _ring_buffer.get_read_buffers();
+        auto output_buffers = _ring_buffer.get_read_buffers().first;
 
         if (_output_tensor_buffer == nullptr) {
             size_t size = output_tensor_info.data_size() * sizeof(cl_float);
@@ -548,7 +552,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
     if (output_tensor_info.mem_type() == RocalMemType::HIP) {
         unsigned int fp16 = (output_data_type == RocalTensorDataType::FP16);
 
-        auto output_buffers = _ring_buffer.get_read_buffers();
+        auto output_buffers = _ring_buffer.get_read_buffers().first;
         unsigned dest_buf_offset = 0;
         // copy hip buffer to out_ptr
         // todo:: add callback routing to exchange memory pointer to avoid extra copy
@@ -556,11 +560,11 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
             auto img_buffer = out_tensor;
             if (format == RocalTensorlayout::NHWC) {
                 HipExecCopyInt8ToNHWC(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
+                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
 
             } else {
                 HipExecCopyInt8ToNCHW(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
+                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
             }
             dest_buf_offset += single_output_tensor_size;
         }
@@ -569,7 +573,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
         if (output_mem_type == RocalOutputMemType::ROCAL_MEMCPY_GPU) {
             unsigned int fp16 = (output_data_type == RocalTensorDataType::FP16);
 
-            auto output_buffers = _ring_buffer.get_read_buffers();
+            auto output_buffers = _ring_buffer.get_read_buffers().first;
             unsigned dest_buf_offset = 0;
 
             if (_output_tensor_buffer == nullptr) {
@@ -593,11 +597,11 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
 
                 if (format == RocalTensorlayout::NHWC) {
                     HipExecCopyInt8ToNHWC(_device.resources()->hip_stream, (const void *)_output_tensor_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
+                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
 
                 } else {
                     HipExecCopyInt8ToNCHW(_device.resources()->hip_stream, (const void *)_output_tensor_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16);
+                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
                 }
                 dest_buf_offset += single_output_tensor_size;
             }
@@ -610,15 +614,17 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
             float offset[3] = {offset0, offset1, offset2};
             size_t dest_buf_offset_start = 0;
 
-            auto output_buffers = _ring_buffer.get_read_buffers();
+            auto output_buffers = _ring_buffer.get_read_buffers().first;
             auto num_threads = _cpu_num_threads * 2;
             for (auto &&out_tensor : output_buffers) {
                 unsigned int single_tensor_size = w * c * h;
-                auto channel_size = w * h;
+                unsigned int channel_size = max_width * max_height;
+                unsigned int output_single_tensor_size = max_height * max_width * c;
+                unsigned int input_width_stride = w * c;
 #pragma omp parallel for num_threads(num_threads)
-                for (unsigned int batchCount = 0; batchCount < n; batchCount++) {
-                    size_t dest_buf_offset = dest_buf_offset_start + single_tensor_size * batchCount;
-                    auto in_buffer = (unsigned char *)out_tensor + single_tensor_size * batchCount;
+                for (unsigned int batch_count = 0; batch_count < n; batch_count++) {
+                    size_t dest_buf_offset = dest_buf_offset_start + output_single_tensor_size * batch_count;
+                    auto in_buffer = (unsigned char *)out_tensor + single_tensor_size * batch_count;
 
                     if (format == RocalTensorlayout::NHWC) {
                         if (output_data_type == RocalTensorDataType::FP32) {
@@ -669,34 +675,37 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 __m256 padd0 = _mm256_set1_ps(offset0);
                                 __m256 padd1 = _mm256_set1_ps(offset1);
                                 __m256 padd2 = _mm256_set1_ps(offset2);
-                                unsigned int alignedLength = (channel_size & ~7);  // multiple of 8
-                                unsigned int i = 0;
+                                int alignedLength = (max_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
-                                for (; i < alignedLength; i += 8) {
-                                    __m256i pix0 = _mm256_loadu_si256((const __m256i *)in_buffer);
-                                    pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
-                                    fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
-                                    fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
-                                    fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                    fB = _mm256_mul_ps(fB, pmul0);
-                                    fG = _mm256_mul_ps(fG, pmul1);
-                                    fR = _mm256_mul_ps(fR, pmul2);
-                                    fB = _mm256_add_ps(fB, padd0);
-                                    fG = _mm256_add_ps(fG, padd1);
-                                    fR = _mm256_add_ps(fR, padd2);
-                                    _mm256_storeu_ps(B_buf, fB);
-                                    _mm256_storeu_ps(G_buf, fG);
-                                    _mm256_storeu_ps(R_buf, fR);
-                                    B_buf += 8;
-                                    G_buf += 8;
-                                    R_buf += 8;
-                                    in_buffer += 24;
-                                }
-                                for (; i < channel_size; i++, in_buffer += 3) {
-                                    *B_buf++ = (in_buffer[0] * multiplier0) + offset0;
-                                    *G_buf++ = (in_buffer[1] * multiplier1) + offset1;
-                                    *R_buf++ = (in_buffer[2] * multiplier2) + offset1;
+                                for (int row = 0; row < max_height; row++) {
+                                    unsigned char *in_buffer_row = reinterpret_cast<unsigned char *>(in_buffer) + (row * input_width_stride);
+                                    int col = 0;
+                                    for (; col < alignedLength; col += 8) {
+                                        __m256i pix0 = _mm256_loadu_si256((const __m256i *)in_buffer_row);
+                                        pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
+                                        fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
+                                        fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
+                                        fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
+                                        fB = _mm256_mul_ps(fB, pmul0);
+                                        fG = _mm256_mul_ps(fG, pmul1);
+                                        fR = _mm256_mul_ps(fR, pmul2);
+                                        fB = _mm256_add_ps(fB, padd0);
+                                        fG = _mm256_add_ps(fG, padd1);
+                                        fR = _mm256_add_ps(fR, padd2);
+                                        _mm256_storeu_ps(B_buf, fB);
+                                        _mm256_storeu_ps(G_buf, fG);
+                                        _mm256_storeu_ps(R_buf, fR);
+                                        B_buf += 8;
+                                        G_buf += 8;
+                                        R_buf += 8;
+                                        in_buffer_row += 24;
+                                    }
+                                    for (; col < max_width; col++, in_buffer_row += 3) {  // scalar tail: columns left over after the 8-wide vector loop
+                                        *B_buf++ = (in_buffer_row[0] * multiplier0) + offset0;
+                                        *G_buf++ = (in_buffer_row[1] * multiplier1) + offset1;
+                                        *R_buf++ = (in_buffer_row[2] * multiplier2) + offset2;  // offset2, matching padd2 in the vector loop
+                                    }
                                 }
 #else
                                 for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
@@ -733,35 +742,38 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 __m256 padd0 = _mm256_set1_ps(offset0);
                                 __m256 padd1 = _mm256_set1_ps(offset1);
                                 __m256 padd2 = _mm256_set1_ps(offset2);
-                                unsigned int alignedLength = (channel_size & ~7);  // multiple of 8
-                                unsigned int i = 0;
+                                int alignedLength = (max_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
                                 __m128i tempR, tempG, tempB;
-                                for (; i < alignedLength; i += 8) {
-                                    __m256i pix0 = _mm256_loadu_si256((const __m256i *)in_buffer);
-                                    pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
-                                    fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
-                                    fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
-                                    fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                    fB = _mm256_fmadd_ps(fB, pmul0, padd0);
-                                    fG = _mm256_fmadd_ps(fG, pmul1, padd1);
-                                    fR = _mm256_fmadd_ps(fR, pmul2, padd2);
-                                    tempB = _mm256_cvtps_ph(fB, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                    tempG = _mm256_cvtps_ph(fG, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                    tempR = _mm256_cvtps_ph(fR, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
-                                    _mm_storeu_si128((__m128i *)B_buf_16, tempB);
-                                    _mm_storeu_si128((__m128i *)G_buf_16, tempG);
-                                    _mm_storeu_si128((__m128i *)R_buf_16, tempR);
-                                    B_buf_16 += 8;
-                                    G_buf_16 += 8;
-                                    R_buf_16 += 8;
-                                    in_buffer += 24;
-                                }
-                                for (; i < channel_size; i++, in_buffer += 3) {
-                                    *B_buf_16++ = (half)(in_buffer[0] * multiplier0) + offset0;
-                                    *G_buf_16++ = (half)(in_buffer[1] * multiplier1) + offset1;
-                                    *R_buf_16++ = (half)(in_buffer[2] * multiplier2) + offset2;
+                                for (int row = 0; row < max_height; row++) {
+                                    unsigned char *in_buffer_row = reinterpret_cast<unsigned char *>(in_buffer) + (row * input_width_stride);
+                                    int col = 0;
+                                    for (; col < alignedLength; col += 8) {
+                                        __m256i pix0 = _mm256_loadu_si256((const __m256i *)in_buffer_row);
+                                        pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
+                                        fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
+                                        fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
+                                        fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
+                                        fB = _mm256_fmadd_ps(fB, pmul0, padd0);
+                                        fG = _mm256_fmadd_ps(fG, pmul1, padd1);
+                                        fR = _mm256_fmadd_ps(fR, pmul2, padd2);
+                                        tempB = _mm256_cvtps_ph(fB, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+                                        tempG = _mm256_cvtps_ph(fG, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+                                        tempR = _mm256_cvtps_ph(fR, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
+                                        _mm_storeu_si128((__m128i *)B_buf_16, tempB);
+                                        _mm_storeu_si128((__m128i *)G_buf_16, tempG);
+                                        _mm_storeu_si128((__m128i *)R_buf_16, tempR);
+                                        B_buf_16 += 8;
+                                        G_buf_16 += 8;
+                                        R_buf_16 += 8;
+                                        in_buffer_row += 24;
+                                    }
+                                    for (; col < max_width; col++, in_buffer_row += 3) {
+                                        *B_buf_16++ = (half)(in_buffer_row[0] * multiplier0) + offset0;
+                                        *G_buf_16++ = (half)(in_buffer_row[1] * multiplier1) + offset1;
+                                        *R_buf_16++ = (half)(in_buffer_row[2] * multiplier2) + offset2;
+                                    }
                                 }
 #else
                                 for (unsigned channel_idx = 0; channel_idx < c; channel_idx++) {
@@ -808,7 +820,7 @@ MasterGraph::copy_output(unsigned char *out_ptr, size_t out_size_in_bytes) {
         //  to avoid unnecessary sequence of synchronizations
 
         // get_read_buffers() calls block_if_empty() internally and blocks if buffers are empty until a new batch is processed
-        auto output_buffers = _ring_buffer.get_read_buffers();
+        auto output_buffers = _ring_buffer.get_read_buffers().first;
         auto out_image_idx = output_buffers.size();
         for (auto &&output_handle : output_buffers) {
             bool sync_flag = (--out_image_idx == 0) ? CL_TRUE : CL_FALSE;
@@ -831,7 +843,7 @@ MasterGraph::copy_output(unsigned char *out_ptr, size_t out_size_in_bytes) {
 
         // get_read_buffers() calls block_if_empty() internally and blocks if buffers are empty until a new batch is processed
         size_t dest_buf_offset = 0;
-        auto output_buffers = _ring_buffer.get_read_buffers();
+        auto output_buffers = _ring_buffer.get_read_buffers().first;
         for (auto &&output_handle : output_buffers) {
             hipError_t err = hipMemcpyDtoHAsync((void *)(out_ptr + dest_buf_offset), output_handle, size, _device.resources()->hip_stream);
             if (err) {
@@ -846,7 +858,7 @@ MasterGraph::copy_output(unsigned char *out_ptr, size_t out_size_in_bytes) {
     } else {
 #endif
         // get_read_buffer is blocking if _ring_buffer is empty, and blocks this thread till internal processing thread process a new batch and store in the _ring_buffer
-        auto output_buffer = _ring_buffer.get_read_buffers()[0];
+        auto output_buffer = _ring_buffer.get_read_buffers().first[0];
         memcpy(out_ptr, output_buffer, size);
 #if ENABLE_OPENCL || ENABLE_HIP
     }
@@ -857,10 +869,13 @@ MasterGraph::copy_output(unsigned char *out_ptr, size_t out_size_in_bytes) {
 
 TensorList *
 MasterGraph::get_output_tensors() {
-    auto output_ptr = _ring_buffer.get_read_buffers();
-    for (unsigned i = 0; i < _internal_tensor_list.size(); i++)
+    auto read_buffers = _ring_buffer.get_read_buffers();  // pair of <output buffer handles, ROI pointers> for the slot being read
+    auto &output_ptr = read_buffers.first;  // bind by reference: read_buffers already holds its own copy of the vector
+    auto &roi_ptr = read_buffers.second;
+    for (unsigned i = 0; i < _internal_tensor_list.size(); i++) {
         _output_tensor_list[i]->set_mem_handle(output_ptr[i]);
-
+        _output_tensor_list[i]->set_roi(roi_ptr[i]);  // attach the ROI pointer with the same index to the same output tensor
+    }
     return &_output_tensor_list;
 }
 
@@ -880,6 +895,7 @@ void MasterGraph::output_routine() {
             _rb_block_if_full_time.start();
             // _ring_buffer.get_write_buffers() is blocking and blocks here until user uses processed image by calling run() and frees space in the ring_buffer
             auto write_buffers = _ring_buffer.get_write_buffers();
+            auto write_output_buffers = write_buffers.first;
             _rb_block_if_full_time.end();
 
             // Swap handles on the input tensor, so that new tensor is loaded to be processed
@@ -904,7 +920,7 @@ void MasterGraph::output_routine() {
 
             // Swap handles on the output tensor, so that new processed tensor will be written to the a new buffer
             for (size_t idx = 0; idx < _internal_tensor_list.size(); idx++)
-                _internal_tensor_list[idx]->swap_handle(write_buffers[idx]);
+                _internal_tensor_list[idx]->swap_handle(write_output_buffers[idx]);
 
             if (!_processing)
                 break;
@@ -931,6 +947,10 @@ void MasterGraph::output_routine() {
             _process_time.start();
             _graph->process();
             _process_time.end();
+
+            auto write_roi_buffers = write_buffers.second;  // Obtain ROI buffers from ring buffer
+            for (size_t idx = 0; idx < _internal_tensor_list.size(); idx++)
+                _internal_tensor_list[idx]->copy_roi(write_roi_buffers[idx]);  // Copy ROI from internal tensor's buffer to ring buffer
             _bencode_time.start();
             if (_is_box_encoder) {
                 auto bbox_encode_write_buffers = _ring_buffer.get_box_encode_write_buffers();
@@ -984,13 +1004,15 @@ void MasterGraph::stop_processing() {
         _output_thread.join();
 }
 
-std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType metadata_type, bool ltrb_bbox, bool is_box_encoder, float sigma, unsigned pose_output_width, unsigned pose_output_height) {
+std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType metadata_type, bool ltrb_bbox, bool is_box_encoder, bool avoid_class_remapping, bool aspect_ratio_grouping, float sigma, unsigned pose_output_width, unsigned pose_output_height) {
     if (_meta_data_reader)
         THROW("A metadata reader has already been created")
     if (_augmented_meta_data)
         THROW("Metadata output already defined, there can only be a single output for metadata augmentation");
 
     MetaDataConfig config(metadata_type, reader_type, source_path, std::map<std::string, std::string>(), std::string());
+    config.set_avoid_class_remapping(avoid_class_remapping);
+    config.set_aspect_ratio_grouping(aspect_ratio_grouping);
     config.set_out_img_width(pose_output_width);
     config.set_out_img_height(pose_output_height);
     _meta_data_graph = create_meta_data_graph(config);
@@ -1409,7 +1431,7 @@ MasterGraph::copy_out_tensor_planar(void *out_ptr, RocalTensorlayout format, flo
         float offset[3] = {offset0, offset1, offset2};
         size_t dest_buf_offset = 0;
 
-        auto output_buffers = _ring_buffer.get_read_buffers();
+        auto output_buffers = _ring_buffer.get_read_buffers().first;
 
         for (auto &&out_tensor : output_buffers) {
             for (unsigned batch = 0; batch < n; batch++) {
diff --git a/rocAL/source/pipeline/ring_buffer.cpp b/rocAL/source/pipeline/ring_buffer.cpp
index 4dad4e7a5..9d7a798e9 100644
--- a/rocAL/source/pipeline/ring_buffer.cpp
+++ b/rocAL/source/pipeline/ring_buffer.cpp
@@ -22,11 +22,11 @@ THE SOFTWARE.
 
 #include "ring_buffer.h"
 
-#include <device_manager.h>
-
 RingBuffer::RingBuffer(unsigned buffer_depth) : BUFF_DEPTH(buffer_depth),
                                                 _dev_sub_buffer(buffer_depth),
                                                 _host_sub_buffers(buffer_depth),
+                                                _dev_roi_buffers(buffer_depth),
+                                                _host_roi_buffers(buffer_depth),
                                                 _dev_bbox_buffer(buffer_depth),
                                                 _dev_labels_buffer(buffer_depth) {
     reset();
@@ -50,11 +50,13 @@ void RingBuffer::block_if_full() {
         _wait_for_unload.wait(lock);
     }
 }
-std::vector<void *> RingBuffer::get_read_buffers() {
+
+std::pair<std::vector<void *>, std::vector<unsigned *>> RingBuffer::get_read_buffers() {  // returns copies of {data buffer handles, ROI buffer pointers} stored at _read_ptr
     block_if_empty();
     if ((_mem_type == RocalMemType::OCL) || (_mem_type == RocalMemType::HIP))
-        return _dev_sub_buffer[_read_ptr];
-    return _host_sub_buffers[_read_ptr];
+        return std::make_pair(_dev_sub_buffer[_read_ptr], _dev_roi_buffers[_read_ptr]);  // OCL/HIP memory type: entries from the _dev_* vectors
+
+    return std::make_pair(_host_sub_buffers[_read_ptr], _host_roi_buffers[_read_ptr]);  // any other memory type: entries from the _host_* vectors
 }
 
 std::pair<void *, void *> RingBuffer::get_box_encode_read_buffers() {
@@ -64,12 +66,12 @@ std::pair<void *, void *> RingBuffer::get_box_encode_read_buffers() {
     return std::make_pair(_host_meta_data_buffers[_read_ptr][1], _host_meta_data_buffers[_read_ptr][0]);
 }
 
-std::vector<void *> RingBuffer::get_write_buffers() {
+std::pair<std::vector<void *>, std::vector<unsigned *>> RingBuffer::get_write_buffers() {  // returns copies of {data buffer handles, ROI buffer pointers} stored at _write_ptr
     block_if_full();
     if ((_mem_type == RocalMemType::OCL) || (_mem_type == RocalMemType::HIP))
-        return _dev_sub_buffer[_write_ptr];
+        return std::make_pair(_dev_sub_buffer[_write_ptr], _dev_roi_buffers[_write_ptr]);  // OCL/HIP memory type: entries from the _dev_* vectors
 
-    return _host_sub_buffers[_write_ptr];
+    return std::make_pair(_host_sub_buffers[_write_ptr], _host_roi_buffers[_write_ptr]);  // any other memory type: entries from the _host_* vectors
 }
 
 std::pair<void *, void *> RingBuffer::get_box_encode_write_buffers() {
@@ -109,7 +111,7 @@ void RingBuffer::unblock_writer() {
     _wait_for_unload.notify_all();
 }
 
-void RingBuffer::init(RocalMemType mem_type, void *devres, std::vector<size_t> &sub_buffer_size) {
+void RingBuffer::init(RocalMemType mem_type, void *devres, std::vector<size_t> &sub_buffer_size, size_t roi_buffer_size) {
     _mem_type = mem_type;
     _dev = devres;
     _sub_buffer_size = sub_buffer_size;
@@ -152,6 +154,7 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, std::vector<size_t> &
 
         for (size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) {
             _dev_sub_buffer[buffIdx].resize(sub_buffer_count);
+            _dev_roi_buffers[buffIdx].resize(sub_buffer_count);
             for (unsigned sub_idx = 0; sub_idx < sub_buffer_count; sub_idx++) {
                 hipError_t err = hipMalloc(&_dev_sub_buffer[buffIdx][sub_idx], _sub_buffer_size[sub_idx]);
                 // printf("allocated HIP device buffer <%d, %d, %d, %p>\n", buffIdx, sub_idx, _sub_buffer_size[sub_idx], _dev_sub_buffer[buffIdx][sub_idx]);
@@ -160,6 +163,11 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, std::vector<size_t> &
                     THROW("hipMalloc of size " + TOSTR(_sub_buffer_size[sub_idx]) + " index " + TOSTR(sub_idx) +
                           " failed " + TOSTR(err));
                 }
+                err = hipHostMalloc((void **)&_dev_roi_buffers[buffIdx][sub_idx], roi_buffer_size, hipHostMallocDefault);  // Allocate HIP page locked ROI buffers
+                if (err != hipSuccess || !_dev_roi_buffers[buffIdx][sub_idx]) {
+                    _dev_roi_buffers.clear();
+                    THROW("hipHostMalloc of size " + TOSTR(roi_buffer_size) + " failed " + TOSTR(err))
+                }
             }
         }
     } else {
@@ -167,8 +175,11 @@ void RingBuffer::init(RocalMemType mem_type, void *devres, std::vector<size_t> &
         for (size_t buffIdx = 0; buffIdx < BUFF_DEPTH; buffIdx++) {
             // a minimum of extra MEM_ALIGNMENT is allocated
             _host_sub_buffers[buffIdx].resize(sub_buffer_count);
-            for (size_t sub_buff_idx = 0; sub_buff_idx < sub_buffer_count; sub_buff_idx++)
+            _host_roi_buffers[buffIdx].resize(sub_buffer_count);
+            for (size_t sub_buff_idx = 0; sub_buff_idx < sub_buffer_count; sub_buff_idx++) {
                 _host_sub_buffers[buffIdx][sub_buff_idx] = aligned_alloc(MEM_ALIGNMENT, MEM_ALIGNMENT * (_sub_buffer_size[sub_buff_idx] / MEM_ALIGNMENT + 1));
+                _host_roi_buffers[buffIdx][sub_buff_idx] = static_cast<unsigned *>(malloc(roi_buffer_size));  // Allocate HOST ROI buffers
+            }
         }
 #if ENABLE_OPENCL || ENABLE_HIP
     }
@@ -287,6 +298,11 @@ void RingBuffer::release_gpu_res() {
                         // printf("Error Freeing device buffer <%d, %d, %p>\n", buffIdx, sub_buf_idx, _dev_sub_buffer[buffIdx][sub_buf_idx]);
                         ERR("Could not release hip memory in the ring buffer")
                     }
+                if (_dev_roi_buffers[buffIdx][sub_buf_idx]) {
+                    if (hipHostFree((void *)_dev_roi_buffers[buffIdx][sub_buf_idx]) != hipSuccess) {
+                        ERR("Could not release hip memory for ROI in the ring buffer")
+                    }
+                }
             }
             if (_host_meta_data_buffers.size() != 0) {
                 for (unsigned sub_buf_idx = 0; sub_buf_idx < _host_meta_data_buffers[buffIdx].size(); sub_buf_idx++) {
@@ -297,6 +313,7 @@ void RingBuffer::release_gpu_res() {
         }
         _dev_sub_buffer.clear();
         _host_meta_data_buffers.clear();
+        _dev_roi_buffers.clear();
     }
 #elif ENABLE_OPENCL
         if (_mem_type == RocalMemType::OCL) {
@@ -325,6 +342,8 @@ RingBuffer::~RingBuffer() {
             for (unsigned sub_buf_idx = 0; sub_buf_idx < _host_sub_buffers[buffIdx].size(); sub_buf_idx++) {
                 if (_host_sub_buffers[buffIdx][sub_buf_idx])
                     free(_host_sub_buffers[buffIdx][sub_buf_idx]);
+                if (_host_roi_buffers[buffIdx][sub_buf_idx])
+                    free(_host_roi_buffers[buffIdx][sub_buf_idx]);
             }
             if (_host_meta_data_buffers.size() != 0) {
                 for (unsigned sub_buf_idx = 0; sub_buf_idx < _host_meta_data_buffers[buffIdx].size(); sub_buf_idx++) {
@@ -335,6 +354,7 @@ RingBuffer::~RingBuffer() {
         }
         _host_sub_buffers.clear();
         _host_meta_data_buffers.clear();
+        _host_roi_buffers.clear();
     }
 }
 
diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index 2cebfd0e0..08e73b5ee 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -107,9 +107,14 @@ bool operator==(const TensorInfo &rhs, const TensorInfo &lhs) {
 }
 
 void TensorInfo::reset_tensor_roi_buffers() {
-    if (!_roi_buf) {
-        size_t roi_size = (_layout == RocalTensorlayout::NFCHW || _layout == RocalTensorlayout::NFHWC) ? _dims[0] * _dims[1] : _batch_size;  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
-        allocate_host_or_pinned_mem((void **)&_roi_buf, roi_size * 4 * sizeof(unsigned), _mem_type);
+    size_t roi_size = (_layout == RocalTensorlayout::NFCHW || _layout == RocalTensorlayout::NFHWC) ? _dims[0] * _dims[1] : _batch_size;  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
+    allocate_host_or_pinned_mem((void **)&_roi_buf, roi_size * 4 * sizeof(unsigned), _mem_type);
+    if (_mem_type == RocalMemType::HIP) {
+#if ENABLE_HIP
+        _roi.reset(_roi_buf, hipHostFree);
+#endif
+    } else {
+        _roi.reset(_roi_buf, free);
     }
     if (_is_image) {
         auto roi = get_roi();
@@ -172,46 +177,6 @@ TensorInfo::TensorInfo(std::vector<size_t> dims,
     set_max_shape();
 }
 
-TensorInfo::TensorInfo(const TensorInfo &other) {
-    _type = other._type;
-    _num_of_dims = other._num_of_dims;
-    _dims = other._dims;
-    _strides = other._strides;
-    _batch_size = other._batch_size;
-    _mem_type = other._mem_type;
-    _roi_type = other._roi_type;
-    _data_type = other._data_type;
-    _layout = other._layout;
-    _color_format = other._color_format;
-    _data_type_size = other._data_type_size;
-    _data_size = other._data_size;
-    _max_shape = other._max_shape;
-    _is_image = other._is_image;
-    _is_metadata = other._is_metadata;
-    _channels = other._channels;
-    if (!other.is_metadata()) {  // For Metadata ROI buffer is not required
-        allocate_host_or_pinned_mem(&_roi_buf, _batch_size * 4 * sizeof(unsigned), _mem_type);
-        memcpy((void *)_roi_buf, (const void *)other.get_roi(), _batch_size * 4 * sizeof(unsigned));
-    }
-}
-
-TensorInfo::~TensorInfo() {
-    if (!_is_metadata) {
-        if (_mem_type == RocalMemType::HIP) {
-#if ENABLE_HIP
-            if (_roi_buf) {
-                hipError_t err = hipHostFree(_roi_buf);
-                if (err != hipSuccess)
-                    ERR("hipHostFree failed " + TOSTR(err));
-            }
-#endif
-        } else {
-            if (_roi_buf) free(_roi_buf);
-        }
-        _roi_buf = nullptr;
-    }
-}
-
 void Tensor::update_tensor_roi(const std::vector<uint32_t> &width,
                                const std::vector<uint32_t> &height) {
     if (_info.is_image()) {
diff --git a/rocAL/source/readers/image/coco_file_source_reader.cpp b/rocAL/source/readers/image/coco_file_source_reader.cpp
index 636421a29..22eb63b02 100644
--- a/rocAL/source/readers/image/coco_file_source_reader.cpp
+++ b/rocAL/source/readers/image/coco_file_source_reader.cpp
@@ -85,9 +85,53 @@ Reader::Status COCOFileSourceReader::initialize(ReaderConfig desc) {
             replicate_last_batch_to_pad_partial_shard();
         }
     }
-    // shuffle dataset if set
-    if (ret == Reader::Status::OK && _shuffle)
-        std::random_shuffle(_file_names.begin(), _file_names.end());
+
+    if (_meta_data_reader && _meta_data_reader->aspect_ratio_grouping()) {
+        // calculate the aspect ratio for each file and create a pair of <filename, aspect_ratio>
+        std::vector<std::pair<std::string, float>> file_aspect_ratio_pair(_file_names.size());
+        for (size_t i = 0; i < _file_names.size(); i++) {
+            auto filename = _file_names[i];
+            std::string base_filename = filename.substr(filename.find_last_of("/\\") + 1);
+            auto img_size = _meta_data_reader->lookup_image_size(base_filename);
+            auto aspect_ratio = static_cast<float>(img_size.h) / img_size.w;
+            file_aspect_ratio_pair[i] = std::make_pair(filename, aspect_ratio);
+            _aspect_ratios.push_back(aspect_ratio);
+        };
+
+        // sort the <filename, aspect_ratio> pairs according to aspect ratios
+        std::sort(file_aspect_ratio_pair.begin(), file_aspect_ratio_pair.end(), [](auto &lop, auto &rop) { return lop.second < rop.second; });
+
+        // extract sorted file_names
+        std::transform(file_aspect_ratio_pair.begin(), file_aspect_ratio_pair.end(), std::back_inserter(_sorted_file_names), [](auto &pair) { return pair.first; });
+        // extract sorted aspect ratios
+        _aspect_ratios.clear();
+        std::transform(file_aspect_ratio_pair.begin(), file_aspect_ratio_pair.end(), std::back_inserter(_aspect_ratios), [](auto &pair) { return pair.second; });
+
+        // Copy the sorted file_names to _file_names vector to be used in sharding
+        _file_names = _sorted_file_names;
+        // Calculate the mid element which divides the aspect ratios into two groups (<=1.0 and >1.0)
+        auto mid = std::upper_bound(_aspect_ratios.begin(), _aspect_ratios.end(), 1.0f) - _aspect_ratios.begin();
+
+        // shuffle dataset if set
+        if (ret == Reader::Status::OK && _shuffle) {
+            // Shuffle within groups using the mid element as the limit - [start, mid) and [mid, last)
+            std::random_shuffle(_file_names.begin(), _file_names.begin() + mid);
+            std::random_shuffle(_file_names.begin() + mid, _file_names.end());
+            std::vector<std::string> shuffled_filenames;
+            int split_count = _file_names.size() / _batch_count;  // Number of batches for this shard
+            std::vector<int> indexes(split_count);
+            std::iota(indexes.begin(), indexes.end(), 0);
+            // Shuffle the index vector and use the index to fetch batch size elements for decoding
+            std::random_shuffle(indexes.begin(), indexes.end());
+            for (auto const idx : indexes)
+                shuffled_filenames.insert(shuffled_filenames.end(), _file_names.begin() + idx * _batch_count, _file_names.begin() + idx * _batch_count + _batch_count);
+            _file_names = shuffled_filenames;
+        }
+    } else {
+        // shuffle dataset if set
+        if (ret == Reader::Status::OK && _shuffle)
+            std::random_shuffle(_file_names.begin(), _file_names.end());
+    }
     return ret;
 }
 
@@ -173,8 +217,27 @@ int COCOFileSourceReader::release() {
 }
 
 void COCOFileSourceReader::reset() {
-    if (_shuffle)
+    if (_meta_data_reader && _meta_data_reader->aspect_ratio_grouping()) {  // aspect-ratio grouping requested by the metadata reader
+        _file_names = _sorted_file_names;  // start again from the aspect-ratio-sorted list built in initialize()
+        // Index of the first aspect ratio > 1.0; splits the sorted list into two groups (<=1.0 and >1.0)
+        auto mid = std::upper_bound(_aspect_ratios.begin(), _aspect_ratios.end(), 1.0f) - _aspect_ratios.begin();
+        if (_shuffle) {
+            // Shuffle within groups using the mid element as the limit - [start, mid) and [mid, last)
+            std::random_shuffle(_file_names.begin(), _file_names.begin() + mid);
+            std::random_shuffle(_file_names.begin() + mid, _file_names.end());
+            std::vector<std::string> shuffled_filenames;
+            int split_count = _file_names.size() / _batch_count;  // Whole batches only (integer division). NOTE(review): files past split_count * _batch_count are not copied below - confirm the list is always a multiple of _batch_count
+            std::vector<int> indexes(split_count);
+            std::iota(indexes.begin(), indexes.end(), 0);
+            // Shuffle the batch indexes, then append each batch's _batch_count consecutive files in that order
+            std::random_shuffle(indexes.begin(), indexes.end());
+            for (auto const idx : indexes)
+                shuffled_filenames.insert(shuffled_filenames.end(), _file_names.begin() + idx * _batch_count, _file_names.begin() + idx * _batch_count + _batch_count);
+            _file_names = std::move(shuffled_filenames);  // move instead of copy: shuffled_filenames is not used afterwards
+        }
+    } else if (_shuffle) {
         std::random_shuffle(_file_names.begin(), _file_names.end());
+    }
     _read_counter = 0;
     _curr_file_idx = 0;
 }
diff --git a/rocAL_pybind/amd/rocal/pipeline.py b/rocAL_pybind/amd/rocal/pipeline.py
index dca380e09..6c2dc579d 100644
--- a/rocAL_pybind/amd/rocal/pipeline.py
+++ b/rocAL_pybind/amd/rocal/pipeline.py
@@ -146,6 +146,11 @@ def define_graph(self):
     def get_handle(self):
         return self._handle
 
+    def copyToExternalTensor(self, array,  multiplier, offset, reverse_channels, tensor_format, tensor_dtype, max_height=0, max_width=0):
+        """Call b.rocalToTensor with the address from array.data_ptr() as destination; multiplier and offset are indexed [0..2] (one value per channel)."""
+        b.rocalToTensor(self._handle, ctypes.c_void_p(array.data_ptr()), tensor_format, tensor_dtype,  # array must expose data_ptr() - presumably a torch tensor, TODO confirm
+                                    multiplier[0], multiplier[1], multiplier[2], offset[0], offset[1], offset[2], (1 if reverse_channels else 0), self._output_memory_type, max_height, max_width)
+
     def get_one_hot_encoded_labels(self, array, device):
         if device == "cpu":
             if (isinstance(array, np.ndarray)):
@@ -212,6 +217,12 @@ def get_bounding_box_labels(self):
     def get_bounding_box_cords(self):
         return b.getBoundingBoxCords(self._handle)
 
+    def get_mask_count(self, array):  # thin wrapper: passes this pipeline's handle and `array` to b.getMaskCount and returns its result
+        return b.getMaskCount(self._handle, array)
+
+    def get_mask_coordinates(self, array_count, array):  # array_count and array are forwarded positionally to the binding's polygon_size and mask_count parameters
+        return b.getMaskCoordinates(self._handle, array_count, array)
+    
     def get_image_labels(self):
         return b.getImageLabels(self._handle)
 
@@ -224,6 +235,9 @@ def get_encoded_boxes_and_lables(self, batch_size, num_anchors):
     def get_img_sizes(self, array):
         return b.getImgSizes(self._handle, array)
 
+    def get_roi_img_sizes(self, array):  # thin wrapper: passes this pipeline's handle and `array` to b.getROIImgSizes, which writes through the array's buffer
+        return b.getROIImgSizes(self._handle, array)
+    
     def get_image_name_length(self, idx):
         return b.getImageNameLen(self._handle, idx)
 
diff --git a/rocAL_pybind/amd/rocal/readers.py b/rocAL_pybind/amd/rocal/readers.py
index d669d588f..9f28a1996 100644
--- a/rocAL_pybind/amd/rocal/readers.py
+++ b/rocAL_pybind/amd/rocal/readers.py
@@ -29,7 +29,7 @@
 
 
 def coco(annotations_file='', ltrb=True, masks=False, ratio=False, avoid_class_remapping=False,
-         pixelwise_masks=False, is_box_encoder=False, is_box_iou_matcher=False, stick_to_shard=False, pad_last_batch=False):
+         pixelwise_masks=False, is_box_encoder=False, is_box_iou_matcher=False, aspect_ratio_grouping=False, stick_to_shard=False, pad_last_batch=False):
     """!Creates a COCOReader node.
 
         @param annotations_file         Path to the COCO annotations file.
@@ -40,6 +40,7 @@ def coco(annotations_file='', ltrb=True, masks=False, ratio=False, avoid_class_r
         @param pixelwise_masks          Whether to read mask data and generate pixel-wise masks.
         @param is_box_encoder           Whether to enable box encoder in the pipeline.
         @param is_box_iou_matcher       Whether to enable box IOU matcher in the pipeline.
+        @param aspect_ratio_grouping    Whether to enable aspect ratio grouping in the pipeline.
         @param stick_to_shard           Determines whether the reader should stick to a data shard instead of going through the entire dataset.
         @param pad_last_batch           If set to True, pads the shard by repeating the last sample.
 
@@ -54,7 +55,9 @@ def coco(annotations_file='', ltrb=True, masks=False, ratio=False, avoid_class_r
         "is_output": True,
         "mask": masks,
         "ltrb": ltrb,
-        "is_box_encoder": is_box_encoder}
+        "is_box_encoder": is_box_encoder,
+        "avoid_class_remapping": avoid_class_remapping,
+        "aspect_ratio_grouping": aspect_ratio_grouping}
     meta_data = b.cocoReader(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (meta_data, labels, bboxes)
diff --git a/rocAL_pybind/amd/rocal/types.py b/rocAL_pybind/amd/rocal/types.py
index 6edd9a9c9..4409f5b19 100644
--- a/rocAL_pybind/amd/rocal/types.py
+++ b/rocAL_pybind/amd/rocal/types.py
@@ -79,6 +79,7 @@
 from rocal_pybind.types import SCALING_MODE_STRETCH
 from rocal_pybind.types import SCALING_MODE_NOT_SMALLER
 from rocal_pybind.types import SCALING_MODE_NOT_LARGER
+from rocal_pybind.types import SCALING_MODE_MIN_MAX
 
 #     RocalResizeInterpolationType
 from rocal_pybind.types import NEAREST_NEIGHBOR_INTERPOLATION
@@ -141,6 +142,7 @@
     SCALING_MODE_STRETCH: ("SCALING_MODE_STRETCH", SCALING_MODE_STRETCH),
     SCALING_MODE_NOT_SMALLER: ("SCALING_MODE_NOT_SMALLER", SCALING_MODE_NOT_SMALLER),
     SCALING_MODE_NOT_LARGER: ("SCALING_MODE_NOT_LARGER", SCALING_MODE_NOT_LARGER),
+    SCALING_MODE_MIN_MAX: ("SCALING_MODE_MIN_MAX", SCALING_MODE_MIN_MAX),
 
 }
 
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 3c83772ad..e774f1f21 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -84,6 +84,18 @@ py::object wrapper_image_name(RocalContext context, int array_len) {
     return py::bytes(s);
 }
 
+py::object wrapper_copy_to_tensor(RocalContext context, py::object p,  // exposed to Python as "rocalToTensor"; p carries the destination pointer
+                                  RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type, float multiplier0,
+                                  float multiplier1, float multiplier2, float offset0, float offset1, float offset2,
+                                  bool reverse_channels, RocalOutputMemType output_mem_type, int max_height, int max_width) {
+    auto ptr = ctypes_void_ptr(p);  // helper defined outside this hunk; presumably unwraps the ctypes.c_void_p built in pipeline.py - TODO confirm
+    // Forward all arguments unchanged to rocalToTensor. NOTE(review): status is stored but never checked, so a failed call still returns None to Python
+    int status = rocalToTensor(context, ptr, tensor_format, tensor_output_type, multiplier0,
+                               multiplier1, multiplier2, offset0, offset1, offset2,
+                               reverse_channels, output_mem_type, max_height, max_width);
+    return py::cast<py::none>(Py_None);  // always returns Python None, whatever status holds
+}
+
 std::unordered_map<int, std::string> rocalToPybindLayout = {
     {0, "NHWC"},
     {1, "NCHW"},
@@ -290,6 +302,7 @@ PYBIND11_MODULE(rocal_pybind, m) {
         .value("SCALING_MODE_STRETCH", ROCAL_SCALING_MODE_STRETCH)
         .value("SCALING_MODE_NOT_SMALLER", ROCAL_SCALING_MODE_NOT_SMALLER)
         .value("SCALING_MODE_NOT_LARGER", ROCAL_SCALING_MODE_NOT_LARGER)
+        .value("SCALING_MODE_MIN_MAX", ROCAL_SCALING_MODE_MIN_MAX)
         .export_values();
     py::enum_<RocalResizeInterpolationType>(types_m, "RocalResizeInterpolationType", "Decode size policies")
         .value("NEAREST_NEIGHBOR_INTERPOLATION", ROCAL_NEAREST_NEIGHBOR_INTERPOLATION)
@@ -362,6 +375,11 @@ PYBIND11_MODULE(rocal_pybind, m) {
         int *ptr = static_cast<int *>(buf.ptr);
         rocalGetImageSizes(context, ptr);
     });
+    m.def("getROIImgSizes", [](RocalContext context, py::array_t<int> array) {
+        auto buf = array.request();
+        int *ptr = static_cast<int *>(buf.ptr);
+        rocalGetROIImageSizes(context, ptr);
+    });
     // rocal_api_parameter.h
     m.def("setSeed", &rocalSetSeed);
     m.def("getSeed", &rocalGetSeed);
@@ -382,6 +400,7 @@ PYBIND11_MODULE(rocal_pybind, m) {
     m.def("getIntValue", &rocalGetIntValue);
     m.def("getFloatValue", &rocalGetFloatValue);
     // rocal_api_data_transfer.h
+    m.def("rocalToTensor", &wrapper_copy_to_tensor);
     m.def("getOutputTensors", [](RocalContext context) {
         rocalTensorList *output_tensor_list = rocalGetOutputTensors(context);
         py::list list;
@@ -435,6 +454,41 @@ PYBIND11_MODULE(rocal_pybind, m) {
         }
         return boxes_list;
     });
+    m.def("getMaskCount", [](RocalContext context, py::array_t<int> array) {
+        auto buf = array.mutable_data();
+        unsigned count = rocalGetMaskCount(context, buf);  // total number of polygons in complete batch
+        return count;
+    });
+    m.def("getMaskCoordinates", [](RocalContext context, py::array_t<int> polygon_size, py::array_t<int> mask_count) {
+        auto buf = polygon_size.request();
+        int *polygon_size_ptr = static_cast<int *>(buf.ptr);
+        // call pure C++ function; the result is returned as nested Python lists: batch entry -> object -> polygon -> float values
+        rocalTensorList *mask_data = rocalGetMaskCoordinates(context, polygon_size_ptr);
+        rocalTensorList *bbox_labels = rocalGetBoundingBoxLabel(context);
+        py::list complete_list;
+        int poly_cnt = 0;         // running index into polygon_size_ptr; never reset between batch entries
+        int prev_object_cnt = 0;  // objects consumed by earlier batch entries; start offset into mask_count_ptr
+        auto mask_count_buf = mask_count.request();
+        int *mask_count_ptr = static_cast<int *>(mask_count_buf.ptr);
+        for (int i = 0; i < bbox_labels->size(); i++) {  // nbatchSize
+            float *mask_buffer = static_cast<float *>(mask_data->at(i)->buffer());  // read cursor for entry i, advanced polygon by polygon below
+            py::list poly_batch_list;  // one element per object of entry i
+            for (unsigned j = prev_object_cnt; j < bbox_labels->at(i)->dims().at(0) + prev_object_cnt; j++) {  // dims().at(0) is used as the object count of entry i
+                py::list single_image;  // NOTE: despite the name, this holds the polygons of ONE object (index j)
+                for (int k = 0; k < mask_count_ptr[j]; k++) {  // mask_count_ptr[j] polygons for object j
+                    py::list polygons_buffer;  // values of a single polygon
+                    for (int l = 0; l < polygon_size_ptr[poly_cnt]; l++)  // polygon_size_ptr[poly_cnt] floats in this polygon
+                        polygons_buffer.append(mask_buffer[l]);
+                    mask_buffer += polygon_size_ptr[poly_cnt++];  // step past this polygon and move on to the next size entry
+                    single_image.append(polygons_buffer);
+                }
+                poly_batch_list.append(single_image);
+            }
+            prev_object_cnt += bbox_labels->at(i)->dims().at(0);
+            complete_list.append(poly_batch_list);
+        }
+        return complete_list;
+    });
     // Will be enabled when IOU matcher changes are introduced in C++
     // m.def("getMatchedIndices", [](RocalContext context) {
     //     rocalTensorList *matches = rocalGetMatchedIndices(context);

From 416b6f516f4d7cc58f90706a0377a7d9a5633c6f Mon Sep 17 00:00:00 2001
From: fgladwin <fgladwin@amd.com>
Date: Sun, 15 Oct 2023 13:13:13 -0400
Subject: [PATCH 02/33] Fix build issue

---
 rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp b/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
index 8a24dc68c..d2229d22a 100644
--- a/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
+++ b/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
@@ -77,8 +77,8 @@ void ResizeMirrorNormalizeMetaNode::update_parameters(pMetaDataBatch input_meta_
         }
         // get roi width and height of output image
         auto img_roi_size = input_meta_data->get_img_roi_sizes_batch()[i];
-        img_roi_size.w = output_roi[i].x2;
-        img_roi_size.h = output_roi[i].y2;
+        img_roi_size.w = output_roi[i].xywh.w;
+        img_roi_size.h = output_roi[i].xywh.h;
         output_meta_data->get_img_roi_sizes_batch()[i] = img_roi_size;
         output_meta_data->get_bb_cords_batch()[i] = bb_coords;
         output_meta_data->get_labels_batch()[i] = bb_labels;

From d2f8fab28be9b6d9d03b88fa1b7c4bf47ea9013e Mon Sep 17 00:00:00 2001
From: fgladwin <fgladwin@amd.com>
Date: Sun, 15 Oct 2023 13:32:16 -0400
Subject: [PATCH 03/33] Add box IOU matcher changes

Add pybind changes for IOU matcher
Remove BoundingBoxCordf
---
 rocAL/include/api/rocal_api_meta_data.h       |  22 ++-
 rocAL/include/meta_data/bounding_box_graph.h  |  18 +--
 rocAL/include/meta_data/meta_data_graph.h     |   1 +
 rocAL/include/pipeline/master_graph.h         |  11 +-
 rocAL/source/api/rocal_api_meta_data.cpp      |  32 ++++-
 rocAL/source/meta_data/bounding_box_graph.cpp | 128 +++++++++++++-----
 rocAL/source/pipeline/master_graph.cpp        |  43 +++++-
 rocAL_pybind/amd/rocal/fn.py                  |   2 +-
 rocAL_pybind/amd/rocal/readers.py             |   3 +-
 rocAL_pybind/rocal_pybind.cpp                 |  25 ++--
 10 files changed, 217 insertions(+), 68 deletions(-)

diff --git a/rocAL/include/api/rocal_api_meta_data.h b/rocAL/include/api/rocal_api_meta_data.h
index 5c5a305dd..d339944c7 100644
--- a/rocAL/include/api/rocal_api_meta_data.h
+++ b/rocAL/include/api/rocal_api_meta_data.h
@@ -81,9 +81,10 @@ extern "C" RocalMetaData ROCAL_API_CALL rocalCreateTFReaderDetection(RocalContex
  * \param [in] is_box_encoder If set to True, bboxes are returned as encoded bboxes using the anchors
  * \param [in] avoid_class_remapping If set to True, classes are returned directly. Otherwise, classes are mapped to consecutive values
  * \param [in] aspect_ratio_grouping If set to True, images are sorted by their aspect ratio and returned
+ * \param [in] is_box_iou_matcher If set to True, box iou matcher which returns matched indices is enabled in the pipeline
  * \return RocalMetaData object, can be used to inquire about the rocal's output (processed) tensors
  */
-extern "C" RocalMetaData ROCAL_API_CALL rocalCreateCOCOReader(RocalContext rocal_context, const char* source_path, bool is_output, bool mask = false, bool ltrb = true, bool is_box_encoder = false, bool avoid_class_remapping = false, bool aspect_ratio_grouping = false);
+extern "C" RocalMetaData ROCAL_API_CALL rocalCreateCOCOReader(RocalContext rocal_context, const char* source_path, bool is_output, bool mask = false, bool ltrb = true, bool is_box_encoder = false, bool avoid_class_remapping = false, bool aspect_ratio_grouping = false, bool is_box_iou_matcher = false);
 
 /*! \brief create coco reader key points
  * \ingroup group_rocal_meta_data
@@ -296,4 +297,23 @@ extern "C" void ROCAL_API_CALL rocalGetImageId(RocalContext p_context, int* buf)
  */
 extern "C" void ROCAL_API_CALL rocalGetJointsDataPtr(RocalContext p_context, RocalJointsData** joints_data);
 
+/*! \brief API to enable box IOU matcher and pass required params to pipeline
+ * \ingroup group_rocal_meta_data
+ * \param [in] p_context rocAL context
+ * \param [in] anchors The anchors / ground truth bounding box coordinates
+ * \param [in] criteria Threshold IoU for matching bounding boxes with anchors.
+ * \param [in] high_threshold The max threshold for IOU
+ * \param [in] low_threshold The min threshold for IOU
+ * \param [in] allow_low_quality_matches bool value when set to true allows low quality matches
+ */
+extern "C" void ROCAL_API_CALL rocalBoxIouMatcher(RocalContext p_context, std::vector<float>& anchors, float criteria,
+                                                  float high_threshold, float low_threshold, bool allow_low_quality_matches = true);
+
+/*! \brief API to return the matched indices for the bounding box and anchors
+ * \ingroup group_rocal_meta_data
+ * \param [in] rocal_context rocAL context
+ * \return RocalTensorList of matched indices
+ */
+extern "C" RocalTensorList ROCAL_API_CALL rocalGetMatchedIndices(RocalContext rocal_context);
+
 #endif  // MIVISIONX_ROCAL_API_META_DATA_H
diff --git a/rocAL/include/meta_data/bounding_box_graph.h b/rocAL/include/meta_data/bounding_box_graph.h
index 76e2cf5fe..6591e536f 100644
--- a/rocAL/include/meta_data/bounding_box_graph.h
+++ b/rocAL/include/meta_data/bounding_box_graph.h
@@ -26,22 +26,7 @@ THE SOFTWARE.
 #include "meta_data_graph.h"
 #include "meta_node.h"
 
-typedef struct {
-    float xc;
-    float yc;
-    float w;
-    float h;
-} BoundingBoxCord_xcycwh;
-typedef struct {
-    float l;
-    float t;
-    float r;
-    float b;
-} BoundingBoxCord_ltrb;
-typedef union {
-    BoundingBoxCord_xcycwh xcycwh;
-    BoundingBoxCord_ltrb ltrb;
-} BoundingBoxCordf;  // Union comprises of float bbox cords of ltrb/xcycwh type
+typedef  struct { float xc; float yc; float w; float h; } BoundingBoxCord_xcycwh;
 
 class BoundingBoxGraph : public MetaDataGraph {
    public:
@@ -49,4 +34,5 @@ class BoundingBoxGraph : public MetaDataGraph {
     void update_meta_data(pMetaDataBatch meta_data, decoded_image_info decode_image_info) override;
     void update_random_bbox_meta_data(pMetaDataBatch input_meta_data, pMetaDataBatch output_meta_data, decoded_image_info decoded_image_info, crop_image_info crop_image_info) override;
     void update_box_encoder_meta_data(std::vector<float> *anchors, pMetaDataBatch full_batch_meta_data, float criteria, bool offset, float scale, std::vector<float> &means, std::vector<float> &stds, float *encoded_boxes_data, int *encoded_labels_data) override;
+    void update_box_iou_matcher(std::vector<float> *anchors, int *matches_idx_buffer, pMetaDataBatch full_batch_meta_data, float criteria, float high_threshold, float low_threshold, bool allow_low_quality_matches) override;
 };
diff --git a/rocAL/include/meta_data/meta_data_graph.h b/rocAL/include/meta_data/meta_data_graph.h
index b66c5d15a..735b3b9ab 100644
--- a/rocAL/include/meta_data/meta_data_graph.h
+++ b/rocAL/include/meta_data/meta_data_graph.h
@@ -37,5 +37,6 @@ class MetaDataGraph {
     virtual void update_meta_data(pMetaDataBatch meta_data, decoded_image_info decoded_image_info) = 0;
     virtual void update_random_bbox_meta_data(pMetaDataBatch input_meta_data, pMetaDataBatch output_meta_data, decoded_image_info decoded_image_info, crop_image_info crop_image_info) = 0;
     virtual void update_box_encoder_meta_data(std::vector<float> *anchors, pMetaDataBatch full_batch_meta_data, float criteria, bool offset, float scale, std::vector<float> &means, std::vector<float> &stds, float *encoded_boxes_data, int *encoded_labels_data) = 0;
+    virtual void update_box_iou_matcher(std::vector<float> *anchors, int *matches_idx_buffer, pMetaDataBatch full_batch_meta_data, float criteria, float high_threshold, float low_threshold, bool allow_low_quality_matches) = 0;
     std::list<std::shared_ptr<MetaNode>> _meta_nodes;
 };
diff --git a/rocAL/include/pipeline/master_graph.h b/rocAL/include/pipeline/master_graph.h
index dd5662c93..28d442e34 100644
--- a/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/include/pipeline/master_graph.h
@@ -50,6 +50,7 @@ THE SOFTWARE.
 #define BBOX_COUNT 4
 #define MAX_NUM_ANCHORS 8732  // Num of bbox achors used in SSD training
 #define MAX_MASK_BUFFER 10000
+#define MAX_ANCHORS 120087  // Num of bbox anchors used in Retinanet training
 
 #if ENABLE_SIMD
 #if _WIN32
@@ -107,18 +108,20 @@ class MasterGraph {
     std::vector<rocalTensorList *> create_label_reader(const char *source_path, MetaDataReaderType reader_type);
     std::vector<rocalTensorList *> create_video_label_reader(const char *source_path, MetaDataReaderType reader_type, unsigned sequence_length, unsigned frame_step, unsigned frame_stride, bool file_list_frame_num = true);
     std::vector<rocalTensorList *> create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType label_type, bool ltrb_bbox = true, bool is_box_encoder = false,
-                                                                bool avoid_class_remapping = false, bool aspect_ratio_grouping = false, float sigma = 0.0, unsigned pose_output_width = 0, unsigned pose_output_height = 0);
+                                                                bool avoid_class_remapping = false, bool aspect_ratio_grouping = false, bool is_box_iou_matcher = false, float sigma = 0.0, unsigned pose_output_width = 0, unsigned pose_output_height = 0);
     std::vector<rocalTensorList *> create_tf_record_meta_data_reader(const char *source_path, MetaDataReaderType reader_type, MetaDataType label_type, const std::map<std::string, std::string> feature_key_map);
     std::vector<rocalTensorList *> create_caffe_lmdb_record_meta_data_reader(const char *source_path, MetaDataReaderType reader_type, MetaDataType label_type);
     std::vector<rocalTensorList *> create_caffe2_lmdb_record_meta_data_reader(const char *source_path, MetaDataReaderType reader_type, MetaDataType label_type);
     std::vector<rocalTensorList *> create_cifar10_label_reader(const char *source_path, const char *file_prefix);
     std::vector<rocalTensorList *> create_mxnet_label_reader(const char *source_path, bool is_output);
     void box_encoder(std::vector<float> &anchors, float criteria, const std::vector<float> &means, const std::vector<float> &stds, bool offset, float scale);
+    void box_iou_matcher(std::vector<float> &anchors, float criteria, float high_threshold, float low_threshold, bool allow_low_quality_matches);
     void create_randombboxcrop_reader(RandomBBoxCrop_MetaDataReaderType reader_type, RandomBBoxCrop_MetaDataType label_type, bool all_boxes_overlap, bool no_crop, FloatParam *aspect_ratio, bool has_shape, int crop_width, int crop_height, int num_attempts, FloatParam *scaling, int total_num_attempts, int64_t seed = 0);
     const std::pair<ImageNameBatch, pMetaDataBatch> &meta_data();
     TensorList *labels_meta_data();
     TensorList *bbox_meta_data();
     TensorList *mask_meta_data();
+    TensorList *matched_index_meta_data();
     void set_loop(bool val) { _loop = val; }
     void set_output(Tensor *output_tensor);
     size_t calculate_cpu_num_threads(size_t shard_count);
@@ -164,6 +167,7 @@ class MasterGraph {
     TensorList _labels_tensor_list;
     TensorList _bbox_tensor_list;
     TensorList _mask_tensor_list;
+    TensorList _matches_tensor_list;
     std::vector<size_t> _meta_data_buffer_size;
 #if ENABLE_HIP
     DeviceManagerHip _device;                                                     //!< Keeps the device related constructs needed for running on GPU
@@ -204,6 +208,11 @@ class MasterGraph {
     bool _offset;                                                                 // Returns normalized offsets ((encoded_bboxes*scale - anchors*scale) - mean) / stds in EncodedBBoxes that use std and the mean and scale arguments if offset="True"
     std::vector<float> _means, _stds;                                             //_means:  [x y w h] mean values for normalization _stds: [x y w h] standard deviations for offset normalization.
     bool _augmentation_metanode = false;
+    // box IoU matcher variables
+    bool _is_box_iou_matcher = false; // bool variable to set the box iou matcher
+    float _high_threshold = 0.5f;    // Max IoU threshold
+    float _low_threshold = 0.4f;     // Min IoU threshold
+    bool _allow_low_quality_matches = true; // Set to true to include low quality matches in matched idx generation
 #if ENABLE_HIP
     BoxEncoderGpu *_box_encoder_gpu = nullptr;
 #endif
diff --git a/rocAL/source/api/rocal_api_meta_data.cpp b/rocAL/source/api/rocal_api_meta_data.cpp
index 0eaf89958..34beb9a5f 100644
--- a/rocAL/source/api/rocal_api_meta_data.cpp
+++ b/rocAL/source/api/rocal_api_meta_data.cpp
@@ -71,14 +71,14 @@ RocalMetaData
 
 RocalMetaData
     ROCAL_API_CALL
-    rocalCreateCOCOReader(RocalContext p_context, const char* source_path, bool is_output, bool mask, bool ltrb, bool is_box_encoder, bool avoid_class_remapping, bool aspect_ratio_grouping) {
+    rocalCreateCOCOReader(RocalContext p_context, const char* source_path, bool is_output, bool mask, bool ltrb, bool is_box_encoder, bool avoid_class_remapping, bool aspect_ratio_grouping, bool is_box_iou_matcher) {
     if (!p_context)
         THROW("Invalid rocal context passed to rocalCreateCOCOReader")
     auto context = static_cast<Context*>(p_context);
     if (mask) {
-        return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::PolygonMask, ltrb, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping);
+        return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::PolygonMask, ltrb, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping, is_box_iou_matcher);
     }
-    return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::BoundingBox, ltrb, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping);
+    return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_META_DATA_READER, MetaDataType::BoundingBox, ltrb, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping, is_box_iou_matcher);
 }
 
 RocalMetaData
@@ -88,7 +88,7 @@ RocalMetaData
         THROW("Invalid rocal context passed to rocalCreateCOCOReaderKeyPoints")
     auto context = static_cast<Context*>(p_context);
 
-    return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_KEY_POINTS_META_DATA_READER, MetaDataType::KeyPoints, sigma, pose_output_width, pose_output_height);
+    return context->master_graph->create_coco_meta_data_reader(source_path, is_output, MetaDataReaderType::COCO_KEY_POINTS_META_DATA_READER, MetaDataType::KeyPoints, false, false, false, false, false, sigma, pose_output_width, pose_output_height);  // bools: ltrb_bbox, is_box_encoder, avoid_class_remapping, aspect_ratio_grouping, is_box_iou_matcher
 }
 
 RocalMetaData
@@ -488,3 +488,27 @@ void
 
     *joints_data = (RocalJointsData*)(&(meta_data.second->get_joints_data_batch()));
 }
+
+void
+    ROCAL_API_CALL
+    rocalBoxIouMatcher(RocalContext p_context,
+                       std::vector<float>& anchors,
+                       float criteria, float high_threshold,
+                       float low_threshold,
+                       bool allow_low_quality_matches) {
+    if (!p_context)
+        THROW("Invalid rocal context passed to rocalBoxIouMatcher")
+    auto context = static_cast<Context*>(p_context);
+    context->master_graph->box_iou_matcher(anchors, criteria, high_threshold,
+                                           low_threshold,
+                                           allow_low_quality_matches);
+}
+
+RocalTensorList
+    ROCAL_API_CALL
+    rocalGetMatchedIndices(RocalContext p_context) {
+    if (!p_context)
+        THROW("Invalid rocal context passed to rocalGetMatchedIndices")
+    auto context = static_cast<Context*>(p_context);
+    return context->master_graph->matched_index_meta_data();
+}
diff --git a/rocAL/source/meta_data/bounding_box_graph.cpp b/rocAL/source/meta_data/bounding_box_graph.cpp
index 2c6b2d105..9e7d19d72 100644
--- a/rocAL/source/meta_data/bounding_box_graph.cpp
+++ b/rocAL/source/meta_data/bounding_box_graph.cpp
@@ -58,13 +58,13 @@ void BoundingBoxGraph::update_meta_data(pMetaDataBatch input_meta_data, decoded_
     }
 }
 
-inline float ssd_BBoxIntersectionOverUnion(const BoundingBoxCord &box1, const float &box1_area, const BoundingBoxCordf &box2) {
-    float xA = std::max(static_cast<float>(box1.l), box2.ltrb.l);
-    float yA = std::max(static_cast<float>(box1.t), box2.ltrb.t);
-    float xB = std::min(static_cast<float>(box1.r), box2.ltrb.r);
-    float yB = std::min(static_cast<float>(box1.b), box2.ltrb.b);
+inline float ssd_BBoxIntersectionOverUnion(const BoundingBoxCord &box1, const float &box1_area, const BoundingBoxCord &box2) {
+    float xA = std::max(static_cast<float>(box1.l), box2.l);
+    float yA = std::max(static_cast<float>(box1.t), box2.t);
+    float xB = std::min(static_cast<float>(box1.r), box2.r);
+    float yB = std::min(static_cast<float>(box1.b), box2.b);
     float intersection_area = std::max((float)0.0, xB - xA) * std::max((float)0.0, yB - yA);
-    float box2_area = (box2.ltrb.b - box2.ltrb.t) * (box2.ltrb.r - box2.ltrb.l);
+    float box2_area = (box2.b - box2.t) * (box2.r - box2.l);
     return (float)(intersection_area / (box1_area + box2_area - intersection_area));
 }
 
@@ -116,7 +116,7 @@ void BoundingBoxGraph::update_random_bbox_meta_data(pMetaDataBatch input_meta_da
     }
 }
 
-inline void calculate_ious_for_box(float *ious, BoundingBoxCord &box, BoundingBoxCordf *anchors, unsigned int num_anchors) {
+inline void calculate_ious_for_box(float *ious, BoundingBoxCord &box, BoundingBoxCord *anchors, unsigned int num_anchors) {
     float box_area = (box.b - box.t) * (box.r - box.l);
     ious[0] = ssd_BBoxIntersectionOverUnion(box, box_area, anchors[0]);
 
@@ -149,13 +149,13 @@ inline int find_best_box_for_anchor(unsigned anchor_idx, const std::vector<float
 void BoundingBoxGraph::update_box_encoder_meta_data(std::vector<float> *anchors, pMetaDataBatch full_batch_meta_data, float criteria, bool offset, float scale, std::vector<float> &means, std::vector<float> &stds, float *encoded_boxes_data, int *encoded_labels_data) {
 #pragma omp parallel for
     for (int i = 0; i < full_batch_meta_data->size(); i++) {
-        BoundingBoxCordf *bbox_anchors = reinterpret_cast<BoundingBoxCordf *>(anchors->data());
+        BoundingBoxCord *bbox_anchors = reinterpret_cast<BoundingBoxCord *>(anchors->data());
         auto bb_count = full_batch_meta_data->get_labels_batch()[i].size();
         int *bb_labels = full_batch_meta_data->get_labels_batch()[i].data();
         BoundingBoxCord *bb_coords = reinterpret_cast<BoundingBoxCord *>(full_batch_meta_data->get_bb_cords_batch()[i].data());
         unsigned anchors_size = anchors->size() / 4;  // divide the anchors_size by 4 to get the total number of anchors
         int *encoded_labels = encoded_labels_data + (i * anchors_size);
-        BoundingBoxCordf *encoded_bb = reinterpret_cast<BoundingBoxCordf *>(encoded_boxes_data + (i * anchors_size * 4));
+        BoundingBoxCord_xcycwh *encoded_bb = reinterpret_cast<BoundingBoxCord_xcycwh *>(encoded_boxes_data + (i * anchors_size * 4));
         // Calculate Ious
         // ious size - bboxes count x anchors count
         std::vector<float> ious(bb_count * anchors_size);
@@ -167,36 +167,36 @@ void BoundingBoxGraph::update_box_encoder_meta_data(std::vector<float> *anchors,
         float half_scale = 0.5 * scale;
         // Depending on the matches ->place the best bbox instead of the corresponding anchor_idx in anchor
         for (unsigned anchor_idx = 0; anchor_idx < anchors_size; anchor_idx++) {
-            BoundingBoxCordf box_bestidx, anchor_xcyxwh;
-            BoundingBoxCordf *p_anchor = &bbox_anchors[anchor_idx];
+            BoundingBoxCord_xcycwh box_bestidx, anchor_xcyxwh;
+            BoundingBoxCord *p_anchor = &bbox_anchors[anchor_idx];
             const auto best_idx = find_best_box_for_anchor(anchor_idx, ious, bb_count, anchors_size);
             // Filter matches by criteria
             if (ious[(best_idx * anchors_size) + anchor_idx] > criteria)  // Its a match
             {
                 // Convert the "ltrb" format to "xcycwh"
                 if (offset) {
-                    box_bestidx.xcycwh.xc = (bb_coords[best_idx].l + bb_coords[best_idx].r) * half_scale;  // xc
-                    box_bestidx.xcycwh.yc = (bb_coords[best_idx].t + bb_coords[best_idx].b) * half_scale;  // yc
-                    box_bestidx.xcycwh.w = (bb_coords[best_idx].r - bb_coords[best_idx].l) * scale;        // w
-                    box_bestidx.xcycwh.h = (bb_coords[best_idx].b - bb_coords[best_idx].t) * scale;        // h
+                    box_bestidx.xc = (bb_coords[best_idx].l + bb_coords[best_idx].r) * half_scale;  // xc
+                    box_bestidx.yc = (bb_coords[best_idx].t + bb_coords[best_idx].b) * half_scale;  // yc
+                    box_bestidx.w = (bb_coords[best_idx].r - bb_coords[best_idx].l) * scale;        // w
+                    box_bestidx.h = (bb_coords[best_idx].b - bb_coords[best_idx].t) * scale;        // h
                     // Convert the "ltrb" format to "xcycwh"
-                    anchor_xcyxwh.xcycwh.xc = (p_anchor->ltrb.l + p_anchor->ltrb.r) * half_scale;  // xc
-                    anchor_xcyxwh.xcycwh.yc = (p_anchor->ltrb.t + p_anchor->ltrb.b) * half_scale;  // yc
-                    anchor_xcyxwh.xcycwh.w = (p_anchor->ltrb.r - p_anchor->ltrb.l) * scale;        // w
-                    anchor_xcyxwh.xcycwh.h = (p_anchor->ltrb.b - p_anchor->ltrb.t) * scale;        // h
+                    anchor_xcyxwh.xc = (p_anchor->l + p_anchor->r) * half_scale;  // xc
+                    anchor_xcyxwh.yc = (p_anchor->t + p_anchor->b) * half_scale;  // yc
+                    anchor_xcyxwh.w = (p_anchor->r - p_anchor->l) * scale;        // w
+                    anchor_xcyxwh.h = (p_anchor->b - p_anchor->t) * scale;        // h
                     // Reference for offset calculation between the Ground Truth bounding boxes & anchor boxes in <xc,yc,w,h> format
                     // https://github.com/sgrvinod/a-PyTorch-Tutorial-to-Object-Detection#predictions-vis-%C3%A0-vis-priors
-                    box_bestidx.xcycwh.xc = ((box_bestidx.xcycwh.xc - anchor_xcyxwh.xcycwh.xc) / anchor_xcyxwh.xcycwh.w - means[0]) * inv_stds[0];
-                    box_bestidx.xcycwh.yc = ((box_bestidx.xcycwh.yc - anchor_xcyxwh.xcycwh.yc) / anchor_xcyxwh.xcycwh.h - means[1]) * inv_stds[1];
-                    box_bestidx.xcycwh.w = (std::log(box_bestidx.xcycwh.w / anchor_xcyxwh.xcycwh.w) - means[2]) * inv_stds[2];
-                    box_bestidx.xcycwh.h = (std::log(box_bestidx.xcycwh.h / anchor_xcyxwh.xcycwh.h) - means[3]) * inv_stds[3];
+                    box_bestidx.xc = ((box_bestidx.xc - anchor_xcyxwh.xc) / anchor_xcyxwh.w - means[0]) * inv_stds[0];
+                    box_bestidx.yc = ((box_bestidx.yc - anchor_xcyxwh.yc) / anchor_xcyxwh.h - means[1]) * inv_stds[1];
+                    box_bestidx.w = (std::log(box_bestidx.w / anchor_xcyxwh.w) - means[2]) * inv_stds[2];
+                    box_bestidx.h = (std::log(box_bestidx.h / anchor_xcyxwh.h) - means[3]) * inv_stds[3];
                     encoded_bb[anchor_idx] = box_bestidx;
                     encoded_labels[anchor_idx] = bb_labels[best_idx];
                 } else {
-                    box_bestidx.xcycwh.xc = 0.5 * (bb_coords[best_idx].l + bb_coords[best_idx].r);  // xc
-                    box_bestidx.xcycwh.yc = 0.5 * (bb_coords[best_idx].t + bb_coords[best_idx].b);  // yc
-                    box_bestidx.xcycwh.w = bb_coords[best_idx].r - bb_coords[best_idx].l;           // w
-                    box_bestidx.xcycwh.h = bb_coords[best_idx].b - bb_coords[best_idx].t;           // h
+                    box_bestidx.xc = 0.5 * (bb_coords[best_idx].l + bb_coords[best_idx].r);  // xc
+                    box_bestidx.yc = 0.5 * (bb_coords[best_idx].t + bb_coords[best_idx].b);  // yc
+                    box_bestidx.w = bb_coords[best_idx].r - bb_coords[best_idx].l;           // w
+                    box_bestidx.h = bb_coords[best_idx].b - bb_coords[best_idx].t;           // h
                     encoded_bb[anchor_idx] = box_bestidx;
                     encoded_labels[anchor_idx] = bb_labels[best_idx];
                 }
@@ -207,13 +207,79 @@ void BoundingBoxGraph::update_box_encoder_meta_data(std::vector<float> *anchors,
                     encoded_labels[anchor_idx] = 0;
                 } else {
                     // Convert the "ltrb" format to "xcycwh"
-                    encoded_bb[anchor_idx].xcycwh.xc = 0.5 * (p_anchor->ltrb.l + p_anchor->ltrb.r);  // xc
-                    encoded_bb[anchor_idx].xcycwh.yc = 0.5 * (p_anchor->ltrb.t + p_anchor->ltrb.b);  // yc
-                    encoded_bb[anchor_idx].xcycwh.w = (-p_anchor->ltrb.l + p_anchor->ltrb.r);        // w
-                    encoded_bb[anchor_idx].xcycwh.h = (-p_anchor->ltrb.t + p_anchor->ltrb.b);        // h
+                    encoded_bb[anchor_idx].xc = 0.5 * (p_anchor->l + p_anchor->r);  // xc
+                    encoded_bb[anchor_idx].yc = 0.5 * (p_anchor->t + p_anchor->b);  // yc
+                    encoded_bb[anchor_idx].w = (-p_anchor->l + p_anchor->r);        // w
+                    encoded_bb[anchor_idx].h = (-p_anchor->t + p_anchor->b);        // h
                     encoded_labels[anchor_idx] = 0;
                 }
             }
         }
     }
 }
+
+// Box IoU matcher. For every image i in the batch, fills matches_idx_buffer[i * num_anchors + a] with the
+// index of the bbox that has the highest IoU with anchor a, then rewrites it to -1 when that IoU is below
+// low_threshold or to -2 when it is below high_threshold. With allow_low_quality_matches set, anchors that
+// reach (within 1e-6) the best IoU of any bbox keep their match regardless of the thresholds.
+// `criteria` is not used by this routine.
+void BoundingBoxGraph::update_box_iou_matcher(std::vector<float> *anchors, int *matches_idx_buffer,
+                                              pMetaDataBatch full_batch_meta_data, float criteria, float high_threshold,
+                                              float low_threshold, bool allow_low_quality_matches) {
+    // Bind by reference: taking these by value deep-copied the batch's boxes on every call and for every image
+    const auto &bb_coords_batch = full_batch_meta_data->get_bb_cords_batch();
+    const unsigned anchors_size = anchors->size() / 4;  // divide the anchors_size by 4 to get the total number of anchors
+    const BoundingBoxCord *bbox_anchors = reinterpret_cast<const BoundingBoxCord *>(anchors->data());
+
+    // Each iteration reads only its own image's boxes and writes only its own slice of the output buffer
+#pragma omp parallel for
+    for (int i = 0; i < full_batch_meta_data->size(); i++) {
+        // Offset computed in size_t so batch_index * anchors_size cannot wrap in 32-bit unsigned arithmetic
+        int *image_matches = matches_idx_buffer + static_cast<size_t>(i) * anchors_size;
+        const auto &bb_coords = bb_coords_batch[i];
+        const auto bb_count = bb_coords.size();
+
+        std::vector<float> matched_vals(anchors_size, -1.0f);  // best IoU found so far for each anchor
+        std::vector<int> low_quality_preds(anchors_size, -1);  // != -1 marks anchors exempt from thresholding
+        std::vector<float> bbox_iou(anchors_size);             // IoU of the current bbox with each anchor, reused per bbox
+
+        // Calculate IoU's, The number of IoU Values calculated will be (bb_count x anchors_size)
+        for (unsigned bb_idx = 0; bb_idx < bb_count; bb_idx++) {
+            const BoundingBoxCord &box = bb_coords[bb_idx];
+            const float box_area = (box.b - box.t) * (box.r - box.l);
+            float best_bbox_iou = -1.0f;
+            for (unsigned int anchor_idx = 0; anchor_idx < anchors_size; anchor_idx++) {
+                const float iou_val = ssd_BBoxIntersectionOverUnion(box, box_area, bbox_anchors[anchor_idx]);
+                bbox_iou[anchor_idx] = iou_val;
+
+                // Find col maximum in (bb_count x anchors_size) IoU values calculated
+                if (iou_val > matched_vals[anchor_idx]) {
+                    matched_vals[anchor_idx] = iou_val;
+                    image_matches[anchor_idx] = static_cast<int>(bb_idx);
+                }
+
+                // Find row maximum in (bb_count x anchors_size) IoU values calculated
+                if (allow_low_quality_matches && iou_val > best_bbox_iou) best_bbox_iou = iou_val;
+            }
+
+            if (allow_low_quality_matches) {
+                // Flag every anchor whose IoU with this bbox equals the bbox's best IoU
+                for (unsigned int anchor_idx = 0; anchor_idx < anchors_size; anchor_idx++) {
+                    if (std::fabs(bbox_iou[anchor_idx] - best_bbox_iou) < 1e-6)
+                        low_quality_preds[anchor_idx] = static_cast<int>(anchor_idx);
+                }
+            }
+        }
+
+        // Update matched indices based on thresholds and low quality matches
+        for (unsigned pred_idx = 0; pred_idx < anchors_size; pred_idx++) {
+            if (allow_low_quality_matches && low_quality_preds[pred_idx] != -1)
+                continue;  // flagged low quality match: keep the bbox index untouched
+            if (matched_vals[pred_idx] < low_threshold) {
+                image_matches[pred_idx] = -1;  // best IoU below the low threshold
+            } else if (matched_vals[pred_idx] < high_threshold) {
+                image_matches[pred_idx] = -2;  // best IoU between the low and high thresholds
+            }
+        }
+    }
+}
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index ed65248b1..baa5326d1 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -962,6 +962,10 @@ void MasterGraph::output_routine() {
 #endif
                     _meta_data_graph->update_box_encoder_meta_data(&_anchors, output_meta_data, _criteria, _offset, _scale, _means, _stds, (float *)bbox_encode_write_buffers.first, (int *)bbox_encode_write_buffers.second);
             }
+            if (_is_box_iou_matcher) {
+                int *matches_write_buffer = reinterpret_cast<int *>(_ring_buffer.get_meta_write_buffers()[2]);
+                _meta_data_graph->update_box_iou_matcher(&_anchors, matches_write_buffer, output_meta_data, _criteria, _high_threshold, _low_threshold, _allow_low_quality_matches);
+            }
             _bencode_time.end();
 #ifdef ROCAL_VIDEO
             _sequence_start_framenum_vec.insert(_sequence_start_framenum_vec.begin(), _loader_module->get_sequence_start_frame_number());
@@ -1004,7 +1008,8 @@ void MasterGraph::stop_processing() {
         _output_thread.join();
 }
 
-std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType metadata_type, bool ltrb_bbox, bool is_box_encoder, bool avoid_class_remapping, bool aspect_ratio_grouping, float sigma, unsigned pose_output_width, unsigned pose_output_height) {
+std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const char *source_path, bool is_output, MetaDataReaderType reader_type, MetaDataType metadata_type, bool ltrb_bbox, bool is_box_encoder,
+                                                                         bool avoid_class_remapping, bool aspect_ratio_grouping, bool is_box_iou_matcher, float sigma, unsigned pose_output_width, unsigned pose_output_height) {
     if (_meta_data_reader)
         THROW("A metadata reader has already been created")
     if (_augmented_meta_data)
@@ -1039,6 +1044,13 @@ std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const c
         default_mask_info.set_metadata();
         _meta_data_buffer_size.emplace_back(_user_batch_size * default_mask_info.data_size());
     }
+    if (is_box_iou_matcher) {
+        _is_box_iou_matcher = true;
+        dims = {MAX_ANCHORS};
+        default_matches_info = TensorInfo(std::move(dims), _mem_type, RocalTensorDataType::INT32);  // Create default matches info
+        default_matches_info.set_metadata();
+        _meta_data_buffer_size.emplace_back(_user_batch_size * default_matches_info.data_size());
+    }
 
     for (unsigned i = 0; i < _user_batch_size; i++)  // Create rocALTensorList for each metadata
     {
@@ -1050,12 +1062,18 @@ std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const c
             auto mask_info = default_mask_info;
             _mask_tensor_list.push_back(new Tensor(mask_info));
         }
+        if(is_box_iou_matcher) {
+            auto matches_info = default_matches_info;
+            _matches_tensor_list.push_back(new Tensor(matches_info));
+        }
     }
     _ring_buffer.init_metadata(RocalMemType::HOST, _meta_data_buffer_size);
     _metadata_output_tensor_list.emplace_back(&_labels_tensor_list);
     _metadata_output_tensor_list.emplace_back(&_bbox_tensor_list);
     if (metadata_type == MetaDataType::PolygonMask)
         _metadata_output_tensor_list.emplace_back(&_mask_tensor_list);
+    if(is_box_iou_matcher)
+        _metadata_output_tensor_list.emplace_back(&_matches_tensor_list);
 
     return _metadata_output_tensor_list;
 }
@@ -1347,6 +1365,18 @@ const std::pair<ImageNameBatch, pMetaDataBatch> &MasterGraph::meta_data() {
     return _ring_buffer.get_meta_data();
 }
 
+// Stores the anchors and IoU-matcher settings that output_routine() later forwards to update_box_iou_matcher().
+void MasterGraph::box_iou_matcher(std::vector<float> &anchors, float criteria, float high_threshold, float low_threshold, bool allow_low_quality_matches) {
+    if (!_is_box_iou_matcher)  // the flag is set by create_coco_meta_data_reader() when the matcher is requested
+        THROW("Box IOU matcher variable not set cannot return matched idx")
+    _num_anchors = anchors.size() / 4;  // anchors is a flat list with 4 floats per anchor box
+    _anchors = anchors;
+    _criteria = criteria;  // previously dropped: output_routine() passes _criteria to the matcher, so it must be stored
+    _high_threshold = high_threshold;
+    _low_threshold = low_threshold;
+    _allow_low_quality_matches = allow_low_quality_matches;
+}
+
 size_t MasterGraph::bounding_box_batch_count(pMetaDataBatch meta_data_batch) {
     size_t size = 0;
     for (unsigned i = 0; i < _user_batch_size; i++)
@@ -1396,6 +1426,17 @@ TensorList *MasterGraph::mask_meta_data() {
     return &_mask_tensor_list;
 }
 
+TensorList *MasterGraph::matched_index_meta_data() {  // Rebinds every matches tensor to the current read buffer and returns the list
+    if (_ring_buffer.level() == 0)
+        THROW("No meta data has been loaded")
+    auto meta_data_buffers = reinterpret_cast<unsigned char *>(_ring_buffer.get_meta_read_buffers()[2]);  // Get matches buffer from ring buffer; NOTE(review): slot 2 is hard-coded, yet create_coco_meta_data_reader() appends the mask buffer before the matches buffer for PolygonMask - confirm the index
+    for (unsigned i = 0; i < _matches_tensor_list.size(); i++) {
+        _matches_tensor_list[i]->set_mem_handle(reinterpret_cast<void *>(meta_data_buffers));
+        meta_data_buffers += _matches_tensor_list[i]->info().data_size();  // step past this tensor's data so the next tensor gets the following slice
+    }
+    return &_matches_tensor_list;
+}
+
 void MasterGraph::notify_user_thread() {
     if (_output_routine_finished_processing)
         return;
diff --git a/rocAL_pybind/amd/rocal/fn.py b/rocAL_pybind/amd/rocal/fn.py
index a5b60b62c..9e45cefe0 100644
--- a/rocAL_pybind/amd/rocal/fn.py
+++ b/rocAL_pybind/amd/rocal/fn.py
@@ -1051,7 +1051,7 @@ def box_iou_matcher(*inputs, anchors, criteria=0.5, high_threshold=0.5,
     # pybind call arguments
     kwargs_pybind = {"anchors": anchors, "criteria": criteria, "high_threshold": high_threshold,
                      "low_threshold": low_threshold, "allow_low_quality_matches": allow_low_quality_matches}
-    box_iou_matcher = b.BoxIOUMatcher(
+    box_iou_matcher = b.boxIouMatcher(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     Pipeline._current_pipeline._box_iou_matcher = True
     return (box_iou_matcher, [])
diff --git a/rocAL_pybind/amd/rocal/readers.py b/rocAL_pybind/amd/rocal/readers.py
index 9f28a1996..70e5a25f3 100644
--- a/rocAL_pybind/amd/rocal/readers.py
+++ b/rocAL_pybind/amd/rocal/readers.py
@@ -57,7 +57,8 @@ def coco(annotations_file='', ltrb=True, masks=False, ratio=False, avoid_class_r
         "ltrb": ltrb,
         "is_box_encoder": is_box_encoder,
         "avoid_class_remapping": avoid_class_remapping,
-        "aspect_ratio_grouping": aspect_ratio_grouping}
+        "aspect_ratio_grouping": aspect_ratio_grouping,
+        "is_box_iou_matcher": is_box_iou_matcher}
     meta_data = b.cocoReader(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (meta_data, labels, bboxes)
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 468be6d99..979cdee0e 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -384,7 +384,7 @@ PYBIND11_MODULE(rocal_pybind, m) {
     // rocal_api_meta_data.h
     m.def("randomBBoxCrop", &rocalRandomBBoxCrop);
     m.def("boxEncoder", &rocalBoxEncoder);
-    // m.def("BoxIOUMatcher", &rocalBoxIOUMatcher);  // Will be enabled when IOU matcher changes are introduced in C++
+    m.def("boxIouMatcher", &rocalBoxIouMatcher);
     m.def("getImgSizes", [](RocalContext context, py::array_t<int> array) {
         auto buf = array.request();
         int *ptr = static_cast<int *>(buf.ptr);
@@ -504,17 +504,18 @@ PYBIND11_MODULE(rocal_pybind, m) {
         }
         return complete_list;
     });
-    // Will be enabled when IOU matcher changes are introduced in C++
-    // m.def("getMatchedIndices", [](RocalContext context) {
-    //     rocalTensorList *matches = rocalGetMatchedIndices(context);
-    //     return py::array(py::buffer_info(
-    //                     (int *)(matches->at(0)->buffer()),
-    //                     sizeof(int),
-    //                     py::format_descriptor<int>::format(),
-    //                     1,
-    //                     {matches->size() * 120087},
-    //                     {sizeof(int) }));
-    // }, py::return_value_policy::reference);
+    m.def(
+        "getMatchedIndices", [](RocalContext context) {  // exposes the batch's matched indices to Python as one flat 1-D int array
+            rocalTensorList *matches = rocalGetMatchedIndices(context);
+            return py::array(py::buffer_info(
+                static_cast<int *>(matches->at(0)->buffer()),  // data pointer comes from the first tensor only
+                sizeof(int),
+                py::format_descriptor<int>::format(),
+                1,
+                {matches->size() * matches->at(0)->dims().at(0)},  // length = tensor count * dim 0 of tensor 0; NOTE(review): assumes the remaining tensors follow tensor 0 contiguously - confirm
+                {sizeof(int)}));
+        },
+        py::return_value_policy::reference);
     m.def("rocalGetEncodedBoxesAndLables", [](RocalContext context, uint batch_size, uint num_anchors) {
         auto vec_pair_labels_boxes = rocalGetEncodedBoxesAndLables(context, batch_size * num_anchors);
         auto labels_buf_ptr = static_cast<int *>(vec_pair_labels_boxes[0]->at(0)->buffer());

From 36f852ca5a2f8e155f05a594296cbf4342642260 Mon Sep 17 00:00:00 2001
From: SundarRajan28 <sundarrajan@multicorewareinc.com>
Date: Mon, 16 Oct 2023 08:09:21 +0000
Subject: [PATCH 04/33] Fixing build issues

---
 .../augmentations/node_sequence_rearrange.cpp | 47 ++++++++-----------
 .../meta_node_resize_mirror_normalize.cpp     |  4 +-
 2 files changed, 22 insertions(+), 29 deletions(-)

diff --git a/rocAL/source/augmentations/node_sequence_rearrange.cpp b/rocAL/source/augmentations/node_sequence_rearrange.cpp
index f5e7234e6..d95579484 100644
--- a/rocAL/source/augmentations/node_sequence_rearrange.cpp
+++ b/rocAL/source/augmentations/node_sequence_rearrange.cpp
@@ -20,41 +20,34 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#include <vx_ext_rpp.h>
-#include <VX/vx_compatibility.h>
 #include "node_sequence_rearrange.h"
-#include "exception.h"
 
+#include <VX/vx_compatibility.h>
+#include <vx_ext_rpp.h>
+
+#include "exception.h"
 
-SequenceRearrangeNode::SequenceRearrangeNode(const std::vector<Image *> &inputs, const std::vector<Image *> &outputs) :
-        Node(inputs, outputs)
-{
-}
+SequenceRearrangeNode::SequenceRearrangeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs) {}
 
-void SequenceRearrangeNode::create_node()
-{
-    if(_node)
+void SequenceRearrangeNode::create_node() {  // Creates the OpenVX sequence-rearrange node; returns early if _node already exists
+    if (_node)
         return;
 
     vx_status status;
-    _sequence_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, _new_sequence_length);
-    status = vxAddArrayItems(_sequence_array, _new_sequence_length, _new_order.data(), sizeof(vx_uint32));
-    if(status != VX_SUCCESS)
-        THROW("Adding array items failed: "+ TOSTR(status))
-    _node = vxExtrppNode_SequenceRearrangebatchPD(_graph->get(), _inputs[0]->handle(), _outputs[0]->handle(), _sequence_array, _new_sequence_length, _sequence_length, _sequence_count);
-    if((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
-        THROW("Adding the sequence rearrange (vxExtrppNode_SequenceRearrange) node failed: "+ TOSTR(status))
+    vx_array sequence_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, _new_order.size());  // capacity: one UINT32 item per entry of _new_order
+    status = vxAddArrayItems(sequence_array, _new_order.size(), _new_order.data(), sizeof(vx_uint32));
+    if (status != VX_SUCCESS)
+        THROW("Adding array items failed: " + TOSTR(status));
+    int input_layout = (int)_inputs[0]->info().layout();  // layout of the first input, handed to the node as an INT32 scalar
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    _node = vxExtRppSequenceRearrange(_graph->get(), _inputs[0]->handle(), _outputs[0]->handle(), sequence_array, input_layout_vx);  // NOTE(review): sequence_array and input_layout_vx are local handles with no release call in this function - confirm ownership
+
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the sequence rearrange (vxExtRppSequenceRearrange) node failed: " + TOSTR(status))
 }
 
-void SequenceRearrangeNode::init(unsigned int* new_order, unsigned int new_sequence_length, unsigned int sequence_length, unsigned int sequence_count)
-{
-    _new_sequence_length = new_sequence_length;
-    _sequence_length = sequence_length;
-    _sequence_count = sequence_count;
-    _new_order.resize(_new_sequence_length);
-    std::copy(new_order, new_order + _new_sequence_length, _new_order.begin());
+void SequenceRearrangeNode::init(std::vector<unsigned int> &new_order) {  // Records the frame order that create_node() reads to fill its vx_array
+    _new_order = new_order;  // copy-assigns, so the node keeps its own copy of the order
 }
 
-void SequenceRearrangeNode::update_node()
-{
-}
\ No newline at end of file
+void SequenceRearrangeNode::update_node() {}
diff --git a/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp b/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
index 8a24dc68c..d2229d22a 100644
--- a/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
+++ b/rocAL/source/meta_data/meta_node_resize_mirror_normalize.cpp
@@ -77,8 +77,8 @@ void ResizeMirrorNormalizeMetaNode::update_parameters(pMetaDataBatch input_meta_
         }
         // get roi width and height of output image
         auto img_roi_size = input_meta_data->get_img_roi_sizes_batch()[i];
-        img_roi_size.w = output_roi[i].x2;
-        img_roi_size.h = output_roi[i].y2;
+        img_roi_size.w = output_roi[i].xywh.w;
+        img_roi_size.h = output_roi[i].xywh.h;
         output_meta_data->get_img_roi_sizes_batch()[i] = img_roi_size;
         output_meta_data->get_bb_cords_batch()[i] = bb_coords;
         output_meta_data->get_labels_batch()[i] = bb_labels;

From c87cd2096c18bb66e5e5f8011b4ecf43c7e11afd Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Wed, 25 Oct 2023 14:22:26 +0000
Subject: [PATCH 05/33] Resolving review comments

---
 rocAL/include/meta_data/meta_data_reader.h |  6 +++---
 rocAL/include/pipeline/master_graph.h      |  2 +-
 rocAL/rocAL_hip/rocal_hip_kernels.cpp      | 12 ++++++------
 rocAL/source/api/rocal_api_meta_data.cpp   |  6 ++++--
 rocAL/source/pipeline/master_graph.cpp     | 10 +++++-----
 rocAL_pybind/amd/rocal/pipeline.py         |  3 +--
 rocAL_pybind/rocal_pybind.cpp              | 12 ++++++------
 7 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/rocAL/include/meta_data/meta_data_reader.h b/rocAL/include/meta_data/meta_data_reader.h
index b16722c4e..bdddca51b 100644
--- a/rocAL/include/meta_data/meta_data_reader.h
+++ b/rocAL/include/meta_data/meta_data_reader.h
@@ -60,8 +60,8 @@ struct MetaDataConfig {
     bool _aspect_ratio_grouping;
 
    public:
-    MetaDataConfig(const MetaDataType& type, const MetaDataReaderType& reader_type, const std::string& path, const std::map<std::string, std::string>& feature_key_map = std::map<std::string, std::string>(), const std::string file_prefix = std::string(), const unsigned& sequence_length = 3, const unsigned& frame_step = 3, const unsigned& frame_stride = 1, bool avoid_class_remapping = false)
-        : _type(type), _reader_type(reader_type), _path(path), _feature_key_map(feature_key_map), _file_prefix(file_prefix), _sequence_length(sequence_length), _frame_step(frame_step), _frame_stride(frame_stride), _avoid_class_remapping(avoid_class_remapping) {}
+    MetaDataConfig(const MetaDataType& type, const MetaDataReaderType& reader_type, const std::string& path, const std::map<std::string, std::string>& feature_key_map = std::map<std::string, std::string>(), const std::string file_prefix = std::string(), const unsigned& sequence_length = 3, const unsigned& frame_step = 3, const unsigned& frame_stride = 1)
+        : _type(type), _reader_type(reader_type), _path(path), _feature_key_map(feature_key_map), _file_prefix(file_prefix), _sequence_length(sequence_length), _frame_step(frame_step), _frame_stride(frame_stride) {}
     MetaDataConfig() = delete;
     MetaDataType type() const { return _type; }
     MetaDataReaderType reader_type() const { return _reader_type; }
@@ -83,7 +83,7 @@ struct MetaDataConfig {
 
 class MetaDataReader {
    private:
-    bool _aspect_ratio_grouping = false;
+    bool _aspect_ratio_grouping;
 
    public:
     enum class Status {
diff --git a/rocAL/include/pipeline/master_graph.h b/rocAL/include/pipeline/master_graph.h
index dd5662c93..4ed84bb6e 100644
--- a/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/include/pipeline/master_graph.h
@@ -82,7 +82,7 @@ class MasterGraph {
     Status reset();
     size_t remaining_count();
     MasterGraph::Status to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1, float multiplier2,
-                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, int max_height = 0, int max_width = 0);
+                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, uint max_height = 0, uint max_width = 0);
     Status copy_output(unsigned char *out_ptr, size_t out_size_in_bytes);
     Status copy_out_tensor_planar(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1, float multiplier2,
                                   float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.cpp b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
index 449a8672c..4637ff257 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.cpp
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
@@ -136,6 +136,7 @@ Hip_CopyInt8ToNCHW_fp32(
     const int maxOutW = outDims.y;
     const int img_offset = C * W * H;
     const int out_img_offset = C * maxOutW * maxOutH;
+    unsigned int cstride = maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
@@ -145,7 +146,6 @@ Hip_CopyInt8ToNCHW_fp32(
         // copy float3  pixels to dst
         const uchar *inp_img = &inp_image_u8[n * img_offset];
         float *out_tensor = (float *)output_tensor + n * out_img_offset + dst_buf_offset;
-        unsigned int stride = maxOutW * maxOutH;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -153,8 +153,8 @@ Hip_CopyInt8ToNCHW_fp32(
             else
                 dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
             out_tensor[dstIdx] = dst.x;
-            out_tensor[dstIdx + stride] = dst.y;
-            out_tensor[dstIdx + stride * 2] = dst.z;
+            out_tensor[dstIdx + cstride] = dst.y;
+            out_tensor[dstIdx + cstride * 2] = dst.z;
         } else {
             out_tensor[dstIdx] = (float)inp_img[srcIdx] * multiplier.x + offset.x;
         }
@@ -180,6 +180,7 @@ Hip_CopyInt8ToNCHW_fp16(
     const int maxOutW = outDims.y;
     const int img_offset = C * W * H;
     const int out_img_offset = C * maxOutW * maxOutH;
+    unsigned int cstride = maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
@@ -189,7 +190,6 @@ Hip_CopyInt8ToNCHW_fp16(
         unsigned int srcIdx = (y * W + x) * C;
         // copy float3  pixels to dst
         unsigned int dstIdx = y * maxOutW + x;
-        unsigned int stride = maxOutW * maxOutH;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -197,8 +197,8 @@ Hip_CopyInt8ToNCHW_fp16(
             else
                 dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
             out_tensor[dstIdx] = __float2half(dst.x);
-            out_tensor[dstIdx + stride] = __float2half(dst.y);
-            out_tensor[dstIdx + stride * 2] = __float2half(dst.z);
+            out_tensor[dstIdx + cstride] = __float2half(dst.y);
+            out_tensor[dstIdx + cstride * 2] = __float2half(dst.z);
         } else {
             out_tensor[dstIdx] = __float2half((float)inp_img[srcIdx] * multiplier.x + offset.x);
         }
diff --git a/rocAL/source/api/rocal_api_meta_data.cpp b/rocAL/source/api/rocal_api_meta_data.cpp
index 0eaf89958..ffb68391e 100644
--- a/rocAL/source/api/rocal_api_meta_data.cpp
+++ b/rocAL/source/api/rocal_api_meta_data.cpp
@@ -421,8 +421,10 @@ void
     }
 }
 
-void ROCAL_API_CALL rocalBoxEncoder(RocalContext p_context, std::vector<float>& anchors, float criteria,
-                                    std::vector<float>& means, std::vector<float>& stds, bool offset, float scale) {
+void
+    ROCAL_API_CALL
+    rocalBoxEncoder(RocalContext p_context, std::vector<float>& anchors, float criteria,
+                    std::vector<float>& means, std::vector<float>& stds, bool offset, float scale) {
     if (!p_context)
         THROW("Invalid rocal context passed to rocalBoxEncoder")
     auto context = static_cast<Context*>(p_context);
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index ed65248b1..72c763406 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -452,7 +452,7 @@ MasterGraph::timing() {
 
 MasterGraph::Status
 MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1,
-                       float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, int max_height, int max_width) {
+                       float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, uint max_height, uint max_width) {
     if (no_more_processed_data())
         return MasterGraph::Status::NO_MORE_DATA;
 
@@ -678,9 +678,9 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 int alignedLength = (max_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
-                                for (int row = 0; row < max_height; row++) {
+                                for (uint row = 0; row < max_height; row++) {
                                     unsigned char *in_buffer_row = reinterpret_cast<unsigned char *>(in_buffer) + (row * input_width_stride);
-                                    int col = 0;
+                                    uint col = 0;
                                     for (; col < alignedLength; col += 8) {
                                         __m256i pix0 = _mm256_loadu_si256((const __m256i *)in_buffer_row);
                                         pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
@@ -746,9 +746,9 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
 
                                 __m256 fR, fG, fB;
                                 __m128i tempR, tempG, tempB;
-                                for (int row = 0; row < max_height; row++) {
+                                for (uint row = 0; row < max_height; row++) {
                                     unsigned char *in_buffer_row = reinterpret_cast<unsigned char *>(in_buffer) + (row * input_width_stride);
-                                    int col = 0;
+                                    uint col = 0;
                                     for (; col < alignedLength; col += 8) {
                                         __m256i pix0 = _mm256_loadu_si256((const __m256i *)in_buffer_row);
                                         pix0 = _mm256_permutevar8x32_epi32(pix0, _mm256_setr_epi32(0, 1, 2, 3, 3, 4, 5, 6));
diff --git a/rocAL_pybind/amd/rocal/pipeline.py b/rocAL_pybind/amd/rocal/pipeline.py
index 6c2dc579d..b0ee8f440 100644
--- a/rocAL_pybind/amd/rocal/pipeline.py
+++ b/rocAL_pybind/amd/rocal/pipeline.py
@@ -147,9 +147,8 @@ def get_handle(self):
         return self._handle
 
     def copyToExternalTensor(self, array,  multiplier, offset, reverse_channels, tensor_format, tensor_dtype, max_height=0, max_width=0):
-
         b.rocalToTensor(self._handle, ctypes.c_void_p(array.data_ptr()), tensor_format, tensor_dtype,
-                                    multiplier[0], multiplier[1], multiplier[2], offset[0], offset[1], offset[2], (1 if reverse_channels else 0), self._output_memory_type, max_height, max_width)
+                        multiplier[0], multiplier[1], multiplier[2], offset[0], offset[1], offset[2], (1 if reverse_channels else 0), self._output_memory_type, max_height, max_width)
 
     def get_one_hot_encoded_labels(self, array, device):
         if device == "cpu":
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 468be6d99..835098b3d 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -87,7 +87,7 @@ py::object wrapper_image_name(RocalContext context, int array_len) {
 py::object wrapper_copy_to_tensor(RocalContext context, py::object p,
                                   RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type, float multiplier0,
                                   float multiplier1, float multiplier2, float offset0, float offset1, float offset2,
-                                  bool reverse_channels, RocalOutputMemType output_mem_type, int max_height, int max_width) {
+                                  bool reverse_channels, RocalOutputMemType output_mem_type, uint max_height, uint max_width) {
     auto ptr = ctypes_void_ptr(p);
     // call pure C++ function
     int status = rocalToTensor(context, ptr, tensor_format, tensor_output_type, multiplier0,
@@ -489,15 +489,15 @@ PYBIND11_MODULE(rocal_pybind, m) {
             float *mask_buffer = static_cast<float *>(mask_data->at(i)->buffer());
             py::list poly_batch_list;
             for (unsigned j = prev_object_cnt; j < bbox_labels->at(i)->dims().at(0) + prev_object_cnt; j++) {
-                py::list single_image;
+                py::list polygons_buffer;
                 for (int k = 0; k < mask_count_ptr[j]; k++) {
-                    py::list polygons_buffer;
+                    py::list coords_buffer;
                     for (int l = 0; l < polygon_size_ptr[poly_cnt]; l++)
-                        polygons_buffer.append(mask_buffer[l]);
+                        coords_buffer.append(mask_buffer[l]);
                     mask_buffer += polygon_size_ptr[poly_cnt++];
-                    single_image.append(polygons_buffer);
+                    polygons_buffer.append(coords_buffer);
                 }
-                poly_batch_list.append(single_image);
+                poly_batch_list.append(polygons_buffer);
             }
             prev_object_cnt += bbox_labels->at(i)->dims().at(0);
             complete_list.append(poly_batch_list);

From 8071dfb6efaa597f0a6e482acbecad9e8c85231b Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Wed, 25 Oct 2023 16:53:57 +0000
Subject: [PATCH 06/33] Resolving review comments

---
 .../include/meta_data/coco_meta_data_reader.h |  2 +
 rocAL/include/meta_data/meta_data_reader.h    |  6 +--
 .../readers/image/coco_file_source_reader.h   |  1 +
 rocAL/source/pipeline/master_graph.cpp        |  4 +-
 .../readers/image/coco_file_source_reader.cpp | 49 +++++++------------
 5 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/rocAL/include/meta_data/coco_meta_data_reader.h b/rocAL/include/meta_data/coco_meta_data_reader.h
index 5e5efc67b..bdfab4efb 100644
--- a/rocAL/include/meta_data/coco_meta_data_reader.h
+++ b/rocAL/include/meta_data/coco_meta_data_reader.h
@@ -39,6 +39,8 @@ class COCOMetaDataReader : public MetaDataReader {
     void print_map_contents();
     bool set_timestamp_mode() override { return false; }
     const std::map<std::string, std::shared_ptr<MetaData>>& get_map_content() override { return _map_content; }
+    void set_aspect_ratio_grouping(bool aspect_ratio_grouping) override { _aspect_ratio_grouping = aspect_ratio_grouping; }  // stores the flag in the member inherited from MetaDataReader
+    bool aspect_ratio_grouping() const override { return _aspect_ratio_grouping; }  // NOTE(review): the member is declared without an in-class initializer - confirm the setter always runs before this is read
     COCOMetaDataReader();
 
    private:
diff --git a/rocAL/include/meta_data/meta_data_reader.h b/rocAL/include/meta_data/meta_data_reader.h
index bdddca51b..849603cf6 100644
--- a/rocAL/include/meta_data/meta_data_reader.h
+++ b/rocAL/include/meta_data/meta_data_reader.h
@@ -82,7 +82,7 @@ struct MetaDataConfig {
 };
 
 class MetaDataReader {
-   private:
+   protected:
     bool _aspect_ratio_grouping;
 
    public:
@@ -98,6 +98,6 @@ class MetaDataReader {
     virtual bool exists(const std::string& image_name) = 0;
     virtual bool set_timestamp_mode() = 0;
     virtual ImgSize lookup_image_size(const std::string& image_name) { return {}; }
-    void set_aspect_ratio_grouping(bool aspect_ratio_grouping) { _aspect_ratio_grouping = aspect_ratio_grouping; }
-    bool aspect_ratio_grouping() const { return _aspect_ratio_grouping; }
+    virtual void set_aspect_ratio_grouping(bool aspect_ratio_grouping) { return; }  // base default ignores the flag; COCOMetaDataReader overrides this to store it
+    virtual bool aspect_ratio_grouping() const { return {}; }  // base default returns a value-initialized bool, i.e. false
 };
diff --git a/rocAL/include/readers/image/coco_file_source_reader.h b/rocAL/include/readers/image/coco_file_source_reader.h
index 0e3a11bb8..ffa35caea 100644
--- a/rocAL/include/readers/image/coco_file_source_reader.h
+++ b/rocAL/include/readers/image/coco_file_source_reader.h
@@ -104,4 +104,5 @@ class COCOFileSourceReader : public Reader {
     void incremenet_file_id() { _file_id++; }
     void replicate_last_image_to_fill_last_shard();
     void replicate_last_batch_to_pad_partial_shard();
+    void shuffle_with_aspect_ratios();
 };
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index 72c763406..fb20553af 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -675,7 +675,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 __m256 padd0 = _mm256_set1_ps(offset0);
                                 __m256 padd1 = _mm256_set1_ps(offset1);
                                 __m256 padd2 = _mm256_set1_ps(offset2);
-                                int alignedLength = (max_width & ~7);  // multiple of 8
+                                uint alignedLength = (max_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
                                 for (uint row = 0; row < max_height; row++) {
@@ -742,7 +742,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 __m256 padd0 = _mm256_set1_ps(offset0);
                                 __m256 padd1 = _mm256_set1_ps(offset1);
                                 __m256 padd2 = _mm256_set1_ps(offset2);
-                                int alignedLength = (max_width & ~7);  // multiple of 8
+                                uint alignedLength = (max_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
                                 __m128i tempR, tempG, tempB;
diff --git a/rocAL/source/readers/image/coco_file_source_reader.cpp b/rocAL/source/readers/image/coco_file_source_reader.cpp
index 22eb63b02..ec3f60501 100644
--- a/rocAL/source/readers/image/coco_file_source_reader.cpp
+++ b/rocAL/source/readers/image/coco_file_source_reader.cpp
@@ -109,23 +109,10 @@ Reader::Status COCOFileSourceReader::initialize(ReaderConfig desc) {
 
         // Copy the sorted file_names to _file_names vector to be used in sharding
         _file_names = _sorted_file_names;
-        // Calculate the mid element which divides the aspect ratios into two groups (<=1.0 and >1.0)
-        auto mid = std::upper_bound(_aspect_ratios.begin(), _aspect_ratios.end(), 1.0f) - _aspect_ratios.begin();
 
         // shuffle dataset if set
         if (ret == Reader::Status::OK && _shuffle) {
-            // Shuffle within groups using the mid element as the limit - [start, mid) and [mid, last)
-            std::random_shuffle(_file_names.begin(), _file_names.begin() + mid);
-            std::random_shuffle(_file_names.begin() + mid, _file_names.end());
-            std::vector<std::string> shuffled_filenames;
-            int split_count = _file_names.size() / _batch_count;  // Number of batches for this shard
-            std::vector<int> indexes(split_count);
-            std::iota(indexes.begin(), indexes.end(), 0);
-            // Shuffle the index vector and use the index to fetch batch size elements for decoding
-            std::random_shuffle(indexes.begin(), indexes.end());
-            for (auto const idx : indexes)
-                shuffled_filenames.insert(shuffled_filenames.end(), _file_names.begin() + idx * _batch_count, _file_names.begin() + idx * _batch_count + _batch_count);
-            _file_names = shuffled_filenames;
+            shuffle_with_aspect_ratios();
         }
     } else {
         // shuffle dataset if set
@@ -216,25 +203,27 @@ int COCOFileSourceReader::release() {
     return 0;
 }
 
+void COCOFileSourceReader::shuffle_with_aspect_ratios() {
+    // Reorder _file_names in place. mid = index of the first aspect ratio > 1.0, splitting the entries into two groups (<=1.0 and >1.0); upper_bound presumably relies on _aspect_ratios being sorted - TODO confirm
+    auto mid = std::upper_bound(_aspect_ratios.begin(), _aspect_ratios.end(), 1.0f) - _aspect_ratios.begin();
+    // Shuffle within each group separately, using mid as the boundary - [start, mid) and [mid, last)
+    std::random_shuffle(_file_names.begin(), _file_names.begin() + mid);
+    std::random_shuffle(_file_names.begin() + mid, _file_names.end());
+    std::vector<std::string> shuffled_filenames;
+    int split_count = _file_names.size() / _batch_count;  // Number of whole batches for this shard; NOTE(review): if the size is not a multiple of _batch_count the trailing files are never copied below - confirm the list is padded upstream
+    std::vector<int> indexes(split_count);
+    std::iota(indexes.begin(), indexes.end(), 0);
+    // Shuffle the batch indexes, then append each batch (_batch_count consecutive file names) in the shuffled order for decoding
+    std::random_shuffle(indexes.begin(), indexes.end());
+    for (auto const idx : indexes)
+        shuffled_filenames.insert(shuffled_filenames.end(), _file_names.begin() + idx * _batch_count, _file_names.begin() + idx * _batch_count + _batch_count);
+    _file_names = shuffled_filenames;
+}
+
 void COCOFileSourceReader::reset() {
     if (_meta_data_reader && _meta_data_reader->aspect_ratio_grouping()) {
         _file_names = _sorted_file_names;
-        // Calculate the mid element which divides the aspect ratios into two groups (<=1.0 and >1.0)
-        auto mid = std::upper_bound(_aspect_ratios.begin(), _aspect_ratios.end(), 1.0f) - _aspect_ratios.begin();
-        if (_shuffle) {
-            // Shuffle within groups using the mid element as the limit - [start, mid) and [mid, last)
-            std::random_shuffle(_file_names.begin(), _file_names.begin() + mid);
-            std::random_shuffle(_file_names.begin() + mid, _file_names.end());
-            std::vector<std::string> shuffled_filenames;
-            int split_count = _file_names.size() / _batch_count;  // Number of batches for this shard
-            std::vector<int> indexes(split_count);
-            std::iota(indexes.begin(), indexes.end(), 0);
-            // Shuffle the index vector and use the index to fetch batch size elements for decoding
-            std::random_shuffle(indexes.begin(), indexes.end());
-            for (auto const idx : indexes)
-                shuffled_filenames.insert(shuffled_filenames.end(), _file_names.begin() + idx * _batch_count, _file_names.begin() + idx * _batch_count + _batch_count);
-            _file_names = shuffled_filenames;
-        }
+        if (_shuffle) shuffle_with_aspect_ratios();
     } else if (_shuffle) {
         std::random_shuffle(_file_names.begin(), _file_names.end());
     }

From 5c5a4408ecdf9d322f55c319694155fea532699a Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 26 Oct 2023 18:11:50 +0000
Subject: [PATCH 07/33] Resolving review comments

---
 .../include/meta_data/coco_meta_data_reader.h |   4 +-
 rocAL/include/meta_data/meta_data_reader.h    |   4 +-
 rocAL/rocAL_hip/rocal_hip_kernels.cpp         | 110 +++++++++---------
 rocAL/source/api/rocal_api_augmentation.cpp   |   2 +
 .../meta_data/coco_meta_data_reader.cpp       |   6 +-
 rocAL/source/pipeline/master_graph.cpp        |   9 +-
 .../readers/image/coco_file_source_reader.cpp |   4 +-
 rocAL_pybind/rocal_pybind.cpp                 |   2 +-
 8 files changed, 70 insertions(+), 71 deletions(-)

diff --git a/rocAL/include/meta_data/coco_meta_data_reader.h b/rocAL/include/meta_data/coco_meta_data_reader.h
index bdfab4efb..43b7d293b 100644
--- a/rocAL/include/meta_data/coco_meta_data_reader.h
+++ b/rocAL/include/meta_data/coco_meta_data_reader.h
@@ -40,7 +40,7 @@ class COCOMetaDataReader : public MetaDataReader {
     bool set_timestamp_mode() override { return false; }
     const std::map<std::string, std::shared_ptr<MetaData>>& get_map_content() override { return _map_content; }
     void set_aspect_ratio_grouping(bool aspect_ratio_grouping) override { _aspect_ratio_grouping = aspect_ratio_grouping; }
-    bool aspect_ratio_grouping() const override { return _aspect_ratio_grouping; }
+    bool get_aspect_ratio_grouping() const override { return _aspect_ratio_grouping; }
     COCOMetaDataReader();
 
    private:
@@ -54,7 +54,7 @@ class COCOMetaDataReader : public MetaDataReader {
     std::map<std::string, std::shared_ptr<MetaData>> _map_content;
     std::map<std::string, std::shared_ptr<MetaData>>::iterator _itr;
     std::map<std::string, ImgSize> _map_img_sizes;
-    std::map<int, std::string> _map_img_names;
+    std::map<int, std::string> _map_image_names_to_id;  // Maps image IDs to their image names (key: image ID, value: image name)
     std::map<std::string, ImgSize>::iterator itr;
     std::map<int, int> _label_info;
     std::map<int, int>::iterator _it_label;
diff --git a/rocAL/include/meta_data/meta_data_reader.h b/rocAL/include/meta_data/meta_data_reader.h
index 849603cf6..035fe5411 100644
--- a/rocAL/include/meta_data/meta_data_reader.h
+++ b/rocAL/include/meta_data/meta_data_reader.h
@@ -69,7 +69,7 @@ struct MetaDataConfig {
     std::map<std::string, std::string> feature_key_map() const { return _feature_key_map; }
     std::string file_prefix() const { return _file_prefix; }
     bool class_remapping() const { return _avoid_class_remapping; }
-    bool aspect_ratio_grouping() const { return _aspect_ratio_grouping; }
+    bool get_aspect_ratio_grouping() const { return _aspect_ratio_grouping; }
     unsigned sequence_length() const { return _sequence_length; }
     unsigned frame_step() const { return _frame_step; }
     unsigned frame_stride() const { return _frame_stride; }
@@ -99,5 +99,5 @@ class MetaDataReader {
     virtual bool set_timestamp_mode() = 0;
     virtual ImgSize lookup_image_size(const std::string& image_name) { return {}; }
     virtual void set_aspect_ratio_grouping(bool aspect_ratio_grouping) { return; }
-    virtual bool aspect_ratio_grouping() const { return {}; }
+    virtual bool get_aspect_ratio_grouping() const { return {}; }
 };
diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.cpp b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
index 4637ff257..3475ec13a 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.cpp
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
@@ -32,7 +32,7 @@ Hip_CopyInt8ToNHWC_fp32(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
-    uint2 outDims,
+    uint2 out_dims,
     float3 multiplier,
     float3 offset,
     unsigned int reverse_channels) {
@@ -41,32 +41,32 @@ Hip_CopyInt8ToNHWC_fp32(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = outDims.x;
-    const int maxOutW = outDims.y;
+    const int maxOutH = out_dims.x;
+    const int maxOutW = out_dims.y;
     const int img_offset = C * W * H;
     const int out_img_offset = C * maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        unsigned int srcIdx = (y * W + x) * C;  // src is RGB
-        unsigned int dstIdx = (y * maxOutW + x) * C;
+        unsigned int src_idx = (y * W + x) * C;  // src is RGB
+        unsigned int dst_idx = (y * maxOutW + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
             float3 dst;
             const uchar *inp_img = &inp_image_u8[n * img_offset];
             float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * out_img_offset);
             if (reverse_channels)
-                dst = make_float3((float)inp_img[srcIdx + 2], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx]) * multiplier + offset;
+                dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
             else
-                dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
-            out_tensor[dstIdx] = dst.x;
-            out_tensor[dstIdx + 1] = dst.y;
-            out_tensor[dstIdx + 2] = dst.z;
+                dst = make_float3((float)inp_img[src_idx], (float)inp_img[src_idx + 1], (float)inp_img[src_idx + 2]) * multiplier + offset;
+            out_tensor[dst_idx] = dst.x;
+            out_tensor[dst_idx + 1] = dst.y;
+            out_tensor[dst_idx + 2] = dst.z;
         } else {
             const uchar *inp_img = &inp_image_u8[n * img_offset + dst_buf_offset];
             float *out_tensor = (float *)output_tensor + dst_buf_offset + n * out_img_offset;
-            out_tensor[dstIdx] = (float)inp_img[srcIdx] * multiplier.x + offset.x;
+            out_tensor[dst_idx] = (float)inp_img[src_idx] * multiplier.x + offset.x;
         }
     }
 }
@@ -77,7 +77,7 @@ Hip_CopyInt8ToNHWC_fp16(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
-    uint2 outDims,
+    uint2 out_dims,
     float3 multiplier,
     float3 offset,
     const unsigned int reverse_channels) {
@@ -86,8 +86,8 @@ Hip_CopyInt8ToNHWC_fp16(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = outDims.x;
-    const int maxOutW = outDims.y;
+    const int maxOutH = out_dims.x;
+    const int maxOutW = out_dims.y;
     const int img_offset = C * W * H;
     const int out_img_offset = C * maxOutW * maxOutH;
 
@@ -95,24 +95,24 @@ Hip_CopyInt8ToNHWC_fp16(
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
         __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * out_img_offset;
-        unsigned int srcIdx = (y * W + x) * C;
+        unsigned int src_idx = (y * W + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
-            unsigned int dstIdx = y * maxOutW + x * 3;
+            unsigned int dst_idx = y * maxOutW + x * 3;
             const uchar *inp_img = &inp_image_u8[n * img_offset];
             float3 dst;
             if (reverse_channels)
-                dst = make_float3((float)inp_img[srcIdx + 2], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx]) * multiplier + offset;
+                dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
             else
-                dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
-            out_tensor[dstIdx] = __float2half(dst.x);
-            out_tensor[dstIdx + 1] = __float2half(dst.y);
-            out_tensor[dstIdx + 2] = __float2half(dst.z);
+                dst = make_float3((float)inp_img[src_idx], (float)inp_img[src_idx + 1], (float)inp_img[src_idx + 2]) * multiplier + offset;
+            out_tensor[dst_idx] = __float2half(dst.x);
+            out_tensor[dst_idx + 1] = __float2half(dst.y);
+            out_tensor[dst_idx + 2] = __float2half(dst.z);
         } else {
-            unsigned int dstIdx = y * maxOutW + x;
+            unsigned int dst_idx = y * maxOutW + x;
             const uchar *inp_img = &inp_image_u8[n * img_offset];
             float *out_tensor = (float *)output_tensor + n * out_img_offset;
-            out_tensor[dstIdx] = __float2half((float)inp_img[srcIdx] * multiplier.x + offset.x);
+            out_tensor[dst_idx] = __float2half((float)inp_img[src_idx] * multiplier.x + offset.x);
         }
     }
 }
@@ -123,7 +123,7 @@ Hip_CopyInt8ToNCHW_fp32(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
-    uint2 outDims,
+    uint2 out_dims,
     float3 multiplier,
     float3 offset,
     unsigned int reverse_channels) {
@@ -132,8 +132,8 @@ Hip_CopyInt8ToNCHW_fp32(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = outDims.x;
-    const int maxOutW = outDims.y;
+    const int maxOutH = out_dims.x;
+    const int maxOutW = out_dims.y;
     const int img_offset = C * W * H;
     const int out_img_offset = C * maxOutW * maxOutH;
     unsigned int cstride = maxOutW * maxOutH;
@@ -141,22 +141,22 @@ Hip_CopyInt8ToNCHW_fp32(
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        unsigned int srcIdx = (y * W + x) * C;
-        unsigned int dstIdx = y * maxOutW + x;
+        unsigned int src_idx = (y * W + x) * C;
+        unsigned int dst_idx = y * maxOutW + x;
         // copy float3  pixels to dst
         const uchar *inp_img = &inp_image_u8[n * img_offset];
         float *out_tensor = (float *)output_tensor + n * out_img_offset + dst_buf_offset;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
-                dst = make_float3((float)inp_img[srcIdx + 2], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx]) * multiplier + offset;
+                dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
             else
-                dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
-            out_tensor[dstIdx] = dst.x;
-            out_tensor[dstIdx + cstride] = dst.y;
-            out_tensor[dstIdx + cstride * 2] = dst.z;
+                dst = make_float3((float)inp_img[src_idx], (float)inp_img[src_idx + 1], (float)inp_img[src_idx + 2]) * multiplier + offset;
+            out_tensor[dst_idx] = dst.x;
+            out_tensor[dst_idx + cstride] = dst.y;
+            out_tensor[dst_idx + cstride * 2] = dst.z;
         } else {
-            out_tensor[dstIdx] = (float)inp_img[srcIdx] * multiplier.x + offset.x;
+            out_tensor[dst_idx] = (float)inp_img[src_idx] * multiplier.x + offset.x;
         }
     }
 }
@@ -167,7 +167,7 @@ Hip_CopyInt8ToNCHW_fp16(
     void *output_tensor,
     unsigned int dst_buf_offset,
     uint4 nchw,
-    uint2 outDims,
+    uint2 out_dims,
     float3 multiplier,
     float3 offset,
     const unsigned int reverse_channels) {
@@ -176,8 +176,8 @@ Hip_CopyInt8ToNCHW_fp16(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = outDims.x;
-    const int maxOutW = outDims.y;
+    const int maxOutH = out_dims.x;
+    const int maxOutW = out_dims.y;
     const int img_offset = C * W * H;
     const int out_img_offset = C * maxOutW * maxOutH;
     unsigned int cstride = maxOutW * maxOutH;
@@ -187,20 +187,20 @@ Hip_CopyInt8ToNCHW_fp16(
     for (unsigned int n = 0; n < nchw.x; n++) {
         __half *out_tensor = (__half *)output_tensor + n * out_img_offset + dst_buf_offset;
         const uchar *inp_img = &inp_image_u8[n * img_offset];
-        unsigned int srcIdx = (y * W + x) * C;
+        unsigned int src_idx = (y * W + x) * C;
         // copy float3  pixels to dst
-        unsigned int dstIdx = y * maxOutW + x;
+        unsigned int dst_idx = y * maxOutW + x;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
-                dst = make_float3((float)inp_img[srcIdx + 2], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx]) * multiplier + offset;
+                dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
             else
-                dst = make_float3((float)inp_img[srcIdx], (float)inp_img[srcIdx + 1], (float)inp_img[srcIdx + 2]) * multiplier + offset;
-            out_tensor[dstIdx] = __float2half(dst.x);
-            out_tensor[dstIdx + cstride] = __float2half(dst.y);
-            out_tensor[dstIdx + cstride * 2] = __float2half(dst.z);
+                dst = make_float3((float)inp_img[src_idx], (float)inp_img[src_idx + 1], (float)inp_img[src_idx + 2]) * multiplier + offset;
+            out_tensor[dst_idx] = __float2half(dst.x);
+            out_tensor[dst_idx + cstride] = __float2half(dst.y);
+            out_tensor[dst_idx + cstride * 2] = __float2half(dst.z);
         } else {
-            out_tensor[dstIdx] = __float2half((float)inp_img[srcIdx] * multiplier.x + offset.x);
+            out_tensor[dst_idx] = __float2half((float)inp_img[src_idx] * multiplier.x + offset.x);
         }
     }
 }
@@ -225,18 +225,18 @@ int HipExecCopyInt8ToNHWC(
     const unsigned max_output_height,
     const unsigned max_output_width) {
     int localThreads_x = 16, localThreads_y = 16;
-    uint2 outDims;
+    uint2 out_dims;
     if ((max_output_height == 0) || (max_output_width == 0))
-        outDims = make_uint2(h, w);
+        out_dims = make_uint2(h, w);
     else
-        outDims = make_uint2(max_output_height, max_output_width);
+        out_dims = make_uint2(max_output_height, max_output_width);
     int globalThreads_x = w, globalThreads_y = h;
     if (!fp16) {
         hipLaunchKernelGGL(Hip_CopyInt8ToNHWC_fp32,
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w), outDims,
+                           make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     } else {
@@ -244,7 +244,7 @@ int HipExecCopyInt8ToNHWC(
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w), outDims,
+                           make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     }
@@ -271,18 +271,18 @@ int HipExecCopyInt8ToNCHW(
     const unsigned max_output_height,
     const unsigned max_output_width) {
     int localThreads_x = 16, localThreads_y = 16;
-    uint2 outDims;
+    uint2 out_dims;
     if ((max_output_height == 0) || (max_output_width == 0))
-        outDims = make_uint2(h, w);
+        out_dims = make_uint2(h, w);
     else
-        outDims = make_uint2(max_output_height, max_output_width);
+        out_dims = make_uint2(max_output_height, max_output_width);
     int globalThreads_x = w, globalThreads_y = h;
     if (!fp16) {
         hipLaunchKernelGGL(Hip_CopyInt8ToNCHW_fp32,
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w), outDims,
+                           make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     } else {
@@ -290,7 +290,7 @@ int HipExecCopyInt8ToNCHW(
                            dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
                            dim3(localThreads_x, localThreads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
-                           make_uint4(n, c, h, w), outDims,
+                           make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     }
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index 4137c50b6..0988b58ed 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -554,8 +554,10 @@ RocalTensor ROCAL_API_CALL
     try {
         if ((dest_width | dest_height | resize_longer | resize_shorter) == 0)
             THROW("Atleast one size 'dest_width' or 'dest_height' or 'resize_shorter' or 'resize_longer' must be specified")
+        // dest_width/dest_height may be combined with resize_shorter/resize_longer only in ROCAL_SCALING_MODE_MIN_MAX - for any other scaling mode, this throws an error
         if ((dest_width | dest_height) && (resize_longer | resize_shorter) && (scaling_mode != RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX))
             THROW("Only one method of specifying size can be used \ndest_width and/or dest_height\nresize_shorter\nresize_longer")
+        // resize_shorter and resize_longer may be specified together only in ROCAL_SCALING_MODE_MIN_MAX - for any other scaling mode, this throws an error
         if (resize_longer && resize_shorter && scaling_mode != RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX)
             THROW("'resize_longer' and 'resize_shorter' can only be passed together for min max scaling mode")
 
diff --git a/rocAL/source/meta_data/coco_meta_data_reader.cpp b/rocAL/source/meta_data/coco_meta_data_reader.cpp
index d0ddea904..8a23120ad 100644
--- a/rocAL/source/meta_data/coco_meta_data_reader.cpp
+++ b/rocAL/source/meta_data/coco_meta_data_reader.cpp
@@ -34,7 +34,7 @@ using namespace std;
 void COCOMetaDataReader::init(const MetaDataConfig &cfg, pMetaDataBatch meta_data_batch) {
     _path = cfg.path();
     _avoid_class_remapping = cfg.class_remapping();
-    this->set_aspect_ratio_grouping(cfg.aspect_ratio_grouping());
+    this->set_aspect_ratio_grouping(cfg.get_aspect_ratio_grouping());
     _output = meta_data_batch;
     _output->set_metadata_type(cfg.type());
 }
@@ -194,7 +194,7 @@ void COCOMetaDataReader::read_all(const std::string &path) {
                         parser.SkipValue();
                     }
                 }
-                _map_img_names.insert(pair<int, std::string>(image_id, image_name));
+                _map_image_names_to_id.insert(pair<int, std::string>(image_id, image_name));
                 _map_img_sizes.insert(pair<std::string, ImgSize>(image_name, img_size));
                 img_size = {};
             }
@@ -270,7 +270,7 @@ void COCOMetaDataReader::read_all(const std::string &path) {
                     }
                 }
 
-                auto itr = _map_img_names.find(id);
+                auto itr = _map_image_names_to_id.find(id);
                 auto it = _map_img_sizes.find(itr->second);
                 ImgSize image_size = it->second;  // Convert to "ltrb" format
                 if ((_output->get_metadata_type() == MetaDataType::PolygonMask) && iscrowd == 0) {
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index fb20553af..6e3620247 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -687,12 +687,9 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                         fB = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_R));
                                         fG = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_G));
                                         fR = _mm256_cvtepi32_ps(_mm256_shuffle_epi8(pix0, mask_B));
-                                        fB = _mm256_mul_ps(fB, pmul0);
-                                        fG = _mm256_mul_ps(fG, pmul1);
-                                        fR = _mm256_mul_ps(fR, pmul2);
-                                        fB = _mm256_add_ps(fB, padd0);
-                                        fG = _mm256_add_ps(fG, padd1);
-                                        fR = _mm256_add_ps(fR, padd2);
+                                        fB = _mm256_fmadd_ps(fB, pmul0, padd0);
+                                        fG = _mm256_fmadd_ps(fG, pmul1, padd1);
+                                        fR = _mm256_fmadd_ps(fR, pmul2, padd2);
                                         _mm256_storeu_ps(B_buf, fB);
                                         _mm256_storeu_ps(G_buf, fG);
                                         _mm256_storeu_ps(R_buf, fR);
diff --git a/rocAL/source/readers/image/coco_file_source_reader.cpp b/rocAL/source/readers/image/coco_file_source_reader.cpp
index ec3f60501..f1e656f6a 100644
--- a/rocAL/source/readers/image/coco_file_source_reader.cpp
+++ b/rocAL/source/readers/image/coco_file_source_reader.cpp
@@ -86,7 +86,7 @@ Reader::Status COCOFileSourceReader::initialize(ReaderConfig desc) {
         }
     }
 
-    if (_meta_data_reader && _meta_data_reader->aspect_ratio_grouping()) {
+    if (_meta_data_reader && _meta_data_reader->get_aspect_ratio_grouping()) {
         // calculate the aspect ratio for each file and create a pair of <filename, aspect_ratio>
         std::vector<std::pair<std::string, float>> file_aspect_ratio_pair(_file_names.size());
         for (size_t i = 0; i < _file_names.size(); i++) {
@@ -221,7 +221,7 @@ void COCOFileSourceReader::shuffle_with_aspect_ratios() {
 }
 
 void COCOFileSourceReader::reset() {
-    if (_meta_data_reader && _meta_data_reader->aspect_ratio_grouping()) {
+    if (_meta_data_reader && _meta_data_reader->get_aspect_ratio_grouping()) {
         _file_names = _sorted_file_names;
         if (_shuffle) shuffle_with_aspect_ratios();
     } else if (_shuffle) {
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 835098b3d..b00eb1f80 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -485,7 +485,7 @@ PYBIND11_MODULE(rocal_pybind, m) {
         int prev_object_cnt = 0;
         auto mask_count_buf = mask_count.request();
         int *mask_count_ptr = static_cast<int *>(mask_count_buf.ptr);
-        for (int i = 0; i < bbox_labels->size(); i++) {  // nbatchSize
+        for (int i = 0; i < bbox_labels->size(); i++) {  // For each image in a batch, parse through the mask metadata buffers and convert them to polygons format
             float *mask_buffer = static_cast<float *>(mask_data->at(i)->buffer());
             py::list poly_batch_list;
             for (unsigned j = prev_object_cnt; j < bbox_labels->at(i)->dims().at(0) + prev_object_cnt; j++) {

From 5c4a23deac00bf1b472410b438bf32a72fcc01d5 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 26 Oct 2023 18:19:11 +0000
Subject: [PATCH 08/33] Formatting changes

---
 rocAL/rocAL_hip/rocal_hip_kernels.cpp | 40 +++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.cpp b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
index 3475ec13a..f4f1f076d 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.cpp
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
@@ -43,8 +43,8 @@ Hip_CopyInt8ToNHWC_fp32(
     const int C = nchw.y;
     const int maxOutH = out_dims.x;
     const int maxOutW = out_dims.y;
-    const int img_offset = C * W * H;
-    const int out_img_offset = C * maxOutW * maxOutH;
+    const int imgOffset = C * W * H;
+    const int outImgOffset = C * maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
@@ -54,8 +54,8 @@ Hip_CopyInt8ToNHWC_fp32(
         // copy float3  pixels to dst
         if (C == 3) {
             float3 dst;
-            const uchar *inp_img = &inp_image_u8[n * img_offset];
-            float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * out_img_offset);
+            const uchar *inp_img = &inp_image_u8[n * imgOffset];
+            float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * outImgOffset);
             if (reverse_channels)
                 dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
             else
@@ -64,8 +64,8 @@ Hip_CopyInt8ToNHWC_fp32(
             out_tensor[dst_idx + 1] = dst.y;
             out_tensor[dst_idx + 2] = dst.z;
         } else {
-            const uchar *inp_img = &inp_image_u8[n * img_offset + dst_buf_offset];
-            float *out_tensor = (float *)output_tensor + dst_buf_offset + n * out_img_offset;
+            const uchar *inp_img = &inp_image_u8[n * imgOffset + dst_buf_offset];
+            float *out_tensor = (float *)output_tensor + dst_buf_offset + n * outImgOffset;
             out_tensor[dst_idx] = (float)inp_img[src_idx] * multiplier.x + offset.x;
         }
     }
@@ -88,18 +88,18 @@ Hip_CopyInt8ToNHWC_fp16(
     const int C = nchw.y;
     const int maxOutH = out_dims.x;
     const int maxOutW = out_dims.y;
-    const int img_offset = C * W * H;
-    const int out_img_offset = C * maxOutW * maxOutH;
+    const int imgOffset = C * W * H;
+    const int outImgOffset = C * maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * out_img_offset;
+        __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * outImgOffset;
         unsigned int src_idx = (y * W + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
             unsigned int dst_idx = y * maxOutW + x * 3;
-            const uchar *inp_img = &inp_image_u8[n * img_offset];
+            const uchar *inp_img = &inp_image_u8[n * imgOffset];
             float3 dst;
             if (reverse_channels)
                 dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
@@ -110,8 +110,8 @@ Hip_CopyInt8ToNHWC_fp16(
             out_tensor[dst_idx + 2] = __float2half(dst.z);
         } else {
             unsigned int dst_idx = y * maxOutW + x;
-            const uchar *inp_img = &inp_image_u8[n * img_offset];
-            float *out_tensor = (float *)output_tensor + n * out_img_offset;
+            const uchar *inp_img = &inp_image_u8[n * imgOffset];
+            float *out_tensor = (float *)output_tensor + n * outImgOffset;
             out_tensor[dst_idx] = __float2half((float)inp_img[src_idx] * multiplier.x + offset.x);
         }
     }
@@ -134,8 +134,8 @@ Hip_CopyInt8ToNCHW_fp32(
     const int C = nchw.y;
     const int maxOutH = out_dims.x;
     const int maxOutW = out_dims.y;
-    const int img_offset = C * W * H;
-    const int out_img_offset = C * maxOutW * maxOutH;
+    const int imgOffset = C * W * H;
+    const int outImgOffset = C * maxOutW * maxOutH;
     unsigned int cstride = maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
@@ -144,8 +144,8 @@ Hip_CopyInt8ToNCHW_fp32(
         unsigned int src_idx = (y * W + x) * C;
         unsigned int dst_idx = y * maxOutW + x;
         // copy float3  pixels to dst
-        const uchar *inp_img = &inp_image_u8[n * img_offset];
-        float *out_tensor = (float *)output_tensor + n * out_img_offset + dst_buf_offset;
+        const uchar *inp_img = &inp_image_u8[n * imgOffset];
+        float *out_tensor = (float *)output_tensor + n * outImgOffset + dst_buf_offset;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -178,15 +178,15 @@ Hip_CopyInt8ToNCHW_fp16(
     const int C = nchw.y;
     const int maxOutH = out_dims.x;
     const int maxOutW = out_dims.y;
-    const int img_offset = C * W * H;
-    const int out_img_offset = C * maxOutW * maxOutH;
+    const int imgOffset = C * W * H;
+    const int outImgOffset = C * maxOutW * maxOutH;
     unsigned int cstride = maxOutW * maxOutH;
 
     if ((x >= maxOutW) || (y >= maxOutH))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        __half *out_tensor = (__half *)output_tensor + n * out_img_offset + dst_buf_offset;
-        const uchar *inp_img = &inp_image_u8[n * img_offset];
+        __half *out_tensor = (__half *)output_tensor + n * outImgOffset + dst_buf_offset;
+        const uchar *inp_img = &inp_image_u8[n * imgOffset];
         unsigned int src_idx = (y * W + x) * C;
         // copy float3  pixels to dst
         unsigned int dst_idx = y * maxOutW + x;

From bffff3f23422624097767180aca16b024cef58e1 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Tue, 31 Oct 2023 11:26:18 +0000
Subject: [PATCH 09/33] Resolving review comments

---
 rocAL/include/api/rocal_api_data_transfer.h  |   2 +-
 rocAL/include/pipeline/master_graph.h        |   2 +-
 rocAL/rocAL_hip/rocal_hip_kernels.cpp        | 102 +++++++++----------
 rocAL/rocAL_hip/rocal_hip_kernels.h          |   8 +-
 rocAL/source/api/rocal_api_data_transfer.cpp |   4 +-
 rocAL/source/pipeline/master_graph.cpp       |  32 +++---
 rocAL_pybind/amd/rocal/pipeline.py           |   4 +-
 rocAL_pybind/rocal_pybind.cpp                |   4 +-
 8 files changed, 79 insertions(+), 79 deletions(-)

diff --git a/rocAL/include/api/rocal_api_data_transfer.h b/rocAL/include/api/rocal_api_data_transfer.h
index f94819273..621c714aa 100644
--- a/rocAL/include/api/rocal_api_data_transfer.h
+++ b/rocAL/include/api/rocal_api_data_transfer.h
@@ -63,7 +63,7 @@ extern "C" RocalStatus ROCAL_API_CALL rocalToTensor(RocalContext rocal_context,
                                                     RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type,
                                                     float multiplier0, float multiplier1, float multiplier2, float offset0,
                                                     float offset1, float offset2,
-                                                    bool reverse_channels, RocalOutputMemType output_mem_type, int max_height = 0, int max_width = 0);
+                                                    bool reverse_channels, RocalOutputMemType output_mem_type, int max_roi_height = 0, int max_roi_width = 0);
 
 /*!
  * \brief Sets the output images in the RocalContext
diff --git a/rocAL/include/pipeline/master_graph.h b/rocAL/include/pipeline/master_graph.h
index 4ed84bb6e..af2f12b84 100644
--- a/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/include/pipeline/master_graph.h
@@ -82,7 +82,7 @@ class MasterGraph {
     Status reset();
     size_t remaining_count();
     MasterGraph::Status to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1, float multiplier2,
-                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, uint max_height = 0, uint max_width = 0);
+                                  float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, uint max_roi_height = 0, uint max_roi_width = 0);
     Status copy_output(unsigned char *out_ptr, size_t out_size_in_bytes);
     Status copy_out_tensor_planar(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1, float multiplier2,
                                   float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type);
diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.cpp b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
index f4f1f076d..03ac5a88f 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.cpp
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.cpp
@@ -41,21 +41,21 @@ Hip_CopyInt8ToNHWC_fp32(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = out_dims.x;
-    const int maxOutW = out_dims.y;
-    const int imgOffset = C * W * H;
-    const int outImgOffset = C * maxOutW * maxOutH;
+    const int max_roi_height = out_dims.x;
+    const int max_roi_width = out_dims.y;
+    const int img_offset = C * W * H;
+    const int out_img_offset = C * max_roi_width * max_roi_height;
 
-    if ((x >= maxOutW) || (y >= maxOutH))
+    if ((x >= max_roi_width) || (y >= max_roi_height))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
         unsigned int src_idx = (y * W + x) * C;  // src is RGB
-        unsigned int dst_idx = (y * maxOutW + x) * C;
+        unsigned int dst_idx = (y * max_roi_width + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
             float3 dst;
-            const uchar *inp_img = &inp_image_u8[n * imgOffset];
-            float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * outImgOffset);
+            const uchar *inp_img = &inp_image_u8[n * img_offset];
+            float *out_tensor = (float *)((float *)output_tensor + dst_buf_offset + n * out_img_offset);
             if (reverse_channels)
                 dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
             else
@@ -64,8 +64,8 @@ Hip_CopyInt8ToNHWC_fp32(
             out_tensor[dst_idx + 1] = dst.y;
             out_tensor[dst_idx + 2] = dst.z;
         } else {
-            const uchar *inp_img = &inp_image_u8[n * imgOffset + dst_buf_offset];
-            float *out_tensor = (float *)output_tensor + dst_buf_offset + n * outImgOffset;
+            const uchar *inp_img = &inp_image_u8[n * img_offset + dst_buf_offset];
+            float *out_tensor = (float *)output_tensor + dst_buf_offset + n * out_img_offset;
             out_tensor[dst_idx] = (float)inp_img[src_idx] * multiplier.x + offset.x;
         }
     }
@@ -86,20 +86,20 @@ Hip_CopyInt8ToNHWC_fp16(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = out_dims.x;
-    const int maxOutW = out_dims.y;
-    const int imgOffset = C * W * H;
-    const int outImgOffset = C * maxOutW * maxOutH;
+    const int max_roi_height = out_dims.x;
+    const int max_roi_width = out_dims.y;
+    const int img_offset = C * W * H;
+    const int out_img_offset = C * max_roi_width * max_roi_height;
 
-    if ((x >= maxOutW) || (y >= maxOutH))
+    if ((x >= max_roi_width) || (y >= max_roi_height))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * outImgOffset;
+        __half *out_tensor = (__half *)output_tensor + dst_buf_offset + n * out_img_offset;
         unsigned int src_idx = (y * W + x) * C;
         // copy float3  pixels to dst
         if (C == 3) {
-            unsigned int dst_idx = y * maxOutW + x * 3;
-            const uchar *inp_img = &inp_image_u8[n * imgOffset];
+            unsigned int dst_idx = y * max_roi_width + x * 3;
+            const uchar *inp_img = &inp_image_u8[n * img_offset];
             float3 dst;
             if (reverse_channels)
                 dst = make_float3((float)inp_img[src_idx + 2], (float)inp_img[src_idx + 1], (float)inp_img[src_idx]) * multiplier + offset;
@@ -109,9 +109,9 @@ Hip_CopyInt8ToNHWC_fp16(
             out_tensor[dst_idx + 1] = __float2half(dst.y);
             out_tensor[dst_idx + 2] = __float2half(dst.z);
         } else {
-            unsigned int dst_idx = y * maxOutW + x;
-            const uchar *inp_img = &inp_image_u8[n * imgOffset];
-            float *out_tensor = (float *)output_tensor + n * outImgOffset;
+            unsigned int dst_idx = y * max_roi_width + x;
+            const uchar *inp_img = &inp_image_u8[n * img_offset];
+            float *out_tensor = (float *)output_tensor + n * out_img_offset;
             out_tensor[dst_idx] = __float2half((float)inp_img[src_idx] * multiplier.x + offset.x);
         }
     }
@@ -132,20 +132,20 @@ Hip_CopyInt8ToNCHW_fp32(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = out_dims.x;
-    const int maxOutW = out_dims.y;
-    const int imgOffset = C * W * H;
-    const int outImgOffset = C * maxOutW * maxOutH;
-    unsigned int cstride = maxOutW * maxOutH;
+    const int max_roi_height = out_dims.x;
+    const int max_roi_width = out_dims.y;
+    const int img_offset = C * W * H;
+    const int out_img_offset = C * max_roi_width * max_roi_height;
+    unsigned int cstride = max_roi_width * max_roi_height;
 
-    if ((x >= maxOutW) || (y >= maxOutH))
+    if ((x >= max_roi_width) || (y >= max_roi_height))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
         unsigned int src_idx = (y * W + x) * C;
-        unsigned int dst_idx = y * maxOutW + x;
+        unsigned int dst_idx = y * max_roi_width + x;
         // copy float3  pixels to dst
-        const uchar *inp_img = &inp_image_u8[n * imgOffset];
-        float *out_tensor = (float *)output_tensor + n * outImgOffset + dst_buf_offset;
+        const uchar *inp_img = &inp_image_u8[n * img_offset];
+        float *out_tensor = (float *)output_tensor + n * out_img_offset + dst_buf_offset;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -176,20 +176,20 @@ Hip_CopyInt8ToNCHW_fp16(
     const int W = nchw.w;
     const int H = nchw.z;
     const int C = nchw.y;
-    const int maxOutH = out_dims.x;
-    const int maxOutW = out_dims.y;
-    const int imgOffset = C * W * H;
-    const int outImgOffset = C * maxOutW * maxOutH;
-    unsigned int cstride = maxOutW * maxOutH;
+    const int max_roi_height = out_dims.x;
+    const int max_roi_width = out_dims.y;
+    const int img_offset = C * W * H;
+    const int out_img_offset = C * max_roi_width * max_roi_height;
+    unsigned int cstride = max_roi_width * max_roi_height;
 
-    if ((x >= maxOutW) || (y >= maxOutH))
+    if ((x >= max_roi_width) || (y >= max_roi_height))
         return;
     for (unsigned int n = 0; n < nchw.x; n++) {
-        __half *out_tensor = (__half *)output_tensor + n * outImgOffset + dst_buf_offset;
-        const uchar *inp_img = &inp_image_u8[n * imgOffset];
+        __half *out_tensor = (__half *)output_tensor + n * out_img_offset + dst_buf_offset;
+        const uchar *inp_img = &inp_image_u8[n * img_offset];
         unsigned int src_idx = (y * W + x) * C;
         // copy float3  pixels to dst
-        unsigned int dst_idx = y * maxOutW + x;
+        unsigned int dst_idx = y * max_roi_width + x;
         if (C == 3) {
             float3 dst;
             if (reverse_channels)
@@ -224,25 +224,25 @@ int HipExecCopyInt8ToNHWC(
     unsigned int fp16,
     const unsigned max_output_height,
     const unsigned max_output_width) {
-    int localThreads_x = 16, localThreads_y = 16;
+    int local_threads_x = 16, local_threads_y = 16;
     uint2 out_dims;
     if ((max_output_height == 0) || (max_output_width == 0))
         out_dims = make_uint2(h, w);
     else
         out_dims = make_uint2(max_output_height, max_output_width);
-    int globalThreads_x = w, globalThreads_y = h;
+    int global_threads_x = w, global_threads_y = h;
     if (!fp16) {
         hipLaunchKernelGGL(Hip_CopyInt8ToNHWC_fp32,
-                           dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
-                           dim3(localThreads_x, localThreads_y),
+                           dim3(ceil((float)global_threads_x / local_threads_x), ceil((float)global_threads_y / local_threads_y)),
+                           dim3(local_threads_x, local_threads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
                            make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     } else {
         hipLaunchKernelGGL(Hip_CopyInt8ToNHWC_fp16,
-                           dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
-                           dim3(localThreads_x, localThreads_y),
+                           dim3(ceil((float)global_threads_x / local_threads_x), ceil((float)global_threads_y / local_threads_y)),
+                           dim3(local_threads_x, local_threads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
                            make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
@@ -270,25 +270,25 @@ int HipExecCopyInt8ToNCHW(
     unsigned int fp16,
     const unsigned max_output_height,
     const unsigned max_output_width) {
-    int localThreads_x = 16, localThreads_y = 16;
+    int local_threads_x = 16, local_threads_y = 16;
     uint2 out_dims;
     if ((max_output_height == 0) || (max_output_width == 0))
         out_dims = make_uint2(h, w);
     else
         out_dims = make_uint2(max_output_height, max_output_width);
-    int globalThreads_x = w, globalThreads_y = h;
+    int global_threads_x = w, global_threads_y = h;
     if (!fp16) {
         hipLaunchKernelGGL(Hip_CopyInt8ToNCHW_fp32,
-                           dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
-                           dim3(localThreads_x, localThreads_y),
+                           dim3(ceil((float)global_threads_x / local_threads_x), ceil((float)global_threads_y / local_threads_y)),
+                           dim3(local_threads_x, local_threads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
                            make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
                            reverse_channels);
     } else {
         hipLaunchKernelGGL(Hip_CopyInt8ToNCHW_fp16,
-                           dim3(ceil((float)globalThreads_x / localThreads_x), ceil((float)globalThreads_y / localThreads_y)),
-                           dim3(localThreads_x, localThreads_y),
+                           dim3(ceil((float)global_threads_x / local_threads_x), ceil((float)global_threads_y / local_threads_y)),
+                           dim3(local_threads_x, local_threads_y),
                            0, stream, (const uchar *)inp_image_u8, output_tensor, dst_buf_offset,
                            make_uint4(n, c, h, w), out_dims,
                            make_float3(multiplier0, multiplier1, multiplier2), make_float3(offset0, offset1, offset2),
diff --git a/rocAL/rocAL_hip/rocal_hip_kernels.h b/rocAL/rocAL_hip/rocal_hip_kernels.h
index 0db801f59..a089c904e 100644
--- a/rocAL/rocAL_hip/rocal_hip_kernels.h
+++ b/rocAL/rocAL_hip/rocal_hip_kernels.h
@@ -39,8 +39,8 @@ int HipExecCopyInt8ToNHWC(
     float offset2,
     unsigned int reverse_channels,
     unsigned int fp16,
-    const unsigned max_output_height = 0,
-    const unsigned max_output_width = 0);
+    const unsigned max_roi_height = 0,
+    const unsigned max_roi_width = 0);
 
 int HipExecCopyInt8ToNCHW(
     hipStream_t stream,
@@ -59,5 +59,5 @@ int HipExecCopyInt8ToNCHW(
     float offset2,
     unsigned int reverse_channels,
     unsigned int fp16,
-    const unsigned max_output_height = 0,
-    const unsigned max_output_width = 0);
+    const unsigned max_roi_height = 0,
+    const unsigned max_roi_width = 0);
diff --git a/rocAL/source/api/rocal_api_data_transfer.cpp b/rocAL/source/api/rocal_api_data_transfer.cpp
index a3e3088cf..4202de1a3 100644
--- a/rocAL/source/api/rocal_api_data_transfer.cpp
+++ b/rocAL/source/api/rocal_api_data_transfer.cpp
@@ -30,7 +30,7 @@ THE SOFTWARE.
 RocalStatus ROCAL_API_CALL
 rocalToTensor(RocalContext p_context, void* out_ptr, RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type, float multiplier0,
               float multiplier1, float multiplier2, float offset0, float offset1, float offset2,
-              bool reverse_channels, RocalOutputMemType output_mem_type, int max_height, int max_width) {
+              bool reverse_channels, RocalOutputMemType output_mem_type, int max_roi_height, int max_roi_width) {
     auto context = static_cast<Context*>(p_context);
     try {
         if (tensor_format != ROCAL_NHWC && tensor_format != ROCAL_NCHW)
@@ -42,7 +42,7 @@ rocalToTensor(RocalContext p_context, void* out_ptr, RocalTensorLayout tensor_fo
         auto tensor_layout = (tensor_format == ROCAL_NHWC) ? RocalTensorlayout::NHWC : RocalTensorlayout::NCHW;
         auto tensor_output_data_type = (tensor_output_type == ROCAL_FP32) ? RocalTensorDataType::FP32 : RocalTensorDataType::FP16;
         context->master_graph->to_tensor(out_ptr, tensor_layout, multiplier0, multiplier1, multiplier2,
-                                         offset0, offset1, offset2, reverse_channels, tensor_output_data_type, output_mem_type, max_height, max_width);
+                                         offset0, offset1, offset2, reverse_channels, tensor_output_data_type, output_mem_type, max_roi_height, max_roi_width);
     } catch (const std::exception& e) {
         context->capture_error(e.what());
         ERR(e.what())
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index 6e3620247..b63f568d6 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -452,7 +452,7 @@ MasterGraph::timing() {
 
 MasterGraph::Status
 MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier0, float multiplier1,
-                       float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, uint max_height, uint max_width) {
+                       float multiplier2, float offset0, float offset1, float offset2, bool reverse_channels, RocalTensorDataType output_data_type, RocalOutputMemType output_mem_type, uint max_roi_height, uint max_roi_width) {
     if (no_more_processed_data())
         return MasterGraph::Status::NO_MORE_DATA;
 
@@ -474,9 +474,9 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
     const size_t h = dims[1];
     const size_t w = dims[2];
     const size_t single_output_tensor_size = output_tensor_info.data_size();
-    if ((max_height == 0) || (max_width == 0)) {
-        max_height = h;
-        max_width = w;
+    if ((max_roi_height == 0) || (max_roi_width == 0)) {
+        max_roi_height = h;
+        max_roi_width = w;
     }
 
 #if ENABLE_OPENCL
@@ -560,11 +560,11 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
             auto img_buffer = out_tensor;
             if (format == RocalTensorlayout::NHWC) {
                 HipExecCopyInt8ToNHWC(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
+                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_roi_height, max_roi_width);
 
             } else {
                 HipExecCopyInt8ToNCHW(_device.resources()->hip_stream, (const void *)img_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
+                                      multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_roi_height, max_roi_width);
             }
             dest_buf_offset += single_output_tensor_size;
         }
@@ -597,11 +597,11 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
 
                 if (format == RocalTensorlayout::NHWC) {
                     HipExecCopyInt8ToNHWC(_device.resources()->hip_stream, (const void *)_output_tensor_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
+                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_roi_height, max_roi_width);
 
                 } else {
                     HipExecCopyInt8ToNCHW(_device.resources()->hip_stream, (const void *)_output_tensor_buffer, out_ptr, dest_buf_offset, n, c, h, w,
-                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_height, max_width);
+                                          multiplier0, multiplier1, multiplier2, offset0, offset1, offset2, reverse_channels, fp16, max_roi_height, max_roi_width);
                 }
                 dest_buf_offset += single_output_tensor_size;
             }
@@ -618,8 +618,8 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
             auto num_threads = _cpu_num_threads * 2;
             for (auto &&out_tensor : output_buffers) {
                 unsigned int single_tensor_size = w * c * h;
-                unsigned int channel_size = max_width * max_height;
-                unsigned int output_single_tensor_size = max_height * max_width * c;
+                unsigned int channel_size = max_roi_width * max_roi_height;
+                unsigned int output_single_tensor_size = max_roi_height * max_roi_width * c;
                 unsigned int input_width_stride = w * c;
 #pragma omp parallel for num_threads(num_threads)
                 for (unsigned int batch_count = 0; batch_count < n; batch_count++) {
@@ -675,10 +675,10 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 __m256 padd0 = _mm256_set1_ps(offset0);
                                 __m256 padd1 = _mm256_set1_ps(offset1);
                                 __m256 padd2 = _mm256_set1_ps(offset2);
-                                uint alignedLength = (max_width & ~7);  // multiple of 8
+                                uint alignedLength = (max_roi_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
-                                for (uint row = 0; row < max_height; row++) {
+                                for (uint row = 0; row < max_roi_height; row++) {
                                     unsigned char *in_buffer_row = reinterpret_cast<unsigned char *>(in_buffer) + (row * input_width_stride);
                                     uint col = 0;
                                     for (; col < alignedLength; col += 8) {
@@ -698,7 +698,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                         R_buf += 8;
                                         in_buffer_row += 24;
                                     }
-                                    for (; col < max_width; col++, in_buffer_row += 3) {
+                                    for (; col < max_roi_width; col++, in_buffer_row += 3) {
                                         *B_buf++ = (in_buffer_row[0] * multiplier0) + offset0;
                                         *G_buf++ = (in_buffer_row[1] * multiplier1) + offset1;
                                         *R_buf++ = (in_buffer_row[2] * multiplier2) + offset1;
@@ -739,11 +739,11 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                 __m256 padd0 = _mm256_set1_ps(offset0);
                                 __m256 padd1 = _mm256_set1_ps(offset1);
                                 __m256 padd2 = _mm256_set1_ps(offset2);
-                                uint alignedLength = (max_width & ~7);  // multiple of 8
+                                uint alignedLength = (max_roi_width & ~7);  // multiple of 8
 
                                 __m256 fR, fG, fB;
                                 __m128i tempR, tempG, tempB;
-                                for (uint row = 0; row < max_height; row++) {
+                                for (uint row = 0; row < max_roi_height; row++) {
                                     unsigned char *in_buffer_row = reinterpret_cast<unsigned char *>(in_buffer) + (row * input_width_stride);
                                     uint col = 0;
                                     for (; col < alignedLength; col += 8) {
@@ -766,7 +766,7 @@ MasterGraph::to_tensor(void *out_ptr, RocalTensorlayout format, float multiplier
                                         R_buf_16 += 8;
                                         in_buffer_row += 24;
                                     }
-                                    for (; col < max_width; col++, in_buffer_row += 3) {
+                                    for (; col < max_roi_width; col++, in_buffer_row += 3) {
                                         *B_buf_16++ = (half)(in_buffer_row[0] * multiplier0) + offset0;
                                         *G_buf_16++ = (half)(in_buffer_row[1] * multiplier1) + offset1;
                                         *R_buf_16++ = (half)(in_buffer_row[2] * multiplier2) + offset2;
diff --git a/rocAL_pybind/amd/rocal/pipeline.py b/rocAL_pybind/amd/rocal/pipeline.py
index b0ee8f440..fc454f97d 100644
--- a/rocAL_pybind/amd/rocal/pipeline.py
+++ b/rocAL_pybind/amd/rocal/pipeline.py
@@ -146,9 +146,9 @@ def define_graph(self):
     def get_handle(self):
         return self._handle
 
-    def copyToExternalTensor(self, array,  multiplier, offset, reverse_channels, tensor_format, tensor_dtype, max_height=0, max_width=0):
+    def copyToExternalTensor(self, array,  multiplier, offset, reverse_channels, tensor_format, tensor_dtype, max_roi_height=0, max_roi_width=0):
         b.rocalToTensor(self._handle, ctypes.c_void_p(array.data_ptr()), tensor_format, tensor_dtype,
-                        multiplier[0], multiplier[1], multiplier[2], offset[0], offset[1], offset[2], (1 if reverse_channels else 0), self._output_memory_type, max_height, max_width)
+                        multiplier[0], multiplier[1], multiplier[2], offset[0], offset[1], offset[2], (1 if reverse_channels else 0), self._output_memory_type, max_roi_height, max_roi_width)
 
     def get_one_hot_encoded_labels(self, array, device):
         if device == "cpu":
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index b00eb1f80..2c446f274 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -87,12 +87,12 @@ py::object wrapper_image_name(RocalContext context, int array_len) {
 py::object wrapper_copy_to_tensor(RocalContext context, py::object p,
                                   RocalTensorLayout tensor_format, RocalTensorOutputType tensor_output_type, float multiplier0,
                                   float multiplier1, float multiplier2, float offset0, float offset1, float offset2,
-                                  bool reverse_channels, RocalOutputMemType output_mem_type, uint max_height, uint max_width) {
+                                  bool reverse_channels, RocalOutputMemType output_mem_type, uint max_roi_height, uint max_roi_width) {
     auto ptr = ctypes_void_ptr(p);
     // call pure C++ function
     int status = rocalToTensor(context, ptr, tensor_format, tensor_output_type, multiplier0,
                                multiplier1, multiplier2, offset0, offset1, offset2,
-                               reverse_channels, output_mem_type, max_height, max_width);
+                               reverse_channels, output_mem_type, max_roi_height, max_roi_width);
     return py::cast<py::none>(Py_None);
 }
 

From 166d9ab1343eb1012efbc009f0a453f935b1189d Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Tue, 31 Oct 2023 17:03:23 +0000
Subject: [PATCH 10/33] Adding min_max scaling mode comment

---
 rocAL/source/api/rocal_api_augmentation.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index 0988b58ed..33fb5b57a 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -554,10 +554,10 @@ RocalTensor ROCAL_API_CALL
     try {
         if ((dest_width | dest_height | resize_longer | resize_shorter) == 0)
             THROW("Atleast one size 'dest_width' or 'dest_height' or 'resize_shorter' or 'resize_longer' must be specified")
-        // Specifying dest width and height along with Resize_shorter and resize_longer can be used together in case of MIN_MAX_SCALING_MODE - for other scaling modes, this throws an error
+        // MaskRCNN training uses a new resize scaling mode - MIN_MAX_SCALING_MODE, where min_size and max_size are passed and the final output size is calculated from the image size
+        // Only in the case of MIN_MAX_SCALING_MODE, both resize_shorter and resize_longer values can be passed together
         if ((dest_width | dest_height) && (resize_longer | resize_shorter) && (scaling_mode != RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX))
             THROW("Only one method of specifying size can be used \ndest_width and/or dest_height\nresize_shorter\nresize_longer")
-        // Resize_shorter and resize_longer can be used together in case of MIN_MAX_SCALING_MODE - for other scaling modes, this throws an error
         if (resize_longer && resize_shorter && scaling_mode != RocalResizeScalingMode::ROCAL_SCALING_MODE_MIN_MAX)
             THROW("'resize_longer' and 'resize_shorter' can only be passed together for min max scaling mode")
 

From e49fece590a27dfc54a4f0e488f0e3d0d5dd5494 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Wed, 1 Nov 2023 06:39:02 +0000
Subject: [PATCH 11/33] Removing unused vector in coco reader

---
 rocAL/include/readers/image/coco_file_source_reader.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rocAL/include/readers/image/coco_file_source_reader.h b/rocAL/include/readers/image/coco_file_source_reader.h
index ffa35caea..fd14c5061 100644
--- a/rocAL/include/readers/image/coco_file_source_reader.h
+++ b/rocAL/include/readers/image/coco_file_source_reader.h
@@ -78,7 +78,6 @@ class COCOFileSourceReader : public Reader {
     struct dirent *_entity;
     std::vector<std::string> _file_names, _sorted_file_names;
     std::vector<float> _aspect_ratios;
-    std::vector<std::string> _files;
     unsigned _curr_file_idx;
     FILE *_current_fPtr;
     std::ifstream _current_ifs;

From e6e24bd85ec5d1f6d0990bf2b63b53f939416b84 Mon Sep 17 00:00:00 2001
From: fgladwin <fgladwin@amd.com>
Date: Thu, 2 Nov 2023 14:03:37 -0400
Subject: [PATCH 12/33] Improve code readability

Add appropriate comments
---
 rocAL/include/api/rocal_api_meta_data.h       |  6 +++---
 rocAL/include/pipeline/master_graph.h         | 14 +++++++-------
 rocAL/source/meta_data/bounding_box_graph.cpp |  2 +-
 rocAL/source/pipeline/master_graph.cpp        |  4 ++--
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/rocAL/include/api/rocal_api_meta_data.h b/rocAL/include/api/rocal_api_meta_data.h
index d339944c7..5d043323e 100644
--- a/rocAL/include/api/rocal_api_meta_data.h
+++ b/rocAL/include/api/rocal_api_meta_data.h
@@ -309,11 +309,11 @@ extern "C" void ROCAL_API_CALL rocalGetJointsDataPtr(RocalContext p_context, Roc
 extern "C" void ROCAL_API_CALL rocalBoxIouMatcher(RocalContext p_context, std::vector<float>& anchors, float criteria,
                                                   float high_threshold, float low_threshold, bool allow_low_quality_matches = true);
 
-/*! \brief API to return the matched idices for the bounding box and anchors
+/*! \brief API to return the matched indices for the bounding box and anchors
  * \ingroup group_rocal_meta_data
- * \param [in] rocal_context rocAL context
+ * \param [in] p_context rocAL context
  * \return RocalTensorList of matched indices
  */
-extern "C" RocalTensorList ROCAL_API_CALL rocalGetMatchedIndices(RocalContext rocal_context);
+extern "C" RocalTensorList ROCAL_API_CALL rocalGetMatchedIndices(RocalContext p_context);
 
 #endif  // MIVISIONX_ROCAL_API_META_DATA_H
diff --git a/rocAL/include/pipeline/master_graph.h b/rocAL/include/pipeline/master_graph.h
index 5b03e0018..5744609a8 100644
--- a/rocAL/include/pipeline/master_graph.h
+++ b/rocAL/include/pipeline/master_graph.h
@@ -46,11 +46,11 @@ THE SOFTWARE.
 #include "randombboxcrop_meta_data_reader.h"
 #include "rocal_api_types.h"
 #define MAX_STRING_LENGTH 100
-#define MAX_OBJECTS 50        // Setting an arbitrary value 50.(Max number of objects/image in COCO dataset is 93)
+#define MAX_OBJECTS 50                // Setting an arbitrary value 50.(Max number of objects/image in COCO dataset is 93)
 #define BBOX_COUNT 4
-#define MAX_NUM_ANCHORS 8732  // Num of bbox achors used in SSD training
+#define MAX_SSD_ANCHORS 8732          // Num of bbox anchors used in SSD training
 #define MAX_MASK_BUFFER 10000
-#define MAX_ANCHORS 120087  // Num of bbox achors used in Retinanet training
+#define MAX_RETINANET_ANCHORS 120087  // Num of bbox anchors used in Retinanet training
 
 #if ENABLE_SIMD
 #if _WIN32
@@ -209,10 +209,10 @@ class MasterGraph {
     std::vector<float> _means, _stds;                                             //_means:  [x y w h] mean values for normalization _stds: [x y w h] standard deviations for offset normalization.
     bool _augmentation_metanode = false;
     // box IoU matcher variables
-    bool _is_box_iou_matcher = false; // bool variable to set the box iou matcher
-    float _high_threshold = 0.5f;    // Max IoU threshold
-    float _low_threshold = 0.4f;     // Min IoU threshold
-    bool _allow_low_quality_matches = true; // Set to true to include low quality matches in matched idx generation
+    bool _is_box_iou_matcher = false;                                             // bool variable to set the box iou matcher
+    float _high_threshold = 0.5f;                                                 // Max IoU threshold
+    float _low_threshold = 0.4f;                                                  // Min IoU threshold
+    bool _allow_low_quality_matches = true;                                       // Set to true to include low quality matches in matched idx generation
 #if ENABLE_HIP
     BoxEncoderGpu *_box_encoder_gpu = nullptr;
 #endif
diff --git a/rocAL/source/meta_data/bounding_box_graph.cpp b/rocAL/source/meta_data/bounding_box_graph.cpp
index 9e7d19d72..bc15ab0ac 100644
--- a/rocAL/source/meta_data/bounding_box_graph.cpp
+++ b/rocAL/source/meta_data/bounding_box_graph.cpp
@@ -262,7 +262,7 @@ void BoundingBoxGraph::update_box_iou_matcher(std::vector<float> *anchors, int *
 
             if (allow_low_quality_matches) {
                 for (unsigned int anchor_idx = 0; anchor_idx < anchors_size; anchor_idx++) {  // if the element is found
-                    if (fabs(bbox_iou[anchor_idx] - best_bbox_iou) < 1e-6)
+                    if (fabs(bbox_iou[anchor_idx] - best_bbox_iou) < 1e-6)                    // Compare the IOU values and check if they are equal with a tolerance of 1e-6
                         low_quality_preds[anchor_idx] = anchor_idx;
                 }
             }
diff --git a/rocAL/source/pipeline/master_graph.cpp b/rocAL/source/pipeline/master_graph.cpp
index 3dd7ab042..d982d188f 100644
--- a/rocAL/source/pipeline/master_graph.cpp
+++ b/rocAL/source/pipeline/master_graph.cpp
@@ -1022,7 +1022,7 @@ std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const c
     _meta_data_reader->read_all(source_path);
     if (!ltrb_bbox) _augmented_meta_data->set_xywh_bbox();
     std::vector<size_t> dims;
-    size_t max_objects = static_cast<size_t>(is_box_encoder ? MAX_NUM_ANCHORS : MAX_OBJECTS);
+    size_t max_objects = static_cast<size_t>(is_box_encoder ? MAX_SSD_ANCHORS : MAX_OBJECTS);
     dims = {max_objects};
     auto default_labels_info = TensorInfo(std::move(dims), _mem_type, RocalTensorDataType::INT32);  // Create default labels Info
     default_labels_info.set_metadata();
@@ -1043,7 +1043,7 @@ std::vector<rocalTensorList *> MasterGraph::create_coco_meta_data_reader(const c
     }
     if (is_box_iou_matcher) {
         _is_box_iou_matcher = true;
-        dims = {MAX_ANCHORS};
+        dims = {MAX_RETINANET_ANCHORS};
         default_matches_info = TensorInfo(std::move(dims), _mem_type, RocalTensorDataType::INT32);  // Create default matches info
         default_matches_info.set_metadata();
         _meta_data_buffer_size.emplace_back(_user_batch_size * default_matches_info.data_size());

From e7e719d5d7fa221cde33ce8d35951ffbeb90dbd9 Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Thu, 9 Nov 2023 10:42:31 +0000
Subject: [PATCH 13/33] Add the test cases for numpy reader

---
 .../rocAL_unittests/rocAL_unittests.cpp       | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
index e48fe1d78..066aa4199 100644
--- a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
+++ b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
@@ -319,6 +319,12 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
             rocalCreateMXNetReader(handle, path, true);
             decoded_output = rocalMXNetRecordSource(handle, path, color_format, num_threads, false, false, false, ROCAL_USE_USER_GIVEN_SIZE_RESTRICTED, decode_max_width, decode_max_height);
         } break;
+        case 12:  // Numpy reader
+        {
+            std::cout << ">>>>>>> Running Numpy reader" << std::endl;
+            pipeline_type = 4;
+            decoded_output = rocalNumpyFileSource(handle, path, num_threads, false, false, false, ROCAL_USE_MAX_SIZE);
+        } break;
         default: {
             std::cout << ">>>>>>> Running IMAGE READER" << std::endl;
             pipeline_type = 1;
@@ -766,6 +772,53 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
                     }
                 }
             } break;
+            case 4: {  // numpy reader pipeline
+                RocalTensorList output_tensor_list;
+                output_tensor_list = rocalGetOutputTensors(handle);
+                for (int idx = 0; idx < output_tensor_list->size(); idx++) {
+                    unsigned char *out_buffer;
+                    if (output_tensor_list->at(idx)->data_type() == RocalTensorOutputType::ROCAL_FP32) {
+                        float *out_f_buffer;
+                        std::cout << "Creating float buffer of ";
+                        for (auto x : output_tensor_list->at(idx)->shape())
+                            std::cout << x << " x ";
+                        std::cout << "shape\n";
+                        if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_GPU) {
+                            out_f_buffer = (float *)malloc(output_tensor_list->at(idx)->data_size());
+                            output_tensor_list->at(idx)->copy_data(out_f_buffer);
+                        } else if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_CPU)
+                            out_f_buffer = (float *)output_tensor_list->at(idx)->buffer();
+
+                        out_buffer = (unsigned char *)malloc(output_tensor_list->at(idx)->data_size() / 4);
+                        // convert_float_to_uchar_buffer(out_f_buffer, out_buffer, output_tensor_list->at(idx)->data_size() / 4);
+                    } else if (output_tensor_list->at(idx)->data_type() == RocalTensorOutputType::ROCAL_FP16) {
+                        half *out_f16_buffer;
+                        std::cout << "Creating float16 buffer of ";
+                        for (auto x : output_tensor_list->at(idx)->shape())
+                            std::cout << x << " x ";
+                        std::cout << "shape\n";
+                        if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_GPU) {
+                            out_f16_buffer = (half *)malloc(output_tensor_list->at(idx)->data_size());
+                            output_tensor_list->at(idx)->copy_data(out_f16_buffer);
+                        } else if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_CPU)
+                            out_f16_buffer = (half *)output_tensor_list->at(idx)->buffer();
+
+                        out_buffer = (unsigned char *)malloc(output_tensor_list->at(idx)->data_size() / 2);
+                        // convert_float_to_uchar_buffer(out_f16_buffer, out_buffer, output_tensor_list->at(idx)->data_size() / 2);
+                    } else {
+                        std::cout << "Creating uchar buffer of ";
+                        for (auto x : output_tensor_list->at(idx)->shape())
+                            std::cout << x << " x ";
+                        std::cout << "shape\n";
+                        if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_GPU) {
+                            out_buffer = (unsigned char *)malloc(output_tensor_list->at(idx)->data_size());
+                            output_tensor_list->at(idx)->copy_data(out_buffer);
+                        } else if (output_tensor_list->at(idx)->backend() == RocalTensorBackend::ROCAL_CPU)
+                            out_buffer = (unsigned char *)(output_tensor_list->at(idx)->buffer());
+                    }
+                }
+                std::cout << "Copied numpy data to buffers\n";
+            } break;
             default: {
                 std::cout << "Not a valid pipeline type ! Exiting!\n";
                 return -1;

From 935c769b3d6e94e0b6de0f8d746450683d0d42d7 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Mon, 27 Nov 2023 13:31:38 +0000
Subject: [PATCH 14/33] Fixing bug with numpy reader shuffle

---
 rocAL/source/readers/image/numpy_data_reader.cpp | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index 1f1cc7b6b..7f57beaee 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -25,6 +25,7 @@ THE SOFTWARE.
 #include <commons.h>
 
 #include <algorithm>
+#include <numeric>
 #include <boost/filesystem.hpp>
 #include <cassert>
 
@@ -375,7 +376,20 @@ int NumpyDataReader::release() {
 
 void NumpyDataReader::reset() {
     _shuffle_time.start();
-    if (_shuffle) std::random_shuffle(_file_names.begin(), _file_names.end());
+    if (_shuffle) {
+        std::vector<std::string> shuffled_filenames;
+        std::vector<NumpyHeaderData> shuffled_headers;
+        std::vector<int> indexes(_file_names.size());
+        std::iota(indexes.begin(), indexes.end(), 0);
+        // Shuffle the index vector and use the index to fetch batch size elements for decoding
+        std::random_shuffle(indexes.begin(), indexes.end());
+        for (auto const idx : indexes) {
+            shuffled_filenames.push_back(_file_names[idx]);
+            shuffled_headers.push_back(_file_headers[idx]);
+        }
+        _file_names = shuffled_filenames;
+        _file_headers = shuffled_headers;
+    }
     _shuffle_time.end();
     _read_counter = 0;
     _curr_file_idx = 0;

From bcb050fc7f525bc2c5911371526b3ac246e4fdcf Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Mon, 27 Nov 2023 13:42:20 +0000
Subject: [PATCH 15/33] Resizing file headers after last batch padding

---
 rocAL/source/readers/image/numpy_data_reader.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index 7f57beaee..a8917d997 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -71,6 +71,7 @@ Reader::Status NumpyDataReader::initialize(ReaderConfig desc) {
             replicate_last_batch_to_pad_partial_shard();
         }
     }
+    _file_headers.resize(_file_names.size());
     // shuffle dataset if set
     _shuffle_time.start();
     if (ret == Reader::Status::OK && _shuffle)
@@ -434,7 +435,6 @@ Reader::Status NumpyDataReader::subfolder_reading() {
         replicate_last_image_to_fill_last_shard();
         LOG("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] Replicated " + _folder_path + _last_file_name + " " + TOSTR((_batch_count - _in_batch_read_count)) + " times to fill the last batch")
     }
-    _file_headers.resize(_file_names.size());
     if (!_file_names.empty())
         LOG("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] Total of " + TOSTR(_file_names.size()) + " images loaded from " + _full_path)
     return ret;

From 15d46469607eae23bfe458b809b017baf40b9321 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Fri, 8 Dec 2023 17:54:35 +0000
Subject: [PATCH 16/33] Adding changes for - normalize and transpose kernel
 support to rocAL - generic ROI changes

---
 rocAL/include/api/rocal_api_augmentation.h    | 38 ++++++++
 .../augmentations/augmentations_nodes.h       |  2 +
 .../effects_augmentations/node_normalize.h    | 46 ++++++++++
 .../geometry_augmentations/node_transpose.h   | 40 +++++++++
 rocAL/include/pipeline/tensor.h               | 13 ++-
 rocAL/source/api/rocal_api_augmentation.cpp   | 60 +++++++++++++
 .../effects_augmentations/node_normalize.cpp  | 90 +++++++++++++++++++
 .../geometry_augmentations/node_transpose.cpp | 51 +++++++++++
 rocAL/source/pipeline/tensor.cpp              | 18 ++--
 9 files changed, 344 insertions(+), 14 deletions(-)
 create mode 100644 rocAL/include/augmentations/effects_augmentations/node_normalize.h
 create mode 100644 rocAL/include/augmentations/geometry_augmentations/node_transpose.h
 create mode 100644 rocAL/source/augmentations/effects_augmentations/node_normalize.cpp
 create mode 100644 rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp

diff --git a/rocAL/include/api/rocal_api_augmentation.h b/rocAL/include/api/rocal_api_augmentation.h
index d236073fa..b953211ec 100644
--- a/rocAL/include/api/rocal_api_augmentation.h
+++ b/rocAL/include/api/rocal_api_augmentation.h
@@ -329,6 +329,20 @@ extern "C" RocalTensor ROCAL_API_CALL rocalFlipFixed(RocalContext context, Rocal
                                                      RocalTensorLayout output_layout = ROCAL_NONE,
                                                      RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Transposes the tensors by reordering the dimensions based on the perm parameter.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] perm Permutation of the dimensions of the input
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalTranspose(RocalContext context, RocalTensor input, std::vector<unsigned> perm, bool is_output,
+                                                RocalTensorLayout output_layout = ROCAL_NONE,
+                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
 /*! \brief Applies blur effect to images.
  * \ingroup group_rocal_augmentations
  * \param [in] context Rocal context
@@ -997,6 +1011,30 @@ extern "C" RocalTensor ROCAL_API_CALL rocalCropMirrorNormalize(RocalContext cont
                                                                RocalTensorLayout output_layout = ROCAL_NONE,
                                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Performs normalization on images.
+ * \details Normalizes the input tensor along the axes listed in \p axes.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] axes axes list for tensor normalization
+ * \param [in] mean mean value (specified for each channel) for tensor normalization
+ * \param [in] std_dev standard deviation value (specified for each channel) for tensor normalization
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] scale scale value for tensor normalization (a single float, defaults to 1.0)
+ * \param [in] shift shift value for tensor normalization (a single float, defaults to 0.0)
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalNormalize(RocalContext context, RocalTensor input,
+                                                               std::vector<unsigned> &axes,
+                                                               std::vector<float> &mean,
+                                                               std::vector<float> &std_dev,
+                                                               bool is_output,
+                                                               float scale = 1.0, float shift = 0.0,
+                                                               RocalTensorLayout output_layout = ROCAL_NONE,
+                                                               RocalTensorOutputType output_datatype = ROCAL_UINT8);                                                               
+
 /*! \brief Crops images.
  * \ingroup group_rocal_augmentations
  * \param [in] context Rocal context
diff --git a/rocAL/include/augmentations/augmentations_nodes.h b/rocAL/include/augmentations/augmentations_nodes.h
index ef6beff32..e9344b4d4 100644
--- a/rocAL/include/augmentations/augmentations_nodes.h
+++ b/rocAL/include/augmentations/augmentations_nodes.h
@@ -57,3 +57,5 @@ THE SOFTWARE.
 #include "node_sequence_rearrange.h"
 #include "node_gaussian_noise.h"
 #include "node_slice.h"
+#include "node_transpose.h"
+#include "node_normalize.h"
diff --git a/rocAL/include/augmentations/effects_augmentations/node_normalize.h b/rocAL/include/augmentations/effects_augmentations/node_normalize.h
new file mode 100644
index 000000000..6ad49d08f
--- /dev/null
+++ b/rocAL/include/augmentations/effects_augmentations/node_normalize.h
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "node.h"
+#include "parameter_vx.h"
+
+class NormalizeNode : public Node {
+   public:
+    NormalizeNode(const std::vector<Tensor *> &inputs,
+                            const std::vector<Tensor *> &outputs);
+    NormalizeNode() = delete;
+    void init(std::vector<unsigned> &axes, std::vector<float> &mean, std::vector<float> &std_dev, float scale, float shift);
+
+   protected:
+    void create_node() override;
+    void update_node() override {};
+
+   private:
+    int _axis_mask = 0;
+    uint _compute_mean, _compute_stddev;
+    vx_array _mean_vx_array, _stddev_vx_array;
+    std::vector<unsigned> _axes;
+    std::vector<float> _mean, _std_dev;
+    float _scale, _shift;
+    std::vector<std::vector<uint32_t>> _normalize_roi;
+};
\ No newline at end of file
diff --git a/rocAL/include/augmentations/geometry_augmentations/node_transpose.h b/rocAL/include/augmentations/geometry_augmentations/node_transpose.h
new file mode 100644
index 000000000..d8b6e94c1
--- /dev/null
+++ b/rocAL/include/augmentations/geometry_augmentations/node_transpose.h
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "node.h"
+#include "parameter_vx.h"
+
+class TransposeNode : public Node {
+   public:
+    TransposeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
+    TransposeNode() = delete;
+    void init(std::vector<unsigned> perm);
+
+   protected:
+    void create_node() override;
+    void update_node() override {};
+
+   private:
+    std::vector<unsigned> _perm;
+    vx_array _perm_array;
+};
diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index 26689e03f..71bcc1d98 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -205,10 +205,14 @@ class TensorInfo {
             get_modified_dims_from_layout(_layout, layout, new_dims);
             _dims = new_dims;
             modify_strides();
+            _max_shape.assign(_dims.begin() + 1, _dims.end());
         }
         _layout = layout;
-        if (_layout == RocalTensorlayout::NONE)
-            set_max_shape();
+        if (_layout == RocalTensorlayout::NHWC || _layout == RocalTensorlayout::NDHWC) {
+            _channels = _dims.back();
+        } else if (_layout == RocalTensorlayout::NCHW || _layout == RocalTensorlayout::NCDHW) {
+            _channels = _dims.at(1);
+        }
     }
     void set_dims(std::vector<size_t>& new_dims) {
         if (_num_of_dims == new_dims.size()) {
@@ -249,13 +253,14 @@ class TensorInfo {
     }
     void modify_dims(RocalTensorlayout layout, std::vector<int> new_dims) {
         switch (_layout) {
-            case RocalTensorlayout::NDHWC: {
+            case RocalTensorlayout::NHWC:
+            case RocalTensorlayout::NCHW: {
                 _max_shape[0] = _dims[1] = new_dims[0];
                 _max_shape[1] = _dims[2] = new_dims[1];
                 _max_shape[2] = _dims[3] = new_dims[2];
-                _max_shape[3] = _dims[4] = new_dims[3];
                 break;
             }
+            case RocalTensorlayout::NDHWC:
             case RocalTensorlayout::NCDHW: {
                 _max_shape[0] = _dims[1] = new_dims[0];
                 _max_shape[1] = _dims[2] = new_dims[1];
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index c740eadc5..efd233c93 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -1262,6 +1262,37 @@ rocalSlice(
     return output;
 }
 
+RocalTensor ROCAL_API_CALL
+rocalTranspose(
+    RocalContext p_context,
+    RocalTensor p_input,
+    std::vector<unsigned> perm,
+    bool is_output,
+    RocalTensorLayout output_layout,
+    RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input image")
+        return output;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        output_info.set_tensor_layout(op_tensor_layout);
+        output_info.set_data_type(op_tensor_datatype);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        std::shared_ptr<TransposeNode> transpose_node = context->master_graph->add_node<TransposeNode>({input}, {output});
+        transpose_node->init(perm);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalFlip(
     RocalContext p_context,
@@ -1887,6 +1918,35 @@ rocalColorTwistFixed(
     return output;
 }
 
+RocalTensor ROCAL_API_CALL
+rocalNormalize(RocalContext p_context, RocalTensor p_input, std::vector<unsigned> &axes, 
+                         std::vector<float>& mean, std::vector<float>& std_dev, bool is_output,
+                         float scale, float shift,
+                         RocalTensorLayout output_layout,
+                         RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        RocalTensorDataType op_tensor_datatype = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        output_info.set_tensor_layout(op_tensor_layout);
+        output_info.set_data_type(op_tensor_datatype);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        std::shared_ptr<NormalizeNode> normalize_node = context->master_graph->add_node<NormalizeNode>({input}, {output});
+        normalize_node->init(axes, mean, std_dev, scale, shift);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalCropMirrorNormalize(RocalContext p_context, RocalTensor p_input, unsigned crop_height,
                          unsigned crop_width, float start_x, float start_y, std::vector<float>& mean,
diff --git a/rocAL/source/augmentations/effects_augmentations/node_normalize.cpp b/rocAL/source/augmentations/effects_augmentations/node_normalize.cpp
new file mode 100644
index 000000000..16bb59798
--- /dev/null
+++ b/rocAL/source/augmentations/effects_augmentations/node_normalize.cpp
@@ -0,0 +1,90 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "node_normalize.h"
+
+#include <graph.h>
+#include <vx_ext_rpp.h>
+
+#include "exception.h"
+
+NormalizeNode::NormalizeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs) {}
+
+void NormalizeNode::create_node() {
+    if (_node)
+        return;
+    // Builds the vxExtRppNormalize node once; values captured by init() are wrapped in OpenVX arrays/scalars
+    // An empty vector means the caller supplied no values, so the matching compute flag is raised
+    _compute_mean = _mean.empty() ? 1 : 0;
+    _compute_stddev = _std_dev.empty() ? 1 : 0;
+
+    // Repeat the user supplied values once per batch sample. Each vector is replicated on its own, so an
+    // empty or differently sized counterpart is never indexed (reading element 0 of an empty vector is
+    // undefined behaviour) and every supplied value is kept even when only one statistic is given
+    std::vector<float> mean_vec, stddev_vec;
+    for (uint i = 0; i < _batch_size; i++) {
+        mean_vec.insert(mean_vec.end(), _mean.begin(), _mean.end());
+        stddev_vec.insert(stddev_vec.end(), _std_dev.begin(), _std_dev.end());
+    }
+    // Arrays that are not created below are handed over as null handles instead of uninitialized members
+    _mean_vx_array = nullptr;    // NOTE(review): confirm vxExtRppNormalize accepts a null array
+    _stddev_vx_array = nullptr;  // when the corresponding compute flag is set
+    vx_status status = VX_SUCCESS;
+    if (!_compute_mean) {
+        _mean_vx_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, mean_vec.size());
+        status |= vxAddArrayItems(_mean_vx_array, mean_vec.size(), mean_vec.data(), sizeof(vx_float32));
+        if (status != 0)
+            THROW(" vxAddArrayItems failed in the normalize node (vxExtRppNormalize)  node: " + TOSTR(status) + "  " + TOSTR(status))
+    }
+
+    if (!_compute_stddev) {
+        _stddev_vx_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, stddev_vec.size());
+        status |= vxAddArrayItems(_stddev_vx_array, stddev_vec.size(), stddev_vec.data(), sizeof(vx_float32));
+        if (status != 0)
+            THROW(" vxAddArrayItems failed in the normalize node (vxExtRppNormalize)  node: " + TOSTR(status) + "  " + TOSTR(status))
+    }
+    vx_scalar axis_mask = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &_axis_mask);
+    vx_scalar scale = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, &_scale);
+    vx_scalar shift = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, &_shift);
+    vx_scalar compute_mean = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, &_compute_mean);
+    vx_scalar compute_stddev = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, &_compute_stddev);
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int output_layout = static_cast<int>(_outputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar output_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &output_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+
+    _node = vxExtRppNormalize(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), axis_mask,
+                              _mean_vx_array, _stddev_vx_array, compute_mean, compute_stddev, scale, shift, input_layout_vx, output_layout_vx, roi_type_vx);
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the normalize (vxExtRppNormalize) node failed: " + TOSTR(status))
+}
+
+void NormalizeNode::init(std::vector<unsigned> &axes, std::vector<float> &mean, std::vector<float> &std_dev, float scale, float shift) {
+    _mean = mean;
+    _std_dev = std_dev;
+    _scale = scale;
+    _shift = shift;
+    for (unsigned d = 0; d < axes.size(); d++)
+        _axis_mask |= (1 << axes[d]);
+}
\ No newline at end of file
diff --git a/rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp b/rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp
new file mode 100644
index 000000000..9e4376e4b
--- /dev/null
+++ b/rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp
@@ -0,0 +1,51 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <vx_ext_rpp.h>
+#include "node_transpose.h"
+#include "exception.h"
+
+TransposeNode::TransposeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs) {}  // only forwards the tensor lists to Node; the permutation is supplied through init()
+
+void TransposeNode::create_node() {
+    if (_node)
+        return;  // already created; nothing to do
+    // Layouts and ROI type are handed to the kernel as INT32 scalars; the permutation as a UINT32 array.
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int output_layout = static_cast<int>(_outputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar output_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &output_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+    _perm_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, _perm.size());
+    vx_status status = vxAddArrayItems(_perm_array, _perm.size(), _perm.data(), sizeof(vx_uint32));
+    if (status != VX_SUCCESS)  // fail here instead of building the node on an unfilled permutation array
+        THROW("vxAddArrayItems failed in the transpose (vxExtRppTranspose) node: " + TOSTR(status))
+    _node = vxExtRppTranspose(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(),
+                              _perm_array, input_layout_vx, output_layout_vx, roi_type_vx);
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the transpose (vxExtRppTranspose) node failed: " + TOSTR(status))
+}
+
+void TransposeNode::init(std::vector<unsigned> perm) {
+    _perm = perm;  // kept on the node; create_node() uploads it into the vx_array given to vxExtRppTranspose
+}
\ No newline at end of file
diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index 5f0a53a42..f0ac4eaae 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -116,17 +116,17 @@ void TensorInfo::reset_tensor_roi_buffers() {
     auto roi_size = (_layout == RocalTensorlayout::NFCHW || _layout == RocalTensorlayout::NFHWC) ? _dims[0] * _dims[1] : _batch_size;  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
     allocate_host_or_pinned_mem((void **)&roi_buf, roi_size * roi_no_of_dims * 2 * sizeof(unsigned), _mem_type);
     _roi.set_ptr(roi_buf, _mem_type, roi_size, roi_no_of_dims);
-    if (_layout == RocalTensorlayout::NCDHW || _layout == RocalTensorlayout::NDHWC) {
-        for (unsigned i = 0; i < _batch_size; i++) {
-            unsigned *tensor_shape = _roi[i].end;
-            tensor_shape[i] = _max_shape[i];
-        }
-    } else if (_is_image) {
+    if (_is_image) {
         Roi2DCords *roi = _roi.get_2D_roi();
         for (unsigned i = 0; i < _batch_size; i++) {
             roi[i].xywh.w = _max_shape.at(0);
             roi[i].xywh.h = _max_shape.at(1);
         }
+    } else {
+        for (unsigned i = 0; i < _batch_size; i++) {
+            unsigned *tensor_shape = _roi[i].end;
+            tensor_shape[i] = _max_shape[i];
+        }
     }
 }
 
@@ -221,10 +221,8 @@ void Tensor::update_tensor_roi(const std::vector<std::vector<uint32_t>> &shape)
             THROW("The number of dims to be updated and the num of dims of tensor info does not match")
         
         unsigned *tensor_shape = _info.roi()[i].end;
-        if (_info.layout() == RocalTensorlayout::NCDHW || _info.layout() == RocalTensorlayout::NDHWC) {
-            for (unsigned j = 0; j < max_shape.size(); j++) {
-                tensor_shape[j] = shape[i][j] > max_shape[j] ? max_shape[j] : shape[i][j];
-            }
+        for (unsigned j = 0; j < max_shape.size(); j++) {
+            tensor_shape[j] = shape[i][j] > max_shape[j] ? max_shape[j] : shape[i][j];
         }
     }
 }

From 4380c29b86361cf179c603722e097bf42e9e1d0e Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Fri, 8 Dec 2023 18:51:40 +0000
Subject: [PATCH 17/33] Adding pybind changes for deepcam integration

---
 rocAL_pybind/amd/rocal/fn.py                  | 13 ++++
 .../examples/rocAL_api_numpy_reader.py        | 69 +++++++------------
 rocAL_pybind/rocal_pybind.cpp                 |  4 ++
 3 files changed, 42 insertions(+), 44 deletions(-)

diff --git a/rocAL_pybind/amd/rocal/fn.py b/rocAL_pybind/amd/rocal/fn.py
index 371218313..703769a9a 100644
--- a/rocAL_pybind/amd/rocal/fn.py
+++ b/rocAL_pybind/amd/rocal/fn.py
@@ -1148,3 +1148,16 @@ def random_object_bbox(*inputs, format='anchor_shape', background=0, cache_objec
     else:
         print('Wrong format passed to random_object_bbox')
         return ()
+
+def transpose(*inputs, perm=[], output_layout=types.NHWC, output_dtype=types.UINT8):
+    # Arguments for the pybind transpose binding applied to inputs[0]; they are passed positionally, so the dict order must be kept
+    kwargs_pybind = {"input_image": inputs[0], "perm": perm, "is_output": False, "output_layout": output_layout, "output_dtype": output_dtype}
+    transposed_image = b.transpose(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (transposed_image)
+
+def normalize(*inputs, axes=[], mean=[], stddev=[], scale=1.0, shift=0.0, output_layout=types.NHWC, output_dtype=types.UINT8):
+    # Arguments for the pybind normalize binding applied to inputs[0]; they are passed positionally, so the dict order must be kept
+    kwargs_pybind = {"input_image": inputs[0], "axes": axes, "mean": mean, "stddev": stddev, "is_output": False,
+                     "scale": scale, "shift": shift, "output_layout": output_layout, "output_dtype": output_dtype}
+    normalized_image = b.normalize(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (normalized_image)
diff --git a/rocAL_pybind/examples/rocAL_api_numpy_reader.py b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
index e2961eddc..60c797a31 100644
--- a/rocAL_pybind/examples/rocAL_api_numpy_reader.py
+++ b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
@@ -10,8 +10,9 @@
 import sys
 import os, glob
 
-val_cases_list = ['00000', '00003', '00005', '00006', '00012', '00024', '00034', '00041', '00044', '00049', '00052', '00056', '00061', '00065', '00066', '00070', '00076', '00078', '00080', '00084',
-                  '00086', '00087', '00092', '00111', '00112', '00125', '00128', '00138', '00157', '00160', '00161', '00162', '00169', '00171', '00176', '00185', '00187', '00189', '00198', '00203', '00206', '00207']
+
+MEAN = [0.026144592091441154, -88.3379898071289, -84.62094116210938, -78.56366729736328, -77.72217559814453, 7.33015557974337e-12, 48330.79296875, 87595.4296875, 183.57638549804688, 208.38265991210938, -7.185957863625792e-19, 109.64270782470703, 94.19403076171875, -0.37584438920021057, 9952.041015625, 20.362579345703125]
+STDDEV = [108.9710922241211, 174.1948699951172, 173.99221801757812, 155.323486328125, 158.25418090820312, 0.14563894271850586, 58919.42578125, 24443.921875, 64.71000671386719, 77.63092041015625, 3.7348792830016464e-05, 242.97598266601562, 237.60250854492188, 5726.51611328125, 2953.1953125, 51.31494903564453]
 
 def load_data(path, files_pattern):
     data = sorted(glob.glob(os.path.join(path, files_pattern)))
@@ -19,19 +20,10 @@ def load_data(path, files_pattern):
     return data
 
 def get_data_split(path: str):
-    imgs = load_data(path, "*_x.npy")
-    lbls = load_data(path, "*_y.npy")
+    imgs = load_data(path, "data-*.npy")
+    lbls = load_data(path, "label-*.npy")
     assert len(imgs) == len(lbls), f"Found {len(imgs)} volumes but {len(lbls)} corresponding masks"
-    imgs_train, lbls_train, imgs_val, lbls_val = [], [], [], []
-    for (case_img, case_lbl) in zip(imgs, lbls):
-        if case_img.split("_")[-2] in val_cases_list:
-            imgs_val.append(case_img)
-            lbls_val.append(case_lbl)
-        else:
-            imgs_train.append(case_img)
-            lbls_train.append(case_lbl)
-
-    return imgs_train, imgs_val, lbls_train, lbls_val
+    return imgs, lbls
 
 def main():
     if  len(sys.argv) < 3:
@@ -45,63 +37,52 @@ def main():
     except OSError as error:
         print(error)
     data_path = sys.argv[1]
-    if(sys.argv[2] == "cpu"):
+    data_path1 = sys.argv[2]
+    if(sys.argv[3] == "cpu"):
         rocal_cpu = True
     else:
         rocal_cpu = False
-    batch_size = int(sys.argv[3])
+    batch_size = int(sys.argv[4])
     num_threads = 8
     device_id = 0
     local_rank = 0
     world_size = 1
     random_seed = random.SystemRandom().randint(0, 2**32 - 1)
-    x_train, x_val, y_train, y_val = get_data_split(data_path)
+    x_train, y_train = get_data_split(data_path)
+    x_val, y_val = get_data_split(data_path1)
 
     import time
     start = time.time()
-    pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=2)
+    pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
 
     with pipeline:
         numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
-        numpy_reader_output1 = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
-        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NCDHW)
-        label_output = fn.set_layout(numpy_reader_output1, output_layout=types.NCDHW)
-        [roi_start, roi_end] = fn.random_object_bbox(label_output, format="start_end", k_largest=2, foreground_prob=0.4)
-        anchor = fn.roi_random_crop(label_output, roi_start=roi_start, roi_end=roi_end, crop_shape=(1, 128, 128, 128))
-        data_sliced_output = fn.slice(data_output, anchor=anchor, shape=(1,128,128,128), output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        label_sliced_output = fn.slice(label_output, anchor=anchor, shape=(1,128,128,128), output_layout=types.NCDHW, output_dtype=types.UINT8)       
-        hflip = fn.random.coin_flip(probability=0.33)
-        vflip = fn.random.coin_flip(probability=0.33)
-        dflip = fn.random.coin_flip(probability=0.33)
-        data_flip_output = fn.flip(data_sliced_output, horizontal=hflip, vertical=vflip, depth=dflip, output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        label_flip_output = fn.flip(label_sliced_output, horizontal=hflip, vertical=vflip, depth=dflip, output_layout=types.NCDHW, output_dtype=types.UINT8)
-        brightness = fn.random.uniform(range=[0.7, 1.3])
-        add_brightness = fn.random.coin_flip(probability=0.1)
-        brightness_output = fn.brightness(data_flip_output, brightness=brightness, brightness_shift=0.0, conditional_execution=add_brightness, output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        add_noise = fn.random.coin_flip(probability=0.5)
-        std_dev = fn.random.uniform(range=[0.0, 0.1])
-        noise_output = fn.gaussian_noise(brightness_output, mean=0.0, std_dev=std_dev, conditional_execution=add_noise, output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        pipeline.set_outputs(noise_output, label_flip_output)
+        label_output = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
+        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NHWC)
+        normalized_output = fn.normalize(data_output, axes=[0,1], mean=MEAN, stddev=STDDEV, output_layout=types.NHWC, output_dtype=types.FLOAT)
+        transposed_output = fn.transpose(normalized_output, perm=[2,1,0], output_layout=types.NCHW, output_dtype=types.FLOAT)
+        pipeline.set_outputs(transposed_output, label_output)
 
     pipeline.build()
 
     pipeline1 = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
 
     with pipeline1:
-        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_val, shard_id=local_rank, num_shards=world_size)
-        numpy_reader_output1 = fn.readers.numpy(file_root=data_path, files=y_val, shard_id=local_rank, num_shards=world_size)
-        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NCDHW)
-        label_output = fn.set_layout(numpy_reader_output1, output_layout=types.NCDHW)
-        pipeline1.set_outputs(data_output, label_output)
+        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_val, shard_id=local_rank, num_shards=world_size, seed=random_seed+local_rank)
+        label_output = fn.readers.numpy(file_root=data_path, files=y_val, shard_id=local_rank, num_shards=world_size, seed=random_seed+local_rank)
+        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NHWC)
+        normalized_output = fn.normalize(data_output, axes=[0,1], mean=MEAN, stddev=STDDEV, output_layout=types.NHWC, output_dtype=types.FLOAT)
+        transposed_output = fn.transpose(normalized_output, perm=[2,1,0], output_layout=types.NCHW, output_dtype=types.FLOAT)
+        pipeline1.set_outputs(transposed_output, label_output)
 
     pipeline1.build()
     
     numpyIteratorPipeline = ROCALNumpyIterator(pipeline, device='cpu' if rocal_cpu else 'gpu')
     print(len(numpyIteratorPipeline))
-    valNumpyIteratorPipeline = ROCALNumpyIterator(pipeline1, device='cpu' if rocal_cpu else 'gpu', return_roi=True)
+    valNumpyIteratorPipeline = ROCALNumpyIterator(pipeline1, device='cpu' if rocal_cpu else 'gpu')
     print(len(valNumpyIteratorPipeline))
     cnt = 0
-    for epoch in range(100):
+    for epoch in range(2):
         print("+++++++++++++++++++++++++++++EPOCH+++++++++++++++++++++++++++++++++++++",epoch)
         for i , it in enumerate(numpyIteratorPipeline):
             print(i, it[0].shape, it[1].shape)
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index c31e49b03..49399d2ef 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -675,5 +675,9 @@ PYBIND11_MODULE(rocal_pybind, m) {
           py::return_value_policy::reference);
     m.def("slice", &rocalSlice,
           py::return_value_policy::reference);
+    m.def("transpose", &rocalTranspose,
+          py::return_value_policy::reference);
+    m.def("normalize", &rocalNormalize,
+          py::return_value_policy::reference);
 }
 }  // namespace rocal

From 06018412bbeb899ba59554d69ae0c2d66ffa6981 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Fri, 8 Dec 2023 19:56:10 +0000
Subject: [PATCH 18/33] Adding cast augmentation to rocAL

---
 rocAL/include/api/rocal_api_augmentation.h    | 11 +++++
 .../augmentations/augmentations_nodes.h       |  1 +
 rocAL/include/augmentations/node_cast.h       | 36 +++++++++++++++++
 rocAL/source/api/rocal_api_augmentation.cpp   | 29 ++++++++++++++
 rocAL/source/augmentations/node_cast.cpp      | 40 +++++++++++++++++++
 rocAL_pybind/amd/rocal/fn.py                  |  6 +++
 rocAL_pybind/rocal_pybind.cpp                 |  2 +
 7 files changed, 125 insertions(+)
 create mode 100644 rocAL/include/augmentations/node_cast.h
 create mode 100644 rocAL/source/augmentations/node_cast.cpp

diff --git a/rocAL/include/api/rocal_api_augmentation.h b/rocAL/include/api/rocal_api_augmentation.h
index b953211ec..4bc38b820 100644
--- a/rocAL/include/api/rocal_api_augmentation.h
+++ b/rocAL/include/api/rocal_api_augmentation.h
@@ -1200,6 +1200,17 @@ extern "C" RocalTensor ROCAL_API_CALL rocalSSDRandomCrop(RocalContext context, R
                                                          RocalTensorLayout output_layout = ROCAL_NONE,
                                                          RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Casts the input tensor from its current data type to the requested output data type
+ * \param context Rocal context
+ * \param input Input tensor
+ * \param is_output Sets if the output is to be given to the user or kept as an intermediate buffer
+ * \param output_datatype Datatype of the output tensor (defaults to ROCAL_UINT8)
+ * \return Reference to the output tensor; null when the context or the input tensor is null
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalCast(RocalContext context, RocalTensor input,
+                                                bool is_output,
+                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
 extern "C" RocalTensor ROCAL_API_CALL rocalSetLayout(RocalContext context, RocalTensor input,
                                                      RocalTensorLayout output_layout = ROCAL_NONE);
 
diff --git a/rocAL/include/augmentations/augmentations_nodes.h b/rocAL/include/augmentations/augmentations_nodes.h
index e9344b4d4..c01fb0691 100644
--- a/rocAL/include/augmentations/augmentations_nodes.h
+++ b/rocAL/include/augmentations/augmentations_nodes.h
@@ -59,3 +59,4 @@ THE SOFTWARE.
 #include "node_slice.h"
 #include "node_transpose.h"
 #include "node_normalize.h"
+#include "node_cast.h"
diff --git a/rocAL/include/augmentations/node_cast.h b/rocAL/include/augmentations/node_cast.h
new file mode 100644
index 000000000..67930261b
--- /dev/null
+++ b/rocAL/include/augmentations/node_cast.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "node.h"
+#include "graph.h"
+
+class CastNode : public Node  // graph node whose create_node() adds the vxExtRppCast kernel
+{
+public:
+    CastNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
+    CastNode() = delete;  // a cast node is always built from its input/output tensor lists
+
+protected:
+    void create_node() override;
+    void update_node() override {};  // intentionally empty
+};
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index efd233c93..ea1c3344c 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -2326,6 +2326,35 @@ rocalNop(
     return output;
 }
 
+// Appends a cast (or, for an unchanged data type, a copy) node after p_input; the result stays nullptr on null arguments.
+RocalTensor ROCAL_API_CALL rocalCast(RocalContext p_context, RocalTensor p_input,
+                                     bool is_output,
+                                     RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if (!p_context || !p_input) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        const auto requested_type = static_cast<RocalTensorDataType>(output_datatype);
+        if (input->info().data_type() != requested_type) {  // types differ: cast into a retyped clone of the input description
+            TensorInfo cast_info = input->info();
+            cast_info.set_data_type(requested_type);
+            output = context->master_graph->create_tensor(cast_info, is_output);
+            context->master_graph->add_node<CastNode>({input}, {output});
+        } else {  // same type requested: a plain copy node is enough
+            output = context->master_graph->create_tensor(input->info(), is_output);
+            context->master_graph->add_node<CopyNode>({input}, {output});
+        }
+    } catch(const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalSetLayout(
     RocalContext p_context,
diff --git a/rocAL/source/augmentations/node_cast.cpp b/rocAL/source/augmentations/node_cast.cpp
new file mode 100644
index 000000000..d1949560e
--- /dev/null
+++ b/rocAL/source/augmentations/node_cast.cpp
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <vx_ext_rpp.h>
+#include "node_cast.h"
+#include "exception.h"
+
+CastNode::CastNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) :
+        Node(inputs, outputs) {}  // no cast-specific state; the tensor lists are forwarded to Node
+
+void CastNode::create_node() {
+    if (_node)
+        return;  // node already exists
+
+    _node = vxExtRppCast(_graph->get(), _inputs[0]->handle(), _src_tensor_roi, _outputs[0]->handle(), _input_layout, _roi_type);
+
+    // Node creation errors surface through the status of the returned reference.
+    vx_status status = vxGetStatus((vx_reference)_node);
+    if (status != VX_SUCCESS)
+        THROW("Adding the cast (vxExtRppCast) node failed: " + TOSTR(status))
+}
diff --git a/rocAL_pybind/amd/rocal/fn.py b/rocAL_pybind/amd/rocal/fn.py
index 703769a9a..3ae9b5b06 100644
--- a/rocAL_pybind/amd/rocal/fn.py
+++ b/rocAL_pybind/amd/rocal/fn.py
@@ -1161,3 +1161,9 @@ def normalize(*inputs, axes=[], mean=[], stddev=[], scale=1.0, shift=0.0, output
                      "scale": scale, "shift": shift, "output_layout": output_layout, "output_dtype": output_dtype}
     normalized_image = b.normalize(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (normalized_image)
+
+def cast(*inputs, output_dtype=types.UINT8):
+    # Casts inputs[0] to output_dtype; values are passed positionally to the pybind cast binding (rocalCast), so keep the dict order
+    kwargs_pybind = {"input_image": inputs[0], "is_output": False, "output_dtype": output_dtype}
+    cast_image = b.cast(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
+    return (cast_image)
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 49399d2ef..d125717f0 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -679,5 +679,7 @@ PYBIND11_MODULE(rocal_pybind, m) {
           py::return_value_policy::reference);
     m.def("normalize", &rocalNormalize,
           py::return_value_policy::reference);
+    m.def("cast", &rocalCast,
+          py::return_value_policy::reference);
 }
 }  // namespace rocal

From 510a83dad1d57da3ac17cef1493e8e0884f9988e Mon Sep 17 00:00:00 2001
From: Swetha B S <swetha@multicorewareinc.com>
Date: Thu, 14 Dec 2023 14:04:52 +0000
Subject: [PATCH 19/33] Resolve PR comments - 1

---
 rocAL/include/api/rocal_api_data_loaders.h         | 14 +++++++-------
 rocAL/include/loaders/image/node_numpy_loader.h    |  4 ++--
 .../loaders/image/node_numpy_loader_single_shard.h |  2 +-
 rocAL/include/loaders/image/numpy_loader.h         |  3 +--
 rocAL/include/loaders/image/numpy_loader_sharded.h |  3 +--
 rocAL/include/pipeline/tensor.h                    |  2 +-
 rocAL/include/readers/image/numpy_data_reader.h    |  2 +-
 rocAL/source/loaders/image/node_numpy_loader.cpp   |  2 +-
 .../image/node_numpy_loader_single_shard.cpp       |  2 +-
 rocAL/source/loaders/image/numpy_loader.cpp        |  2 +-
 .../source/loaders/image/numpy_loader_sharded.cpp  |  2 +-
 rocAL/source/readers/image/numpy_data_reader.cpp   |  2 +-
 12 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/rocAL/include/api/rocal_api_data_loaders.h b/rocAL/include/api/rocal_api_data_loaders.h
index 430254736..0e53f8553 100644
--- a/rocAL/include/api/rocal_api_data_loaders.h
+++ b/rocAL/include/api/rocal_api_data_loaders.h
@@ -588,13 +588,13 @@ extern "C" RocalTensor ROCAL_API_CALL rocalRawTFRecordSourceSingleShard(RocalCon
  * \return Reference to the output tensor
  */
 extern "C"  RocalTensor  ROCAL_API_CALL rocalNumpyFileSource(
-                 RocalContext p_context,
-                 const char* source_path,
-                 unsigned internal_shard_count,
-                 bool is_output = false,
-                 bool shuffle = false,
-                 bool loop = false,
-                 RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE);
+                                        RocalContext p_context,
+                                        const char* source_path,
+                                        unsigned internal_shard_count,
+                                        bool is_output = false,
+                                        bool shuffle = false,
+                                        bool loop = false,
+                                        RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE);
 
 /*! \brief Creates Numpy raw data reader and loader. It allocates the resources and objects required to read raw data stored on the numpy arrays.
  * \ingroup group_rocal_data_loaders
diff --git a/rocAL/include/loaders/image/node_numpy_loader.h b/rocAL/include/loaders/image/node_numpy_loader.h
index 5e2a5975f..587f89e1a 100644
--- a/rocAL/include/loaders/image/node_numpy_loader.h
+++ b/rocAL/include/loaders/image/node_numpy_loader.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -50,4 +50,4 @@ class NumpyLoaderNode : public Node {
 
    private:
     std::shared_ptr<NumpyLoaderSharded> _loader_module = nullptr;
-};
\ No newline at end of file
+};
diff --git a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
index d2ce4a1f6..c1cffba54 100644
--- a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
+++ b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/include/loaders/image/numpy_loader.h b/rocAL/include/loaders/image/numpy_loader.h
index 3b8fe4d24..b10cfc1a8 100644
--- a/rocAL/include/loaders/image/numpy_loader.h
+++ b/rocAL/include/loaders/image/numpy_loader.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,7 +29,6 @@ THE SOFTWARE.
 #include "circular_buffer.h"
 #include "commons.h"
 #include "image_read_and_decode.h"
-// #include "numpy_data_reader.h"
 //
 // NumpyLoader runs an internal thread for loading an decoding of numpy arrays asynchronously
 // it uses a circular buffer to store decoded numpy arrays for the user
diff --git a/rocAL/include/loaders/image/numpy_loader_sharded.h b/rocAL/include/loaders/image/numpy_loader_sharded.h
index b13f93f30..acd3eb6dd 100644
--- a/rocAL/include/loaders/image/numpy_loader_sharded.h
+++ b/rocAL/include/loaders/image/numpy_loader_sharded.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,6 @@ THE SOFTWARE.
 */
 
 #pragma once
-#include <vector>
 
 #include "numpy_loader.h"
 //
diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index 428dfe38d..82b2e30ef 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/include/readers/image/numpy_data_reader.h b/rocAL/include/readers/image/numpy_data_reader.h
index faf881448..2ef319039 100644
--- a/rocAL/include/readers/image/numpy_data_reader.h
+++ b/rocAL/include/readers/image/numpy_data_reader.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/node_numpy_loader.cpp b/rocAL/source/loaders/image/node_numpy_loader.cpp
index 63b8cd2ae..eeb51d35d 100644
--- a/rocAL/source/loaders/image/node_numpy_loader.cpp
+++ b/rocAL/source/loaders/image/node_numpy_loader.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
index bc1a68b0c..705dd9561 100644
--- a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
+++ b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/numpy_loader.cpp b/rocAL/source/loaders/image/numpy_loader.cpp
index 9fd856793..c43a25aa1 100644
--- a/rocAL/source/loaders/image/numpy_loader.cpp
+++ b/rocAL/source/loaders/image/numpy_loader.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/numpy_loader_sharded.cpp b/rocAL/source/loaders/image/numpy_loader_sharded.cpp
index 8399abf11..c8413bc3b 100644
--- a/rocAL/source/loaders/image/numpy_loader_sharded.cpp
+++ b/rocAL/source/loaders/image/numpy_loader_sharded.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index 1f1cc7b6b..e088801ec 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From b28e61cb3cb2bf449e317820d33e7fd196d82f11 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Fri, 15 Dec 2023 07:58:35 +0000
Subject: [PATCH 20/33] Adding header caching, files and seed options to numpy
 reader

---
 rocAL/include/api/rocal_api_data_loaders.h    |  20 +--
 .../include/loaders/image/node_numpy_loader.h |   4 +-
 .../image/node_numpy_loader_single_shard.h    |   4 +-
 rocAL/include/readers/image/image_reader.h    |   6 +
 .../include/readers/image/numpy_data_reader.h |   6 +
 rocAL/source/api/rocal_api_data_loaders.cpp   |  23 ++--
 .../loaders/image/node_numpy_loader.cpp       |   8 +-
 .../image/node_numpy_loader_single_shard.cpp  |   8 +-
 .../readers/image/numpy_data_reader.cpp       | 127 +++++++++++-------
 rocAL_pybind/amd/rocal/plugin/pytorch.py      |  25 +++-
 rocAL_pybind/amd/rocal/readers.py             |   8 +-
 .../rocAL_unittests/rocAL_unittests.cpp       |   2 +-
 12 files changed, 164 insertions(+), 77 deletions(-)

diff --git a/rocAL/include/api/rocal_api_data_loaders.h b/rocAL/include/api/rocal_api_data_loaders.h
index 0e53f8553..626465e16 100644
--- a/rocAL/include/api/rocal_api_data_loaders.h
+++ b/rocAL/include/api/rocal_api_data_loaders.h
@@ -588,13 +588,15 @@ extern "C" RocalTensor ROCAL_API_CALL rocalRawTFRecordSourceSingleShard(RocalCon
  * \return Reference to the output tensor
  */
 extern "C"  RocalTensor  ROCAL_API_CALL rocalNumpyFileSource(
-                                        RocalContext p_context,
-                                        const char* source_path,
-                                        unsigned internal_shard_count,
-                                        bool is_output = false,
-                                        bool shuffle = false,
-                                        bool loop = false,
-                                        RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE);
+                 RocalContext p_context,
+                 const char* source_path,
+                 unsigned internal_shard_count,
+                 std::vector<std::string> files = {},
+                 bool is_output = false,
+                 bool shuffle = false,
+                 bool loop = false,
+                 RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE,
+                 unsigned seed = 0);
 
 /*! \brief Creates Numpy raw data reader and loader. It allocates the resources and objects required to read raw data stored on the numpy arrays.
  * \ingroup group_rocal_data_loaders
@@ -611,12 +613,14 @@ extern "C"  RocalTensor  ROCAL_API_CALL rocalNumpyFileSource(
 extern "C"  RocalTensor  rocalNumpyFileSourceSingleShard(
                  RocalContext p_context,
                  const char* source_path,
+                 std::vector<std::string> files = {},
                  bool is_output = false,
                  bool shuffle = false,
                  bool loop = false,
                  RocalImageSizeEvaluationPolicy decode_size_policy = ROCAL_USE_MAX_SIZE,
                  unsigned shard_id = 0,
-                 unsigned shard_count = 1);
+                 unsigned shard_count = 1,
+                 unsigned seed = 0);
 
 /*!
  * \brief Creates a video reader and decoder as a source. It allocates the resources and objects required to read and decode mp4 videos stored on the file systems.
diff --git a/rocAL/include/loaders/image/node_numpy_loader.h b/rocAL/include/loaders/image/node_numpy_loader.h
index 587f89e1a..49918e4f5 100644
--- a/rocAL/include/loaders/image/node_numpy_loader.h
+++ b/rocAL/include/loaders/image/node_numpy_loader.h
@@ -39,8 +39,8 @@ class NumpyLoaderNode : public Node {
     /// \param load_batch_count Defines the quantum count of the images to be loaded. It's usually equal to the user's batch size.
     /// The loader will repeat images if necessary to be able to have images in multiples of the load_batch_count,
     /// for example if there are 10 images in the dataset and load_batch_count is 3, the loader repeats 2 images as if there are 12 images available.
-    void init(unsigned internal_shard_count, const std::string &source_path, const std::string &json_path, StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
-              size_t load_batch_count, RocalMemType mem_type, bool decoder_keep_orig = false, const std::map<std::string, std::string> feature_key_map = std::map<std::string, std::string>(), const char *prefix = "", unsigned sequence_length = 0, unsigned step = 0, unsigned stride = 0);
+    void init(unsigned internal_shard_count, const std::string &source_path, const std::vector<std::string> &files, StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
+              size_t load_batch_count, RocalMemType mem_type, unsigned seed = 0, bool decoder_keep_orig = false, const std::map<std::string, std::string> feature_key_map = std::map<std::string, std::string>(), const char *prefix = "", unsigned sequence_length = 0, unsigned step = 0, unsigned stride = 0);
 
     std::shared_ptr<LoaderModule> get_loader_module();
 
diff --git a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
index c1cffba54..cd3b464e7 100644
--- a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
+++ b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
@@ -36,9 +36,9 @@ class NumpyLoaderSingleShardNode : public Node {
     /// \param load_batch_count Defines the quantum count of the images to be loaded. It's usually equal to the user's batch size.
     /// The loader will repeat images if necessary to be able to have images in multiples of the load_batch_count,
     /// for example if there are 10 images in the dataset and load_batch_count is 3, the loader repeats 2 images as if there are 12 images available.
-    void init(unsigned shard_id, unsigned shard_count, const std::string &source_path, const std::string &json_path,
+    void init(unsigned shard_id, unsigned shard_count, const std::string &source_path, const std::vector<std::string> &files,
               StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
-              size_t load_batch_count, RocalMemType mem_type, bool decoder_keep_orig = false, const std::map<std::string, std::string> feature_key_map = std::map<std::string, std::string>(), unsigned sequence_length = 0, unsigned step = 0, unsigned stride = 0);
+              size_t load_batch_count, RocalMemType mem_type, unsigned seed = 0, bool decoder_keep_orig = false, const std::map<std::string, std::string> feature_key_map = std::map<std::string, std::string>(), unsigned sequence_length = 0, unsigned step = 0, unsigned stride = 0);
 
     std::shared_ptr<LoaderModule> get_loader_module();
 
diff --git a/rocAL/include/readers/image/image_reader.h b/rocAL/include/readers/image/image_reader.h
index 6269e1781..75e44ff8a 100644
--- a/rocAL/include/readers/image/image_reader.h
+++ b/rocAL/include/readers/image/image_reader.h
@@ -73,6 +73,8 @@ struct ReaderConfig {
     void set_sequence_length(unsigned sequence_length) { _sequence_length = sequence_length; }
     void set_frame_step(unsigned step) { _sequence_frame_step = step; }
     void set_frame_stride(unsigned stride) { _sequence_frame_stride = stride; }
+    void set_files(const std::vector<std::string> &files) { _files = files; }
+    void set_seed(unsigned seed) { _seed = seed; }
     size_t get_shard_count() { return _shard_count; }
     size_t get_shard_id() { return _shard_id; }
     size_t get_cpu_num_threads() { return _cpu_num_threads; }
@@ -80,7 +82,9 @@ struct ReaderConfig {
     size_t get_sequence_length() { return _sequence_length; }
     size_t get_frame_step() { return _sequence_frame_step; }
     size_t get_frame_stride() { return _sequence_frame_stride; }
+    std::vector<std::string> get_files() { return _files; }
     std::string path() { return _path; }
+    unsigned seed() { return _seed; }
 #ifdef ROCAL_VIDEO
     void set_video_properties(VideoProperties video_prop) { _video_prop = video_prop; }
     VideoProperties get_video_properties() { return _video_prop; }
@@ -107,6 +111,8 @@ struct ReaderConfig {
     bool _loop = false;
     std::string _file_prefix = "";  //!< to read only files with prefix. supported only for cifar10_data_reader and tf_record_reader
     std::shared_ptr<MetaDataReader> _meta_data_reader = nullptr;
+    std::vector<std::string> _files;
+    unsigned _seed = 0;
 #ifdef ROCAL_VIDEO
     VideoProperties _video_prop;
 #endif
diff --git a/rocAL/include/readers/image/numpy_data_reader.h b/rocAL/include/readers/image/numpy_data_reader.h
index 2ef319039..e4fa7cc63 100644
--- a/rocAL/include/readers/image/numpy_data_reader.h
+++ b/rocAL/include/readers/image/numpy_data_reader.h
@@ -77,6 +77,7 @@ class NumpyDataReader : public Reader {
     DIR* _sub_dir;
     struct dirent* _entity;
     std::vector<std::string> _file_names;
+    std::vector<std::string> _files;
     std::vector<NumpyHeaderData> _file_headers;
     unsigned _curr_file_idx;
     FILE* _current_fPtr;
@@ -94,8 +95,11 @@ class NumpyDataReader : public Reader {
     bool _loop;
     bool _shuffle;
     int _read_counter = 0;
+    unsigned _seed = 0;
     //!< _file_count_all_shards total_number of files in to figure out the max_batch_size (usually needed for distributed training).
     size_t _file_count_all_shards;
+    std::mutex _cache_mutex_;
+    std::map<std::string, NumpyHeaderData> _header_cache_;
     const RocalTensorDataType TypeFromNumpyStr(const std::string& format);
     inline void SkipSpaces(const char*& ptr);
     void ParseHeaderContents(NumpyHeaderData& target, const std::string& header);
@@ -111,6 +115,8 @@ class NumpyDataReader : public Reader {
     void ParseHeader(NumpyHeaderData& parsed_header, std::string file_path);
     template <typename T>
     size_t ParseNumpyData(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim = 0);
+    bool GetFromCache(const std::string& file_name, NumpyHeaderData& target);
+    void UpdateCache(const std::string& file_name, const NumpyHeaderData& value);   
     void incremenet_read_ptr();
     int release();
     size_t get_file_shard_id();
diff --git a/rocAL/source/api/rocal_api_data_loaders.cpp b/rocAL/source/api/rocal_api_data_loaders.cpp
index 44dde077a..6f11c2a92 100644
--- a/rocAL/source/api/rocal_api_data_loaders.cpp
+++ b/rocAL/source/api/rocal_api_data_loaders.cpp
@@ -74,7 +74,7 @@ evaluate_image_data_set(RocalImageSizeEvaluationPolicy decode_size_policy, Stora
 
 std::vector<size_t>
 evaluate_numpy_data_set(RocalImageSizeEvaluationPolicy decode_size_policy, StorageType storage_type,
-                        DecoderType decoder_type, const std::string &source_path, const std::string &json_path)
+                        DecoderType decoder_type, const std::string &source_path, const std::vector<std::string> &files)
 {
     auto translate_image_size_policy = [](RocalImageSizeEvaluationPolicy decode_size_policy)
     {
@@ -92,7 +92,10 @@ evaluate_numpy_data_set(RocalImageSizeEvaluationPolicy decode_size_policy, Stora
 
     ImageSourceEvaluator source_evaluator;
     source_evaluator.set_size_evaluation_policy(translate_image_size_policy(decode_size_policy));
-    if(source_evaluator.create(ReaderConfig(storage_type, source_path, json_path)) != ImageSourceEvaluatorStatus::OK)
+    auto reader_cfg = ReaderConfig(storage_type, source_path);
+    if (!files.empty())
+        reader_cfg.set_files(files);
+    if (source_evaluator.create(reader_cfg) != ImageSourceEvaluatorStatus::OK)
         THROW("Initializing file source input evaluator failed ")
     auto max_dims = source_evaluator.max_numpy_dims();
     int data_type = (int)source_evaluator.get_numpy_dtype();
@@ -1636,15 +1639,17 @@ rocalNumpyFileSource(
     RocalContext p_context,
     const char* source_path,
     unsigned internal_shard_count,
+    std::vector<std::string> files,
     bool is_output,
     bool shuffle,
     bool loop,
-    RocalImageSizeEvaluationPolicy decode_size_policy) {
+    RocalImageSizeEvaluationPolicy decode_size_policy,
+    unsigned seed) {
     Tensor* output = nullptr;
     auto context = static_cast<Context*>(p_context);
     try {
         auto max_dimensions = evaluate_numpy_data_set(decode_size_policy, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE,
-                                                      source_path, "");
+                                                      source_path, files);
 
         RocalTensorlayout tensor_format = RocalTensorlayout::NONE;
         RocalTensorDataType tensor_data_type;
@@ -1672,7 +1677,7 @@ rocalNumpyFileSource(
         info.set_max_shape();
         output = context->master_graph->create_loader_output_tensor(info);
 
-        context->master_graph->add_node<NumpyLoaderNode>({}, {output})->init(internal_shard_count, source_path, "", StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE, shuffle, loop, context->user_batch_size(), context->master_graph->mem_type());
+        context->master_graph->add_node<NumpyLoaderNode>({}, {output})->init(internal_shard_count, source_path, files, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE, shuffle, loop, context->user_batch_size(), context->master_graph->mem_type(), seed);
         context->master_graph->set_loop(loop);
 
         if (is_output) {
@@ -1691,12 +1696,14 @@ RocalTensor ROCAL_API_CALL
 rocalNumpyFileSourceSingleShard(
     RocalContext p_context,
     const char* source_path,
+    std::vector<std::string> files,
     bool is_output,
     bool shuffle,
     bool loop,
     RocalImageSizeEvaluationPolicy decode_size_policy,
     unsigned shard_id,
-    unsigned shard_count) {
+    unsigned shard_count,
+    unsigned seed) {
     Tensor* output = nullptr;
     auto context = static_cast<Context*>(p_context);
     try {
@@ -1707,7 +1714,7 @@ rocalNumpyFileSourceSingleShard(
             THROW("Shard id should be smaller than shard count")
 
         auto max_dimensions = evaluate_numpy_data_set(decode_size_policy, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE,
-                                                      source_path, "");
+                                                      source_path, files);
 
         RocalTensorlayout tensor_format = RocalTensorlayout::NONE;
         RocalTensorDataType tensor_data_type;
@@ -1735,7 +1742,7 @@ rocalNumpyFileSourceSingleShard(
         info.set_max_shape();
         output = context->master_graph->create_loader_output_tensor(info);
 
-        context->master_graph->add_node<NumpyLoaderSingleShardNode>({}, {output})->init(shard_id, shard_count, source_path, "", StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE, shuffle, loop, context->user_batch_size(), context->master_graph->mem_type());
+        context->master_graph->add_node<NumpyLoaderSingleShardNode>({}, {output})->init(shard_id, shard_count, source_path, files, StorageType::NUMPY_DATA, DecoderType::SKIP_DECODE, shuffle, loop, context->user_batch_size(), context->master_graph->mem_type(), seed);
         context->master_graph->set_loop(loop);
 
         if (is_output) {
diff --git a/rocAL/source/loaders/image/node_numpy_loader.cpp b/rocAL/source/loaders/image/node_numpy_loader.cpp
index eeb51d35d..3f5319490 100644
--- a/rocAL/source/loaders/image/node_numpy_loader.cpp
+++ b/rocAL/source/loaders/image/node_numpy_loader.cpp
@@ -28,18 +28,20 @@ NumpyLoaderNode::NumpyLoaderNode(Tensor *output, void *device_resources) : Node(
     _loader_module = std::make_shared<NumpyLoaderSharded>(device_resources);
 }
 
-void NumpyLoaderNode::init(unsigned internal_shard_count, const std::string &source_path, const std::string &json_path, StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
-                           size_t load_batch_count, RocalMemType mem_type, bool decoder_keep_orig, const std::map<std::string, std::string> feature_key_map, const char *file_prefix, unsigned sequence_length, unsigned step, unsigned stride) {
+void NumpyLoaderNode::init(unsigned internal_shard_count, const std::string &source_path, const std::vector<std::string> &files, StorageType storage_type, DecoderType decoder_type, bool shuffle, bool loop,
+                           size_t load_batch_count, RocalMemType mem_type, unsigned seed, bool decoder_keep_orig, const std::map<std::string, std::string> feature_key_map, const char *file_prefix, unsigned sequence_length, unsigned step, unsigned stride) {
     if (!_loader_module)
         THROW("ERROR: loader module is not set for NumpyLoaderNode, cannot initialize")
     if (internal_shard_count < 1)
         THROW("Shard count should be greater than or equal to one")
     _loader_module->set_output(_outputs[0]);
     // Set reader and decoder config accordingly for the NumpyLoaderNode
-    auto reader_cfg = ReaderConfig(storage_type, source_path, json_path, feature_key_map, shuffle, loop);
+    auto reader_cfg = ReaderConfig(storage_type, source_path, "", feature_key_map, shuffle, loop);
     reader_cfg.set_shard_count(internal_shard_count);
     reader_cfg.set_batch_count(load_batch_count);
     reader_cfg.set_file_prefix(file_prefix);
+    reader_cfg.set_files(files);
+    reader_cfg.set_seed(seed);
     //  sequence_length, step and stride parameters used only for SequenceReader
     reader_cfg.set_sequence_length(sequence_length);
     reader_cfg.set_frame_step(step);
diff --git a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
index 705dd9561..ed9d3730a 100644
--- a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
+++ b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
@@ -28,8 +28,8 @@ NumpyLoaderSingleShardNode::NumpyLoaderSingleShardNode(Tensor *output, void *dev
     _loader_module = std::make_shared<NumpyLoader>(device_resources);
 }
 
-void NumpyLoaderSingleShardNode::init(unsigned shard_id, unsigned shard_count, const std::string &source_path, const std::string &json_path, StorageType storage_type, DecoderType decoder_type,
-                                      bool shuffle, bool loop, size_t load_batch_count, RocalMemType mem_type,
+void NumpyLoaderSingleShardNode::init(unsigned shard_id, unsigned shard_count, const std::string &source_path, const std::vector<std::string> &files, StorageType storage_type, DecoderType decoder_type,
+                                      bool shuffle, bool loop, size_t load_batch_count, RocalMemType mem_type, unsigned seed,
                                       bool decoder_keep_original, const std::map<std::string, std::string> feature_key_map, unsigned sequence_length, unsigned step, unsigned stride) {
     if (!_loader_module)
         THROW("ERROR: loader module is not set for NumpyLoaderNode, cannot initialize")
@@ -39,10 +39,12 @@ void NumpyLoaderSingleShardNode::init(unsigned shard_id, unsigned shard_count, c
         THROW("Shard is should be smaller than shard count")
     _loader_module->set_output(_outputs[0]);
     // Set reader and decoder config accordingly for the NumpyLoaderNode
-    auto reader_cfg = ReaderConfig(storage_type, source_path, json_path, feature_key_map, shuffle, loop);
+    auto reader_cfg = ReaderConfig(storage_type, source_path, "", feature_key_map, shuffle, loop);
     reader_cfg.set_shard_count(shard_count);
     reader_cfg.set_shard_id(shard_id);
     reader_cfg.set_batch_count(load_batch_count);
+    reader_cfg.set_files(files);
+    reader_cfg.set_seed(seed);
     //  sequence_length, step and stride parameters used only for SequenceReader
     reader_cfg.set_sequence_length(sequence_length);
     reader_cfg.set_frame_step(step);
diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index ae24a4efa..67605d508 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 #include <algorithm>
 #include <numeric>
+#include <random>
 #include <boost/filesystem.hpp>
 #include <cassert>
 
@@ -61,9 +62,11 @@ Reader::Status NumpyDataReader::initialize(ReaderConfig desc) {
     _batch_count = desc.get_batch_size();
     _shuffle = desc.shuffle();
     _loop = desc.loop();
+    _files = desc.get_files();
+    _seed = desc.seed();
     ret = subfolder_reading();
     // the following code is required to make every shard the same size:: required for multi-gpu training
-    if (_shard_count > 1 && _batch_count > 1) {
+    if (_shard_count > 1 && _batch_count > 1 && _files.empty()) {
         int _num_batches = _file_names.size() / _batch_count;
         int max_batches_per_shard = (_file_count_all_shards + _shard_count - 1) / _shard_count;
         max_batches_per_shard = (max_batches_per_shard + _batch_count - 1) / _batch_count;
@@ -74,8 +77,10 @@ Reader::Status NumpyDataReader::initialize(ReaderConfig desc) {
     _file_headers.resize(_file_names.size());
     // shuffle dataset if set
     _shuffle_time.start();
-    if (ret == Reader::Status::OK && _shuffle)
-        std::random_shuffle(_file_names.begin(), _file_names.end());
+    if (ret == Reader::Status::OK && _shuffle) {
+        std::mt19937 rng(_seed);
+        std::shuffle(_file_names.begin(), _file_names.end(), rng);
+    }
     _shuffle_time.end();
     return ret;
 }
@@ -94,12 +99,36 @@ size_t NumpyDataReader::open() {
         _last_id.erase(0, last_slash_idx + 1);
     }
 
-    ParseHeader(_file_headers[_curr_file_idx], file_path);
+    auto ret = GetFromCache(file_path, _file_headers[_curr_file_idx]);
+    if (!ret) {
+        ParseHeader(_file_headers[_curr_file_idx], file_path);
+        UpdateCache(file_path, _file_headers[_curr_file_idx]);
+    } else {
+        _current_fPtr = std::fopen(file_path.c_str(), "rb");
+        if (_current_fPtr == nullptr)
+            THROW("Could not open file " + file_path + ": " + std::strerror(errno));
+    }
     fseek(_current_fPtr, 0, SEEK_SET);  // Take the file pointer back to the start
 
     return _file_headers[_curr_file_idx].nbytes();
 }
 
+// Fetches the header cached under `file_name` into `header`.
+// Returns true on a cache hit; on a miss returns false and leaves `header` untouched.
+bool NumpyDataReader::GetFromCache(const std::string& file_name, NumpyHeaderData& header) {
+    std::lock_guard<std::mutex> guard(_cache_mutex_);  // held for the whole lookup and copy
+    const auto cached = _header_cache_.find(file_name);
+    if (cached == _header_cache_.end())
+        return false;
+    header = cached->second;
+    return true;
+}
+
+void NumpyDataReader::UpdateCache(const std::string& file_name, const NumpyHeaderData& value) {  // records `value` as the cached header for `file_name`
+    std::unique_lock<std::mutex> cache_lock(_cache_mutex_);  // hold the cache mutex while the map is modified
+    _header_cache_[file_name] = value;  // creates the entry if absent, otherwise overwrites it
+}
+
 const RocalTensorDataType NumpyDataReader::TypeFromNumpyStr(const std::string& format) {
     if (format == "u1") return RocalTensorDataType::UINT8;
     // if (format == "u2") return TypeTable::GetTypeInfo<uint16_t>();   // Currently not supported in rocAL
@@ -378,18 +407,8 @@ int NumpyDataReader::release() {
 void NumpyDataReader::reset() {
     _shuffle_time.start();
     if (_shuffle) {
-        std::vector<std::string> shuffled_filenames;
-        std::vector<NumpyHeaderData> shuffled_headers;
-        std::vector<int> indexes(_file_names.size());
-        std::iota(indexes.begin(), indexes.end(), 0);
-        // Shuffle the index vector and use the index to fetch batch size elements for decoding
-        std::random_shuffle(indexes.begin(), indexes.end());
-        for (auto const idx : indexes) {
-            shuffled_filenames.push_back(_file_names[idx]);
-            shuffled_headers.push_back(_file_headers[idx]);
-        }
-        _file_names = shuffled_filenames;
-        _file_headers = shuffled_headers;
+        std::mt19937 rng(_seed);
+        std::shuffle(_file_names.begin(), _file_names.end(), rng);
     }
     _shuffle_time.end();
     _read_counter = 0;
@@ -397,38 +416,56 @@ void NumpyDataReader::reset() {
 }
 
 Reader::Status NumpyDataReader::subfolder_reading() {
-    if ((_sub_dir = opendir(_folder_path.c_str())) == nullptr)
-        THROW("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] ERROR: Failed opening the directory at " + _folder_path);
-
-    std::vector<std::string> entry_name_list;
-    std::string _full_path = _folder_path;
+    auto ret = Reader::Status::OK;
+    if (!_files.empty()) {
+        for (unsigned file_count = 0; file_count < _files.size(); file_count++) {
+            std::string file_path = _files[file_count];
+            filesys::path pathObj(file_path);
+            if (filesys::exists(pathObj) && filesys::is_regular_file(pathObj)) {
+                // keep only paths whose extension is exactly "npy"; everything else is skipped
+                auto file_extension_idx = file_path.find_last_of(".");
+                if (file_extension_idx != std::string::npos) {
+                    std::string file_extension = file_path.substr(file_extension_idx + 1);
+                    if (file_extension != "npy")
+                        continue;
+                    else
+                        _file_names.push_back(file_path);
+                }
+            }
+        }
+    } else {
+        if ((_sub_dir = opendir(_folder_path.c_str())) == nullptr)
+            THROW("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] ERROR: Failed opening the directory at " + _folder_path);
 
-    while ((_entity = readdir(_sub_dir)) != nullptr) {
-        std::string entry_name(_entity->d_name);
-        if (strcmp(_entity->d_name, ".") == 0 || strcmp(_entity->d_name, "..") == 0) continue;
-        entry_name_list.push_back(entry_name);
-    }
-    closedir(_sub_dir);
-    std::sort(entry_name_list.begin(), entry_name_list.end());
+        std::vector<std::string> entry_name_list;
+        std::string _full_path = _folder_path;
 
-    auto ret = Reader::Status::OK;
-    for (unsigned dir_count = 0; dir_count < entry_name_list.size(); ++dir_count) {
-        std::string subfolder_path = _full_path + "/" + entry_name_list[dir_count];
-        filesys::path pathObj(subfolder_path);
-        if (filesys::exists(pathObj) && filesys::is_regular_file(pathObj)) {
-            // ignore files with extensions .tar, .zip, .7z
-            auto file_extension_idx = subfolder_path.find_last_of(".");
-            if (file_extension_idx != std::string::npos) {
-                std::string file_extension = subfolder_path.substr(file_extension_idx + 1);
-                if (file_extension != "npy")
-                    continue;
+        while ((_entity = readdir(_sub_dir)) != nullptr) {
+            std::string entry_name(_entity->d_name);
+            if (strcmp(_entity->d_name, ".") == 0 || strcmp(_entity->d_name, "..") == 0) continue;
+            entry_name_list.push_back(entry_name);
+        }
+        closedir(_sub_dir);
+        std::sort(entry_name_list.begin(), entry_name_list.end());
+
+        for (unsigned dir_count = 0; dir_count < entry_name_list.size(); ++dir_count) {
+            std::string subfolder_path = _full_path + "/" + entry_name_list[dir_count];
+            filesys::path pathObj(subfolder_path);
+            if (filesys::exists(pathObj) && filesys::is_regular_file(pathObj)) {
+                // skip entries whose extension is anything other than "npy"
+                auto file_extension_idx = subfolder_path.find_last_of(".");
+                if (file_extension_idx != std::string::npos) {
+                    std::string file_extension = subfolder_path.substr(file_extension_idx + 1);
+                    if (file_extension != "npy")
+                        continue;
+                }
+                ret = open_folder();
+                break;  // assume directory has only files.
+            } else if (filesys::exists(pathObj) && filesys::is_directory(pathObj)) {
+                _folder_path = subfolder_path;
+                if (open_folder() != Reader::Status::OK)
+                    WRN("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] File reader cannot access the storage at " + _folder_path);
             }
-            ret = open_folder();
-            break;  // assume directory has only files.
-        } else if (filesys::exists(pathObj) && filesys::is_directory(pathObj)) {
-            _folder_path = subfolder_path;
-            if (open_folder() != Reader::Status::OK)
-                WRN("NumpyDataReader ShardID [" + TOSTR(_shard_id) + "] File reader cannot access the storage at " + _folder_path);
         }
     }
     if (_in_batch_read_count > 0 && _in_batch_read_count < _batch_count) {
diff --git a/rocAL_pybind/amd/rocal/plugin/pytorch.py b/rocAL_pybind/amd/rocal/plugin/pytorch.py
index 01e7e7a05..2b5d3cdb8 100644
--- a/rocAL_pybind/amd/rocal/plugin/pytorch.py
+++ b/rocAL_pybind/amd/rocal/plugin/pytorch.py
@@ -30,13 +30,15 @@
 
 
 class ROCALNumpyIterator(object):
-    def __init__(self, pipeline, tensor_dtype=types.FLOAT, device="cpu", device_id=0):
+    def __init__(self, pipeline, tensor_dtype=types.FLOAT, device="cpu", device_id=0, return_roi=False):
         self.loader = pipeline
         self.tensor_dtype = tensor_dtype
         self.device = device
         self.device_id = device_id
         self.output_memory_type = self.loader._output_memory_type
         self.output_list = None
+        self.batch_size = self.loader._batch_size
+        self.return_roi = return_roi
         print("self.device", self.device)
         self.len = b.getRemainingImages(self.loader._handle)
 
@@ -53,6 +55,15 @@ def __next__(self):
             self.output_list = []
             for i in range(len(self.output_tensor_list)):
                 dimensions = self.output_tensor_list[i].dimensions()
+                if self.return_roi:
+                    self.num_dims = len(dimensions) - 1
+                    self.roi_array = np.zeros(self.batch_size * self.num_dims * 2, dtype=np.uint32)
+                    self.output_tensor_list[i].copy_roi(self.roi_array)
+                    self.max_roi_size = np.zeros(self.num_dims, dtype=np.uint32)
+                    for j in range(self.batch_size):
+                        index = j * self.num_dims * 2
+                        roi_size = self.roi_array[index + self.num_dims : index + self.num_dims * 2] - self.roi_array[index : index + self.num_dims]
+                        self.max_roi_size = np.maximum(roi_size, self.max_roi_size)
                 if self.device == "cpu":
                     torch_dtype = self.output_tensor_list[i].dtype()
                     output = torch.empty(
@@ -68,8 +79,20 @@ def __next__(self):
                 self.output_list.append(output)
         else:
             for i in range(len(self.output_tensor_list)):
+                if self.return_roi:
+                    self.output_tensor_list[i].copy_roi(self.roi_array)
+                    self.max_roi_size = np.zeros(self.num_dims, dtype=np.uint32)
+                    for j in range(self.batch_size):
+                        index = j * self.num_dims * 2
+                        roi_size = self.roi_array[index + self.num_dims : index + self.num_dims * 2] - self.roi_array[index : index + self.num_dims]
+                        self.max_roi_size = np.maximum(roi_size, self.max_roi_size)
                 self.output_tensor_list[i].copy_data(ctypes.c_void_p(
                     self.output_list[i].data_ptr()), self.output_memory_type)
+        if self.return_roi:
+            roi_output_list = []
+            for i in range(len(self.output_list)):
+                roi_output_list.append(self.output_list[i][:, :self.max_roi_size[0], :self.max_roi_size[1], :self.max_roi_size[2], :self.max_roi_size[3]])
+            return roi_output_list
         return self.output_list
 
     def reset(self):
diff --git a/rocAL_pybind/amd/rocal/readers.py b/rocAL_pybind/amd/rocal/readers.py
index 0ee1f3840..b115a3d92 100644
--- a/rocAL_pybind/amd/rocal/readers.py
+++ b/rocAL_pybind/amd/rocal/readers.py
@@ -352,13 +352,13 @@ def mxnet(path, stick_to_shard=False, pad_last_batch=False):
     return mxnet_metadata
 
 
-def numpy(*inputs, file_root='', num_shards=1,
-          random_shuffle=False, shard_id=0, stick_to_shard=False, pad_last_batch=False):
+def numpy(*inputs, file_root='', files=[], num_shards=1,
+          random_shuffle=False, shard_id=0, stick_to_shard=False, pad_last_batch=False, seed=0):
 
     Pipeline._current_pipeline._reader = "NumpyReader"
     # Output
-    kwargs_pybind = {"source_path": file_root, "is_output": False, "shuffle": random_shuffle,
-                     "loop": False, "decode_size_policy": types.MAX_SIZE, "shard_id": shard_id, "shard_count": num_shards}
+    kwargs_pybind = {"source_path": file_root, "files": files, "is_output": False, "shuffle": random_shuffle,
+                     "loop": False, "decode_size_policy": types.MAX_SIZE, "shard_id": shard_id, "shard_count": num_shards, "seed": seed}
     numpy_reader_output = b.numpyReaderSourceShard(
         Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
     return (numpy_reader_output)
diff --git a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
index 066aa4199..51265859f 100644
--- a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
+++ b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
@@ -323,7 +323,7 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
         {
             std::cout << ">>>>>>> Running Numpy reader" << std::endl;
             pipeline_type = 4;
-            decoded_output = rocalNumpyFileSource(handle, path, num_threads, false, false, false, ROCAL_USE_MAX_SIZE);
+            decoded_output = rocalNumpyFileSource(handle, path, num_threads, {}, false, false, false, ROCAL_USE_MAX_SIZE);
         } break;
         default: {
             std::cout << ">>>>>>> Running IMAGE READER" << std::endl;

From 2f71922dd622bd05d57541fe11d7dbd5a489fec3 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Fri, 15 Dec 2023 08:08:56 +0000
Subject: [PATCH 21/33] Fixing build issues in rocAL

---
 rocAL/source/augmentations/node_cast.cpp       |  6 +++++-
 .../examples/rocAL_api_numpy_reader.py         | 18 +++++++++---------
 2 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/rocAL/source/augmentations/node_cast.cpp b/rocAL/source/augmentations/node_cast.cpp
index d1949560e..cff54c5c2 100644
--- a/rocAL/source/augmentations/node_cast.cpp
+++ b/rocAL/source/augmentations/node_cast.cpp
@@ -31,7 +31,11 @@ void CastNode::create_node() {
     if(_node)
         return;
 
-    _node = vxExtRppCast(_graph->get(), _inputs[0]->handle(), _src_tensor_roi, _outputs[0]->handle(), _input_layout, _roi_type);
+    int input_layout = (int)_inputs[0]->info().layout();
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+    _node = vxExtRppCast(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), input_layout_vx, roi_type_vx);
 
     vx_status status;
     if((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
diff --git a/rocAL_pybind/examples/rocAL_api_numpy_reader.py b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
index 60c797a31..09e50a7f6 100644
--- a/rocAL_pybind/examples/rocAL_api_numpy_reader.py
+++ b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
@@ -56,30 +56,30 @@ def main():
     pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
 
     with pipeline:
-        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
-        label_output = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
+        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_train, shard_id=local_rank, num_shards=world_size)
+        label_output = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size)
         data_output = fn.set_layout(numpy_reader_output, output_layout=types.NHWC)
         normalized_output = fn.normalize(data_output, axes=[0,1], mean=MEAN, stddev=STDDEV, output_layout=types.NHWC, output_dtype=types.FLOAT)
-        transposed_output = fn.transpose(normalized_output, perm=[2,1,0], output_layout=types.NCHW, output_dtype=types.FLOAT)
+        transposed_output = fn.transpose(normalized_output, perm=[2,0,1], output_layout=types.NCHW, output_dtype=types.FLOAT)
         pipeline.set_outputs(transposed_output, label_output)
 
     pipeline.build()
 
-    pipeline1 = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
+    val_pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
 
-    with pipeline1:
+    with val_pipeline:
         numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_val, shard_id=local_rank, num_shards=world_size, seed=random_seed+local_rank)
         label_output = fn.readers.numpy(file_root=data_path, files=y_val, shard_id=local_rank, num_shards=world_size, seed=random_seed+local_rank)
         data_output = fn.set_layout(numpy_reader_output, output_layout=types.NHWC)
         normalized_output = fn.normalize(data_output, axes=[0,1], mean=MEAN, stddev=STDDEV, output_layout=types.NHWC, output_dtype=types.FLOAT)
-        transposed_output = fn.transpose(normalized_output, perm=[2,1,0], output_layout=types.NCHW, output_dtype=types.FLOAT)
-        pipeline1.set_outputs(transposed_output, label_output)
+        transposed_output = fn.transpose(normalized_output, perm=[2,0,1], output_layout=types.NCHW, output_dtype=types.FLOAT)
+        val_pipeline.set_outputs(transposed_output, label_output)
 
-    pipeline1.build()
+    val_pipeline.build()
     
     numpyIteratorPipeline = ROCALNumpyIterator(pipeline, device='cpu' if rocal_cpu else 'gpu')
     print(len(numpyIteratorPipeline))
-    valNumpyIteratorPipeline = ROCALNumpyIterator(pipeline1, device='cpu' if rocal_cpu else 'gpu')
+    valNumpyIteratorPipeline = ROCALNumpyIterator(val_pipeline, device='cpu' if rocal_cpu else 'gpu')
     print(len(valNumpyIteratorPipeline))
     cnt = 0
     for epoch in range(2):

From 182773ceb733835aa8bb7ee60703b5d5537d62cb Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Fri, 19 Jan 2024 18:29:14 +0000
Subject: [PATCH 22/33] Adding back missed formatting for crop node

---
 .../augmentations/geometry_augmentations/node_crop.cpp    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp b/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp
index 6574ea1bf..6ca4cbd2c 100644
--- a/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp
+++ b/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp
@@ -102,8 +102,8 @@ void CropNode::create_crop_tensor() {
     vx_size num_of_dims = 2;
     vx_size stride[num_of_dims];
     std::vector<size_t> _crop_tensor_dims = {_batch_size, 4};
-    if (_inputs[0]->info().layout() == RocalTensorlayout::NFCHW || _inputs[0]->info().layout() == RocalTensorlayout::NFHWC)
-        _crop_tensor_dims = {_inputs[0]->info().dims()[0] * _inputs[0]->info().dims()[1], 4};  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
+    if(_inputs[0]->info().layout() == RocalTensorlayout::NFCHW || _inputs[0]->info().layout() == RocalTensorlayout::NFHWC)
+        _crop_tensor_dims = {_inputs[0]->info().dims()[0] * _inputs[0]->info().dims()[1], 4}; // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
     stride[0] = sizeof(vx_uint32);
     stride[1] = stride[0] * _crop_tensor_dims[0];
     vx_enum mem_type = VX_MEMORY_TYPE_HOST;
@@ -111,8 +111,8 @@ void CropNode::create_crop_tensor() {
         mem_type = VX_MEMORY_TYPE_HIP;
     allocate_host_or_pinned_mem(&_crop_coordinates, stride[1] * 4, _inputs[0]->info().mem_type());
 
-    _crop_tensor = vxCreateTensorFromHandle(vxGetContext((vx_reference)_graph->get()), num_of_dims, _crop_tensor_dims.data(), VX_TYPE_UINT32, 0,
-                                            stride, reinterpret_cast<void *>(_crop_coordinates), mem_type);
+    _crop_tensor = vxCreateTensorFromHandle(vxGetContext((vx_reference) _graph->get()), num_of_dims, _crop_tensor_dims.data(), VX_TYPE_UINT32, 0, 
+                                                                  stride, reinterpret_cast<void *>(_crop_coordinates), mem_type);
     vx_status status;
     if ((status = vxGetStatus((vx_reference)_crop_tensor)) != VX_SUCCESS)
         THROW("Error: vxCreateTensorFromHandle(_crop_tensor: failed " + TOSTR(status))

From 7e927d1f48f797883b0fe3ccf24da8c9275e4361 Mon Sep 17 00:00:00 2001
From: Hansel Yang <hansyang@amd.com>
Date: Tue, 23 Jan 2024 05:50:12 -0800
Subject: [PATCH 23/33] Include Fix for CPU Backend (#93)

---
 rocAL/source/pipeline/tensor.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index 4e3dd2403..bc234e813 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 */
 
 #include <cstdio>
-#if !ENABLE_HIP
+#if ENABLE_OPENCL
 #include <CL/cl.h>
 #endif
 #include <vx_ext_amd.h>

From 77038c5c4f46412f8423b51f827bee53b15746f7 Mon Sep 17 00:00:00 2001
From: Rajy Rawther <Rajy.MeeyakhanRawther@amd.com>
Date: Tue, 23 Jan 2024 21:31:16 -0800
Subject: [PATCH 24/33] turbojpeg library update for rocAL (#86)

* removed dependency on libjpegturbo

* minor fix for the decoder python script

* fix broken link in setup.py

* add WITH_JPEG8 true building tjpeg

* fix for accuracy and convergence
---
 .../image_augmentation/image_augmentation.cpp |  25 +-
 docs/examples/image_processing/decoder.py     |  13 +-
 .../image_processing/decoder_examples.ipynb   |  17 +-
 .../image_processing/inference_pipeline.py    |   2 +-
 rocAL-setup.py                                |   6 +-
 rocAL/CMakeLists.txt                          |   9 +
 .../decoders/image/fused_crop_decoder.h       |   2 +-
 .../decoders/image/turbo_jpeg_decoder.h       |  22 +-
 .../include/decoders/libjpeg/libjpeg_extra.h  |  75 +++++
 .../include/decoders/libjpeg/libjpeg_utils.h  |  30 ++
 .../loaders/image/image_read_and_decode.h     |   8 -
 .../decoders/image/fused_crop_decoder.cpp     |   4 +-
 .../decoders/image/turbo_jpeg_decoder.cpp     |  47 ++--
 .../source/decoders/libjpeg/libjpeg_extra.cpp | 266 ++++++++++++++++++
 rocAL_pybind/setup.py                         |   2 +-
 15 files changed, 444 insertions(+), 84 deletions(-)
 create mode 100644 rocAL/include/decoders/libjpeg/libjpeg_extra.h
 create mode 100644 rocAL/include/decoders/libjpeg/libjpeg_utils.h
 create mode 100644 rocAL/source/decoders/libjpeg/libjpeg_extra.cpp

diff --git a/apps/image_augmentation/image_augmentation.cpp b/apps/image_augmentation/image_augmentation.cpp
index 1286d1862..a21b89393 100644
--- a/apps/image_augmentation/image_augmentation.cpp
+++ b/apps/image_augmentation/image_augmentation.cpp
@@ -48,12 +48,12 @@ int main(int argc, const char** argv) {
     if (argc < MIN_ARG_COUNT) {
         printf(
             "Usage: image_augmentation <image_dataset_folder/video_file> <processing_device=1/cpu=0>  \
-              decode_width decode_height video_mode gray_scale/rgb display_on_off decode_shard_count  <shuffle:0/1> <jpeg_dec_mode<0(tjpeg)/1(opencv)/2(hwdec)>\n");
+              decode_width decode_height decoder_mode gray_scale/rgb display_on_off decode_shard_count  <shuffle:0/1> <jpeg_dec_mode<0(tjpeg)/1(opencv)/2(hwdec)>\n");
         return -1;
     }
     int argIdx = 0;
     const char* folderPath1 = argv[++argIdx];
-    int video_mode = 0;  // 0 means no video decode, 1 means hardware, 2 means software decoding
+    int decoder_mode = 0;  // 0: JPEG file decode, 1: fused JPEG crop decode, 2: software video decode, >2: hardware video decode
     bool display = 1;    // Display the images
     int aug_depth = 1;   // how deep is the augmentation tree
     int rgb = 1;         // process color images
@@ -62,7 +62,7 @@ int main(int argc, const char** argv) {
     bool processing_device = 1;
     size_t shard_count = 2;
     int shuffle = 0;
-    int dec_mode = 0;
+    int decoder_type = 0;
     const char *outName = "image_augmentation_app.png";
 
     if (argc >= argIdx + MIN_ARG_COUNT)
@@ -75,7 +75,7 @@ int main(int argc, const char** argv) {
         decode_height = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
-        video_mode = atoi(argv[++argIdx]);
+        decoder_mode = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
         rgb = atoi(argv[++argIdx]);
@@ -90,7 +90,7 @@ int main(int argc, const char** argv) {
         shuffle = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
-        dec_mode = atoi(argv[++argIdx]);
+        decoder_type = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
         outName = argv[++argIdx];
@@ -108,7 +108,7 @@ int main(int argc, const char** argv) {
         return -1;
     }
 
-    RocalDecoderType dec_type = (RocalDecoderType)dec_mode;
+    RocalDecoderType dec_type = (RocalDecoderType)decoder_type;
 
     /*>>>>>>>>>>>>>>>> Creating rocAL parameters  <<<<<<<<<<<<<<<<*/
 
@@ -126,7 +126,7 @@ int main(int argc, const char** argv) {
     /*>>>>>>>>>>>>>>>>>>> Graph description <<<<<<<<<<<<<<<<<<<*/
     RocalTensor input1;
 
-    if (video_mode != 0) {
+    if (decoder_mode >= 2) {
         unsigned sequence_length = 3;
         unsigned frame_step = 3;
         unsigned frame_stride = 1;
@@ -134,7 +134,12 @@ int main(int argc, const char** argv) {
             std::cout << "Output width and height is needed for video decode\n";
             return -1;
         }
-        input1 = rocalVideoFileSource(handle, folderPath1, color_format, ((video_mode == 1) ? RocalDecodeDevice::ROCAL_HW_DECODE : RocalDecodeDevice::ROCAL_SW_DECODE), shard_count, sequence_length, frame_step, frame_stride, shuffle, true, false);
+        input1 = rocalVideoFileSource(handle, folderPath1, color_format, (decoder_mode == 2)? ROCAL_SW_DECODE: ROCAL_HW_DECODE, shard_count, sequence_length, frame_step, frame_stride, shuffle, true, false);
+    } else if (decoder_mode == 1) {
+            std::vector<float> area = {0.08, 1};
+            std::vector<float> aspect_ratio = {3.0f / 4, 4.0f / 3};
+            input1 = rocalFusedJpegCrop(handle, folderPath1, color_format, shard_count, false, area, aspect_ratio, 10, false, false, ROCAL_USE_USER_GIVEN_SIZE_RESTRICTED, decode_width, decode_height);
+
     } else {
         // The jpeg file loader can automatically select the best size to decode all images to that size
         // User can alternatively set the size or change the policy that is used to automatically find the size
@@ -152,7 +157,7 @@ int main(int argc, const char** argv) {
 
     RocalTensor tensor0;
     int resize_w = 112, resize_h = 112;
-    if (video_mode) {
+    if (decoder_mode >= 2) {
         resize_h = decode_height;
         resize_w = decode_width;
         tensor0 = input1;
@@ -214,7 +219,7 @@ int main(int argc, const char** argv) {
     int w = rocalGetOutputWidth(handle);
     int p = ((color_format == RocalImageColor::ROCAL_COLOR_RGB24) ? 3 : 1);
     std::cout << "output width " << w << " output height " << h << " color planes " << p << std::endl;
-    const unsigned number_of_cols = video_mode ? 1 : 10;
+    const unsigned number_of_cols = (decoder_mode >= 2) ? 1 : 10;
     auto cv_color_format = ((color_format == RocalImageColor::ROCAL_COLOR_RGB24) ? CV_8UC3 : CV_8UC1);
     cv::Mat mat_output(h + AMD_ROCm_Black_resize.rows, w * number_of_cols, cv_color_format);
     cv::Mat mat_input(h, w, cv_color_format);
diff --git a/docs/examples/image_processing/decoder.py b/docs/examples/image_processing/decoder.py
index eccce45fd..073fa383c 100644
--- a/docs/examples/image_processing/decoder.py
+++ b/docs/examples/image_processing/decoder.py
@@ -9,7 +9,7 @@
 import cupy as cp
 
 seed = 1549361629
-image_dir = "../../../../data/images/AMD-tinyDataSet/"
+image_dir = "../../../data/images/AMD-tinyDataSet/"
 batch_size = 4
 gpu_id = 0
 
@@ -34,13 +34,13 @@ def show_pipeline_output(pipe, device):
     pipe.build()
     data_loader = ROCALClassificationIterator(pipe, device)
     images = next(iter(data_loader))
-    show_images(images[0], device)
+    show_images(images[0][0], device)
 
 @pipeline_def(seed=seed)
 def image_decoder_pipeline(device="cpu", path=image_dir):
-    jpegs, labels = fn.readers.file(file_root=path, shard_id=0, num_shards=1, random_shuffle=False)
+    jpegs, labels = fn.readers.file(file_root=path)
     images = fn.decoders.image(jpegs, file_root=path, device=device, output_type=types.RGB, shard_id=0, num_shards=1, random_shuffle=False)
-    return fn.resize(images, device=device, resize_x=300, resize_y=300)
+    return fn.resize(images, device=device, resize_width=300, resize_height=300)
 
 def main():
     print ('Optional arguments: <cpu/gpu image_folder>')
@@ -52,9 +52,8 @@ def main():
           rocal_device = "gpu"
     if  len(sys.argv) > 2:
       img_folder = sys.argv[2]
-
-    pipe = image_decoder_pipeline(batch_size=bs, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC,
-                                  reverse_channels=True, mean = [0, 0, 0], std=[255, 255, 255], device=rocal_device, path=img_folder)
+    pipe = image_decoder_pipeline(batch_size=bs, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC, 
+                                reverse_channels=True, mean = [0, 0, 0], std=[255,255,255], device=rocal_device, path=img_folder)
     show_pipeline_output(pipe, device=rocal_device)
 
 if __name__ == '__main__':
diff --git a/docs/examples/image_processing/decoder_examples.ipynb b/docs/examples/image_processing/decoder_examples.ipynb
index 27098f079..cb1bef27e 100644
--- a/docs/examples/image_processing/decoder_examples.ipynb
+++ b/docs/examples/image_processing/decoder_examples.ipynb
@@ -38,7 +38,7 @@
     "%matplotlib inline\n",
     "\n",
     "seed = 1549361629\n",
-    "image_dir = \"../../../../data/images/AMD-tinyDataSet/\"\n",
+    "image_dir = \"../../../data/images/AMD-tinyDataSet/\"\n",
     "batch_size = 4\n",
     "gpu_id = 0\n",
     "\n",
@@ -61,7 +61,7 @@
     "    pipe.build()\n",
     "    data_loader = ROCALClassificationIterator(pipe, device, device_id)\n",
     "    images = next(iter(data_loader))\n",
-    "    show_images(images[0], device)\n"
+    "    show_images(images[0][0], device)\n"
    ]
   },
   {
@@ -82,9 +82,9 @@
    "source": [
     "@pipeline_def(seed=seed)\n",
     "def image_decoder_pipeline(device=\"cpu\"):\n",
-    "    jpegs, labels = fn.readers.file(file_root=image_dir, shard_id=0, num_shards=1, random_shuffle=False)\n",
+    "    jpegs, labels = fn.readers.file(file_root=image_dir)\n",
     "    images = fn.decoders.image(jpegs, file_root=image_dir, device=device, output_type=types.RGB, shard_id=0, num_shards=1, random_shuffle=False)\n",
-    "    return fn.resize(images, device=device, resize_x=300, resize_y=300)\n",
+    "    return fn.resize(images, device=device, resize_width=300, resize_height=300)\n",
     "\n",
     "pipe = image_decoder_pipeline(batch_size=batch_size, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC, \n",
     "                            reverse_channels=True, mean = [0, 0, 0], std=[255,255,255], device=\"cpu\")\n",
@@ -109,12 +109,13 @@
    "source": [
     "@pipeline_def(seed=seed)\n",
     "def image_decoder_random_crop_pipeline(device=\"cpu\"):\n",
-    "    jpegs, labels = fn.readers.file(file_root=image_dir, shard_id=0, num_shards=1, random_shuffle=False)\n",
+    "    jpegs, labels = fn.readers.file(file_root=image_dir)\n",
     "    images = fn.decoders.image_slice(jpegs, file_root=image_dir, \n",
-    "                                     device=device,\n",
     "                                     output_type=types.RGB,\n",
+    "                                     shard_id = 0,\n",
+    "                                     num_shards = 1,\n",
     "                                     random_shuffle=True)\n",
-    "    return fn.resize(images, device=device, resize_x=300, resize_y=300)\n",
+    "    return fn.resize(images, device=device, resize_width=300, resize_height=300)\n",
     "    \n",
     "pipe = image_decoder_random_crop_pipeline(batch_size=batch_size, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC, \n",
     "                                          reverse_channels=True, mean=[0,0,0], std = [255,255,255], device=\"cpu\")\n",
@@ -184,7 +185,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/examples/image_processing/inference_pipeline.py b/docs/examples/image_processing/inference_pipeline.py
index a7db74e16..f97da7b37 100644
--- a/docs/examples/image_processing/inference_pipeline.py
+++ b/docs/examples/image_processing/inference_pipeline.py
@@ -31,7 +31,7 @@
 
 
 seed = 1549361629
-image_dir = "../../../../data/images/AMD-tinyDataSet/"
+image_dir = "../../../data/images/AMD-tinyDataSet/"
 batch_size = 4
 gpu_id = 0
 
diff --git a/rocAL-setup.py b/rocAL-setup.py
index fa6b5de91..1032aef6c 100644
--- a/rocAL-setup.py
+++ b/rocAL-setup.py
@@ -311,11 +311,11 @@
         os.system('sudo '+linuxFlag+' '+linuxSystemInstall+' ' +
                   linuxSystemInstall_check+' install lmdb-devel rapidjson-devel')
 
-    # turbo-JPEG - https://github.com/rrawther/libjpeg-turbo.git -- 2.0.6.2
+    # turbo-JPEG - https://github.com/libjpeg-turbo/libjpeg-turbo.git -- 3.0.1
     os.system(
-        '(cd '+deps_dir+'; git clone -b 2.0.6.2 https://github.com/rrawther/libjpeg-turbo.git )')
+        '(cd '+deps_dir+'; git clone -b 3.0.1 https://github.com/libjpeg-turbo/libjpeg-turbo.git )')
     os.system('(cd '+deps_dir+'/libjpeg-turbo; mkdir build; cd build; '+linuxCMake +
-              ' -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib ..; make -j 4; sudo make install )')
+              ' -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib -DWITH_JPEG8=TRUE ..; make -j 4; sudo make install )')
     # RPP
     os.system('sudo -v')
     os.system('(cd '+deps_dir+'; git clone -b '+rppVersion+' https://github.com/GPUOpen-ProfessionalCompute-Libraries/rpp.git; cd rpp; mkdir build-'+backend+'; cd build-'+backend+'; ' +
diff --git a/rocAL/CMakeLists.txt b/rocAL/CMakeLists.txt
index 1dc4630e1..c81ed5f99 100644
--- a/rocAL/CMakeLists.txt
+++ b/rocAL/CMakeLists.txt
@@ -42,6 +42,14 @@ find_package(RapidJSON QUIET)
 find_package(StdFilesystem QUIET)
 find_package(HALF QUIET)
 
+if(DEFINED ENV{ROCM_PATH})
+  set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Default ROCm installation path")
+elseif(ROCM_PATH)
+  message("-- INFO:ROCM_PATH Set -- ${ROCM_PATH}")
+else()
+  set(ROCM_PATH /opt/rocm CACHE PATH "Default ROCm installation path")
+endif()
+
 # HIP Backend
 if(GPU_SUPPORT AND "${BACKEND}" STREQUAL "HIP")
     if(NOT DEFINED HIP_PATH)
@@ -225,6 +233,7 @@ if(${BUILD_ROCAL})
                 include/augmentations/geometry_augmentations/
                 include/decoders/image/
                 include/decoders/video/
+                include/decoders/libjpeg/
                 include/device/
                 include/loaders/
                 include/loaders/image/
diff --git a/rocAL/include/decoders/image/fused_crop_decoder.h b/rocAL/include/decoders/image/fused_crop_decoder.h
index 718919b90..ae59f6bf1 100644
--- a/rocAL/include/decoders/image/fused_crop_decoder.h
+++ b/rocAL/include/decoders/image/fused_crop_decoder.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/include/decoders/image/turbo_jpeg_decoder.h b/rocAL/include/decoders/image/turbo_jpeg_decoder.h
index ce4dba600..99e67abac 100644
--- a/rocAL/include/decoders/image/turbo_jpeg_decoder.h
+++ b/rocAL/include/decoders/image/turbo_jpeg_decoder.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -64,24 +64,8 @@ class TJDecoder : public Decoder {
 
    private:
     tjhandle m_jpegDecompressor;
-    const static unsigned SCALING_FACTORS_COUNT = 16;
-    const tjscalingfactor SCALING_FACTORS[SCALING_FACTORS_COUNT] = {
-        {2, 1},
-        {15, 8},
-        {7, 4},
-        {13, 8},
-        {3, 2},
-        {11, 8},
-        {5, 4},
-        {9, 8},
-        {1, 1},
-        {7, 8},
-        {3, 4},
-        {5, 8},
-        {1, 2},
-        {3, 8},
-        {1, 4},
-        {1, 8}};
+    tjscalingfactor *_scaling_factors = nullptr;
+    int _num_scaling_factors = 0;
     bool _is_partial_decoder = false;
     std::vector<float> _bbox_coord;
     const static unsigned _max_scaling_factor = 8;
diff --git a/rocAL/include/decoders/libjpeg/libjpeg_extra.h b/rocAL/include/decoders/libjpeg/libjpeg_extra.h
new file mode 100644
index 000000000..69db1028a
--- /dev/null
+++ b/rocAL/include/decoders/libjpeg/libjpeg_extra.h
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <turbojpeg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include "libjpeg_utils.h"
+
+extern "C" {
+
+//! extra apis for rocal to support partial decoding
+
+//! * Helper function to set the source
+//! * This function doesn't scale the decoded image
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function doesn't scale the decoded image
+
+/*!
+  \param handle  TJPeg handle
+  \param jpegBuf compressed jpeg image buffer
+  \param jpegSize Size of the compressed data provided in the input_buffer
+  \param dstBuf user provided output buffer
+  \param width, pitch, height  width, stride and height of the allocated buffer
+  \param flags  TJPEG flags
+  \param pixelFormat  pixel format of the image
+  \param crop_x_diff,  crop_width_diff Actual crop_x and crop_w (adjusted to MB boundary)
+  \param x1, y1, crop_width, crop_height requested crop window
+*/
+
+int tjDecompress2_partial(tjhandle handle, const unsigned char *jpegBuf,
+                                    unsigned long jpegSize, unsigned char *dstBuf,
+                                    int width, int pitch, int height, int pixelFormat,
+                                    int flags, unsigned int *crop_x_diff, unsigned int *crop_width_diff,
+                                    unsigned int x1, unsigned int y1, unsigned int crop_width, unsigned int crop_height);
+
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function scales the decoded image to fit the output dims
+/*!
+  \param handle  TJPeg handle
+  \param jpegBuf compressed jpeg image buffer
+  \param jpegSize Size of the compressed data provided in the input_buffer
+  \param dstBuf user provided output buffer
+  \param width, pitch, height  width, stride and height of the allocated buffer
+  \param flags  TJPEG flags
+  \param crop_width, crop_height requested crop window
+*/
+
+int tjDecompress2_partial_scale(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags, unsigned int crop_width, unsigned int crop_height);
+}
\ No newline at end of file
diff --git a/rocAL/include/decoders/libjpeg/libjpeg_utils.h b/rocAL/include/decoders/libjpeg/libjpeg_utils.h
new file mode 100644
index 000000000..1c588ee0b
--- /dev/null
+++ b/rocAL/include/decoders/libjpeg/libjpeg_utils.h
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+//! turbojpeg includes
+
+extern "C" {
+#include "jerror.h"  
+#include "jpeglib.h" 
+#include "jpegint.h"
+}
diff --git a/rocAL/include/loaders/image/image_read_and_decode.h b/rocAL/include/loaders/image/image_read_and_decode.h
index 471164b54..6682d85f6 100644
--- a/rocAL/include/loaders/image/image_read_and_decode.h
+++ b/rocAL/include/loaders/image/image_read_and_decode.h
@@ -33,14 +33,6 @@ THE SOFTWARE.
 #include "timing_debug.h"
 #include "turbo_jpeg_decoder.h"
 
-/**
- * Compute the scaled value of <tt>dimension</tt> using the given scaling
- * factor.  This macro performs the integer equivalent of <tt>ceil(dimension *
- * scalingFactor)</tt>.
- */
-#define TJSCALED(dimension, scalingFactor)                       \
-    ((dimension * scalingFactor.num + scalingFactor.denom - 1) / \
-     scalingFactor.denom)
 
 class ImageReadAndDecode {
    public:
diff --git a/rocAL/source/decoders/image/fused_crop_decoder.cpp b/rocAL/source/decoders/image/fused_crop_decoder.cpp
index 2522bca4e..ee14c0f11 100644
--- a/rocAL/source/decoders/image/fused_crop_decoder.cpp
+++ b/rocAL/source/decoders/image/fused_crop_decoder.cpp
@@ -20,11 +20,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#include "fused_crop_decoder.h"
 
 #include <commons.h>
 #include <stdio.h>
 #include <string.h>
+#include "fused_crop_decoder.h"
+#include "libjpeg_extra.h"
+
 
 FusedCropTJDecoder::FusedCropTJDecoder() {
     m_jpegDecompressor = tjInitDecompress();
diff --git a/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp b/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp
index 772fc8535..b285e891d 100644
--- a/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp
+++ b/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp
@@ -20,24 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#include "turbo_jpeg_decoder.h"
 
-#include <commons.h>
 #include <stdio.h>
+#include <commons.h>
+#include "turbo_jpeg_decoder.h"
+#include "libjpeg_extra.h"
 
 TJDecoder::TJDecoder() {
     m_jpegDecompressor = tjInitDecompress();
-
-#if 0
-    int num_avail_scalings = 0;
-    auto scaling_factors = tjGetScalingFactors	(&num_avail_scalings);	
-    for(int i = 0; i < num_avail_scalings; i++) {
-        if(scaling_factors[i].num < scaling_factors[i].denom) {
-
-            printf("%d / %d  - ",scaling_factors[i].num, scaling_factors[i].denom );
+    if ((_scaling_factors = tj3GetScalingFactors(&_num_scaling_factors)) == NULL)  // cache the factor table used by decode()
+        THROW("TJDecoder::TJDecoder(): error getting scaling factors");
+    for(int i = 0; i < _num_scaling_factors; i++) {  // log every factor that down-scales (num < denom)
+        if(_scaling_factors[i].num < _scaling_factors[i].denom) {
+            INFO(STR(_scaling_factors[i].num) + "/" + STR(_scaling_factors[i].denom));
         }
     }
-#endif
 };
 
 Decoder::Status TJDecoder::decode_info(unsigned char* input_buffer, size_t input_size, int* width, int* height, int* color_comps) {
@@ -90,7 +87,7 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
                 crop_width = _max_scaling_factor * max_decoded_width;
                 if (crop_width > original_image_width) crop_width = original_image_width;
                 crop_height = crop_width * (1.0 / in_ratio);
-                if (crop_height > _max_scaling_factor * max_decoded_width) crop_height = _max_scaling_factor * max_decoded_width;
+                if (crop_height > _max_scaling_factor * max_decoded_height) crop_height = _max_scaling_factor * max_decoded_height;
             } else if (original_image_height > (_max_scaling_factor * max_decoded_height)) {
                 crop_height = _max_scaling_factor * max_decoded_height;
                 if (crop_height > original_image_height) crop_height = original_image_height;
@@ -114,9 +111,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             }
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             uint scaledw = max_decoded_width, scaledh = max_decoded_height;
-            for (auto scaling_factor : SCALING_FACTORS) {
-                scaledw = TJSCALED(crop_width, scaling_factor);
-                scaledh = TJSCALED(crop_height, scaling_factor);
+            for (int j=0; j < _num_scaling_factors; j++) {
+                scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                 if (scaledw <= max_decoded_width && scaledh <= max_decoded_height) {
                     break;
                 }
@@ -142,9 +139,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             }
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             uint scaledw = max_decoded_width, scaledh = max_decoded_height;
-            for (auto scaling_factor : SCALING_FACTORS) {
-                scaledw = TJSCALED(original_image_width, scaling_factor);
-                scaledh = TJSCALED(original_image_height, scaling_factor);
+            for (int j=0; j < _num_scaling_factors; j++) {
+                scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                 if (scaledw <= max_decoded_width && scaledh <= max_decoded_height)
                     break;
             }
@@ -168,7 +165,7 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
                 crop_width = _max_scaling_factor * max_decoded_width;
                 if (crop_width > original_image_width) crop_width = original_image_width;
                 crop_height = crop_width * (1.0 / in_ratio);
-                if (crop_height > _max_scaling_factor * max_decoded_width) crop_height = _max_scaling_factor * max_decoded_width;
+                if (crop_height > _max_scaling_factor * max_decoded_height) crop_height = _max_scaling_factor * max_decoded_height;
             } else if (original_image_height > (_max_scaling_factor * max_decoded_height)) {
                 crop_height = _max_scaling_factor * max_decoded_height;
                 if (crop_height > original_image_height) crop_height = original_image_height;
@@ -192,9 +189,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             }
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             uint scaledw = max_decoded_width, scaledh = max_decoded_height;
-            for (auto scaling_factor : SCALING_FACTORS) {
-                scaledw = TJSCALED(crop_width, scaling_factor);
-                scaledh = TJSCALED(crop_height, scaling_factor);
+            for (int j=0; j < _num_scaling_factors; j++) {
+                scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                 if (scaledw <= max_decoded_width && scaledh <= max_decoded_height) {
                     break;
                 }
@@ -219,9 +216,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             if ((actual_decoded_width != original_image_width) || (actual_decoded_height != original_image_height)) {
                 uint scaledw = actual_decoded_width, scaledh = actual_decoded_height;
-                for (auto scaling_factor : SCALING_FACTORS) {
-                    scaledw = TJSCALED(original_image_width, scaling_factor);
-                    scaledh = TJSCALED(original_image_height, scaling_factor);
+                for (int j=0; j < _num_scaling_factors; j++) {
+                    scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                    scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                     if (scaledw <= max_decoded_width && scaledh <= max_decoded_height)
                         break;
                 }
diff --git a/rocAL/source/decoders/libjpeg/libjpeg_extra.cpp b/rocAL/source/decoders/libjpeg/libjpeg_extra.cpp
new file mode 100644
index 000000000..ca86f644a
--- /dev/null
+++ b/rocAL/source/decoders/libjpeg/libjpeg_extra.cpp
@@ -0,0 +1,266 @@
+/*
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "libjpeg_extra.h"
+#include <setjmp.h>
+#include <string.h>
+#include "commons.h"
+
+enum { COMPRESS = 1, DECOMPRESS = 2 };  /* NOTE(review): not referenced in the visible part of this file */
+static J_COLOR_SPACE pf2cs[TJ_NUMPF] = {  /* libjpeg output color space, indexed by the pixelFormat argument (0..TJ_NUMPF-1) */
+  JCS_EXT_RGB, JCS_EXT_BGR, JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+  JCS_EXT_XRGB, JCS_GRAYSCALE, JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ABGR,
+  JCS_EXT_ARGB, JCS_CMYK
+};
+
+struct my_error_mgr {  /* error manager installed on cinfo.err by the decode helpers below */
+  struct jpeg_error_mgr pub;  /* standard libjpeg fields; my_error_exit casts cinfo->err back to my_error_mgr */
+  jmp_buf setjmp_buffer;      /* jump target armed with setjmp() by the caller, used by my_error_exit */
+  void (*emit_message) (j_common_ptr, int);  /* NOTE(review): never assigned in the visible code */
+  boolean warning, stopOnWarning;            /* NOTE(review): never read in the visible code */
+};
+typedef struct my_error_mgr *my_error_ptr;
+
+/*
+ * Here's the routine that will replace the standard error_exit method:
+ */
+
+METHODDEF(void)
+my_error_exit(j_common_ptr cinfo)
+{
+  /* Report the pending libjpeg message through the installed output_message hook. */
+  (*cinfo->err->output_message) (cinfo);
+
+  /* The decode helpers install a my_error_mgr, so cinfo->err can be
+   * converted back to the enclosing struct to reach the jump buffer. */
+  my_error_ptr mgr = (my_error_ptr)cinfo->err;
+
+  /* Unwind to the setjmp() site armed by the caller; setjmp then returns 1. */
+  longjmp(mgr->setjmp_buffer, 1);
+}
+
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function doesn't scale the decoded image. Returns 0 on success, -1 on a decode error.
+int tjDecompress2_partial(tjhandle handle, const unsigned char *jpegBuf,
+                                    unsigned long jpegSize, unsigned char *dstBuf,
+                                    int width, int pitch, int height, int pixelFormat,
+                                    int flags, unsigned int *crop_x_diff, unsigned int *crop_width_diff,
+                                    unsigned int crop_x, unsigned int crop_y,
+                                    unsigned int crop_width, unsigned int crop_height)
+{
+    JSAMPROW *row_pointer = NULL;
+    int i, retval = 0;
+    // crop_x_diff / crop_width_diff are written unconditionally below, so reject null pointers too
+    if (jpegBuf == NULL || jpegSize == 0 || dstBuf == NULL || width < 0 || crop_x_diff == NULL ||
+        crop_width_diff == NULL || pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+        THROW("tjDecompress2_partial(): Invalid argument");
+
+    struct jpeg_decompress_struct cinfo;
+    // Initialize libjpeg structures to have a memory source
+    // Modify the usual jpeg error manager to catch fatal errors.
+    struct my_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr.pub);
+    jerr.pub.error_exit = my_error_exit;
+    if (setjmp(jerr.setjmp_buffer)) {
+      /* If we get here, the JPEG code has signaled an error. */
+      retval = -1;  goto bailout;
+    }
+
+    // set up, read header, set image parameters, save size
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, jpegBuf, jpegSize);
+    jpeg_read_header(&cinfo, TRUE);
+    cinfo.out_color_space = pf2cs[pixelFormat];
+    if (flags & TJFLAG_FASTDCT) cinfo.dct_method = JDCT_FASTEST;
+    if (flags & TJFLAG_FASTUPSAMPLE) cinfo.do_fancy_upsampling = FALSE;
+
+    jpeg_start_decompress(&cinfo);
+    /* Check for valid crop dimensions.  We cannot check these values until
+    * after jpeg_start_decompress() is called.  The comparison is written
+    * without additions so that large unsigned inputs cannot wrap around. */
+    if (crop_width > cinfo.output_width || crop_x > cinfo.output_width - crop_width ||
+        crop_height > cinfo.output_height || crop_y > cinfo.output_height - crop_height) {
+        ERR("crop dimensions:" << crop_width << " x " << crop_height << " exceed image dimensions" << cinfo.output_width << " x " << cinfo.output_height);
+        retval = -1;  goto bailout;
+    }
+    // jpeg_crop_scanline updates crop_x / crop_width in place; hand the adjusted values back to the caller
+    jpeg_crop_scanline(&cinfo, &crop_x, &crop_width);
+    *crop_x_diff = crop_x;
+    *crop_width_diff = crop_width;
+
+    if (pitch == 0) pitch = cinfo.output_width * tjPixelSize[pixelFormat];
+
+    if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo.output_height)) == NULL) {
+      jpeg_destroy_decompress(&cinfo);  // release libjpeg memory before raising
+      THROW("tjDecompress2_partial(): Memory allocation failure");
+    }
+    // Re-arm the jump target after the allocation (locals changed since the last setjmp are indeterminate after longjmp)
+    if (setjmp(jerr.setjmp_buffer)) {
+      retval = -1;  goto bailout;
+    }
+    // set row pointer for destination
+    for (i = 0; i < (int)cinfo.output_height; i++) {
+      if (flags & TJFLAG_BOTTOMUP)
+        row_pointer[i] = &dstBuf[(cinfo.output_height - i - 1) * (size_t)pitch];
+      else
+        row_pointer[i] = &dstBuf[i * (size_t)pitch];
+    }
+
+    /* Process data */
+    JDIMENSION num_scanlines;
+    jpeg_skip_scanlines(&cinfo, crop_y);
+    while (cinfo.output_scanline <  crop_y + crop_height) {
+        if (cinfo.output_scanline < crop_y)
+          num_scanlines = jpeg_read_scanlines(&cinfo,  &row_pointer[cinfo.output_scanline],
+                                          crop_y + crop_height - cinfo.output_scanline);
+        else
+          num_scanlines = jpeg_read_scanlines(&cinfo,  &row_pointer[cinfo.output_scanline - crop_y],
+                                          crop_y + crop_height - cinfo.output_scanline);
+        if (num_scanlines == 0){
+          ERR("Premature end of Jpeg data. Stopped at " << cinfo.output_scanline - crop_y << "/" << cinfo.output_height)
+          retval = -1;  goto bailout;  // no progress was made: stop instead of looping forever
+        }
+    }
+    jpeg_skip_scanlines(&cinfo, cinfo.output_height - crop_y - crop_height);
+    jpeg_finish_decompress(&cinfo);
+
+    bailout:
+    jpeg_destroy_decompress(&cinfo);  // cinfo is local to this call: free everything libjpeg allocated for it
+    if (row_pointer) free(row_pointer);
+    return retval;
+}
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function scales the decoded image to fit the output dims.
+//! * Returns 0 on success and -1 when libjpeg reports an error or the crop does not fit.
+int tjDecompress2_partial_scale(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags, unsigned int crop_width, unsigned int crop_height)
+{
+    JSAMPROW *row_pointer = NULL;
+    int i, retval = 0, jpegwidth, jpegheight;
+    unsigned int scaledw, scaledh, crop_x, crop_y, max_crop_width;
+    tjscalingfactor *scalingFactors = NULL;
+    int numScalingFactors = 0;
+
+    unsigned char *tmp_row = NULL;
+    if (jpegBuf == NULL || jpegSize == 0 || dstBuf == NULL || width < 0 ||
+          pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF) {
+        THROW("tjDecompress2_partial_scale(): Invalid argument");
+    }
+    // Fetch the scaling factors before any libjpeg state exists, so this THROW cannot leak it
+    if ((scalingFactors = tj3GetScalingFactors(&numScalingFactors)) == NULL)
+        THROW("tjDecompress2_partial_scale(): error getting scaling factors");
+    struct jpeg_decompress_struct cinfo;
+    // Initialize libjpeg structures to have a memory source
+    // Modify the usual jpeg error manager to catch fatal errors.
+    struct my_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr.pub);
+    jerr.pub.error_exit = my_error_exit;
+    if (setjmp(jerr.setjmp_buffer)) {
+        /* If we get here, the JPEG code has signaled an error. */
+        retval = -1;  goto bailout;
+    }
+    // The decompress object must be created before a source can be attached to it
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, jpegBuf, jpegSize);
+    jpeg_read_header(&cinfo, TRUE);
+    cinfo.out_color_space = pf2cs[pixelFormat];
+    if (flags & TJFLAG_FASTDCT) cinfo.dct_method = JDCT_FASTEST;
+    if (flags & TJFLAG_FASTUPSAMPLE) cinfo.do_fancy_upsampling = FALSE;
+    jpegwidth = cinfo.image_width;  jpegheight = cinfo.image_height;
+    if (width == 0) width = jpegwidth;
+    if (height == 0) height = jpegheight;
+    // Pick the first scaling factor whose scaled crop fits in the output buffer
+    for (i = 0; i < numScalingFactors; i++) {
+      scaledw = TJSCALED(crop_width, scalingFactors[i]);
+      scaledh = TJSCALED(crop_height, scalingFactors[i]);
+      if (scaledw <= (unsigned int)width && scaledh <= (unsigned int)height)
+        break;
+    }
+    if (i >= numScalingFactors) {
+      jpeg_destroy_decompress(&cinfo);  // release libjpeg memory before raising
+      THROW("tjDecompress2_partial_scale(): Could not scale down to desired image dimensions");
+    }
+    if (cinfo.num_components > 3) {
+      jpeg_destroy_decompress(&cinfo);  // release libjpeg memory before raising
+      THROW("tjDecompress2_partial_scale(): JPEG image must have 3 or fewer components");
+    }
+    cinfo.scale_num = scalingFactors[i].num;
+    cinfo.scale_denom = scalingFactors[i].denom;
+
+    jpeg_start_decompress(&cinfo);
+    /* Check for valid crop dimensions.  We cannot check these values until
+    * after jpeg_start_decompress() is called.  Check before the unsigned
+    * subtractions below so an oversized crop cannot wrap around. */
+    if (scaledw > cinfo.output_width || scaledh > cinfo.output_height) {
+        ERR("crop dimensions:" << scaledw << " x " << scaledh << " exceed image dimensions" <<
+            cinfo.output_width << " x " << cinfo.output_height);
+        retval = -1;  goto bailout;
+    }
+    // Offsets of the crop window inside the scaled image
+    crop_x = cinfo.output_width - scaledw;
+    crop_y = cinfo.output_height - scaledh;
+
+    if (pitch == 0) pitch = cinfo.output_width * tjPixelSize[pixelFormat];
+    row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo.output_height);
+    // allocate row of tmp storage for storing discarded data
+    tmp_row = (unsigned char *)malloc((size_t)pitch);
+    if (row_pointer == NULL || tmp_row == NULL) {
+        free(row_pointer);  free(tmp_row);  jpeg_destroy_decompress(&cinfo);
+        THROW("tjDecompress2_partial_scale(): Memory allocation failure");
+    }
+    // Re-arm the jump target after the allocations (locals changed since the last setjmp are indeterminate after longjmp)
+    if (setjmp(jerr.setjmp_buffer)) {
+      retval = -1;  goto bailout;
+    }
+
+    for (i = 0; i < (int)cinfo.output_height; i++) {
+        if (i < height) {
+            if (flags & TJFLAG_BOTTOMUP)
+                row_pointer[i] = &dstBuf[(cinfo.output_height - i - 1) * (size_t)pitch];
+            else
+                row_pointer[i] = &dstBuf[i * (size_t)pitch];
+        } else {
+            row_pointer[i] = tmp_row;
+        }
+    }
+    // the width for the crop shouldn't exceed output_width
+    max_crop_width = scaledw;
+    jpeg_crop_scanline(&cinfo, &crop_x, &max_crop_width);
+    jpeg_skip_scanlines(&cinfo, crop_y);
+    while (cinfo.output_scanline <  cinfo.output_height) {
+      JDIMENSION dst_row = (cinfo.output_scanline < crop_y) ? cinfo.output_scanline : cinfo.output_scanline - crop_y;
+      if (jpeg_read_scanlines(&cinfo, &row_pointer[dst_row], cinfo.output_height - cinfo.output_scanline) == 0) {
+          retval = -1;  goto bailout;  // no progress was made: stop instead of looping forever
+      }
+    }
+    jpeg_finish_decompress(&cinfo);
+
+  bailout:
+    jpeg_destroy_decompress(&cinfo);  // cinfo is local to this call: free everything libjpeg allocated for it
+    if (row_pointer) free(row_pointer);
+    if (tmp_row) free(tmp_row);
+    return retval;
+}
diff --git a/rocAL_pybind/setup.py b/rocAL_pybind/setup.py
index 7d3598d35..9ee8e57ea 100644
--- a/rocAL_pybind/setup.py
+++ b/rocAL_pybind/setup.py
@@ -36,7 +36,7 @@ def has_ext_modules(self):
 setup(
     name='amd-rocal',
     description='AMD ROCm Augmentation Library',
-    url='https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/rocAL',
+    url='https://github.com/ROCm/rocAL',
     version='1.0.0',
     author='AMD',
     license='Apache License 2.0',

From 64d3a8366ead6d587d982331855c041869bd5bca Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Wed, 24 Jan 2024 14:43:21 +0000
Subject: [PATCH 25/33] Fixing build issues

---
 rocAL/include/loaders/image/numpy_loader.h         | 2 +-
 rocAL/include/loaders/image/numpy_loader_sharded.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocAL/include/loaders/image/numpy_loader.h b/rocAL/include/loaders/image/numpy_loader.h
index 2c3285561..ec3b5955e 100644
--- a/rocAL/include/loaders/image/numpy_loader.h
+++ b/rocAL/include/loaders/image/numpy_loader.h
@@ -54,7 +54,7 @@ class NumpyLoader : public LoaderModule {
     void set_prefetch_queue_depth(size_t prefetch_queue_depth) override;
     void shut_down() override;
     void feed_external_input(const std::vector<std::string>& input_images_names, const std::vector<unsigned char*>& input_buffer,
-                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, int channels, ExternalSourceFileMode mode, bool eos) override {}
+                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, unsigned int channels, ExternalSourceFileMode mode, bool eos) override {}
 
    private:
     bool is_out_of_data();
diff --git a/rocAL/include/loaders/image/numpy_loader_sharded.h b/rocAL/include/loaders/image/numpy_loader_sharded.h
index ada22c06b..ee55eff73 100644
--- a/rocAL/include/loaders/image/numpy_loader_sharded.h
+++ b/rocAL/include/loaders/image/numpy_loader_sharded.h
@@ -46,7 +46,7 @@ class NumpyLoaderSharded : public LoaderModule {
     void set_prefetch_queue_depth(size_t prefetch_queue_depth) override;
     void shut_down() override;
     void feed_external_input(const std::vector<std::string>& input_images_names, const std::vector<unsigned char*>& input_buffer,
-                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, int channels, ExternalSourceFileMode mode, bool eos) override {}
+                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, unsigned int channels, ExternalSourceFileMode mode, bool eos) override {}
 
    private:
     void increment_loader_idx();

From b02002115365ddcfbf0a7bc8a34d09124ff08256 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Wed, 24 Jan 2024 15:25:29 +0000
Subject: [PATCH 26/33] Fixing bug with ROI changes for deepcam

---
 rocAL/include/pipeline/tensor.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index 14daf513c..0fb722da0 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -205,7 +205,6 @@ class TensorInfo {
             get_modified_dims_from_layout(_layout, layout, new_dims);
             _dims = new_dims;
             modify_strides();
-            _max_shape.assign(_dims.begin() + 1, _dims.end());
         }
         _layout = layout;
         if (_layout == RocalTensorlayout::NHWC || _layout == RocalTensorlayout::NDHWC) {

From 249704711b0a6a6aab945b6e4aff458d24f08141 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 25 Jan 2024 18:37:51 +0000
Subject: [PATCH 27/33] Adding parameterVX changes

---
 rocAL/include/parameters/parameter.h          |  4 ++
 rocAL/include/parameters/parameter_factory.h  |  6 ++
 rocAL/include/parameters/parameter_random.h   | 61 ++++++++++++++++++-
 rocAL/include/parameters/parameter_simple.h   | 30 ++++++++-
 rocAL/include/parameters/parameter_vx.h       | 20 +++---
 rocAL/source/parameters/parameter_factory.cpp | 21 +++++--
 6 files changed, 125 insertions(+), 17 deletions(-)

diff --git a/rocAL/include/parameters/parameter.h b/rocAL/include/parameters/parameter.h
index 723c3dbd7..1bec7b334 100644
--- a/rocAL/include/parameters/parameter.h
+++ b/rocAL/include/parameters/parameter.h
@@ -33,6 +33,10 @@ class Parameter {
     /// used to internally renew state of the parameter if needed (for random parameters)
     virtual void renew(){};
 
+    virtual void create_array(unsigned batch_size){};
+
+    virtual std::vector<T> get_array() { return {}; };
+
     virtual ~Parameter() {}
     ///
     /// \return returns if this parameter takes a single value (vs a range of values or many values)
diff --git a/rocAL/include/parameters/parameter_factory.h b/rocAL/include/parameters/parameter_factory.h
index ccd3b4d2c..582d51fb5 100644
--- a/rocAL/include/parameters/parameter_factory.h
+++ b/rocAL/include/parameters/parameter_factory.h
@@ -29,6 +29,8 @@ THE SOFTWARE.
 #include "parameter_random.h"
 #include "parameter_simple.h"
 
+const int MAX_SEEDS = 1024;
+
 enum class RocalParameterType {
     DETERMINISTIC = 0,
     RANDOM_UNIFORM,
@@ -72,6 +74,8 @@ class ParameterFactory {
     void set_seed(unsigned seed);
     unsigned get_seed();
     void generate_seed();
+    int64_t get_seed_from_seedsequence();
+    void increment_seed_sequence_idx();
 
     template <typename T>
     Parameter<T>* create_uniform_rand_param(T start, T end) {
@@ -104,4 +108,6 @@ class ParameterFactory {
     static ParameterFactory* _instance;
     static std::mutex _mutex;
     ParameterFactory();
+    std::vector<int64_t> _seed_vector;
+    int _seed_sequence_idx = 0;
 };
diff --git a/rocAL/include/parameters/parameter_random.h b/rocAL/include/parameters/parameter_random.h
index c379a894f..0df53e856 100644
--- a/rocAL/include/parameters/parameter_random.h
+++ b/rocAL/include/parameters/parameter_random.h
@@ -51,7 +51,12 @@ class UniformRand : public Parameter<T> {
     T get() override {
         return _updated_val;
     };
-    void renew() override {
+
+    std::vector<T> get_array() override {
+        return _array;
+    }
+
+    void renew_value() {
         std::unique_lock<std::mutex> lock(_lock);
         auto val = _generator();
 
@@ -64,6 +69,21 @@ class UniformRand : public Parameter<T> {
                 ((double)val / (double)_generator.max()) * ((double)_end - (double)_start) + (double)_start);
         }
     }
+
+    void renew_array() {
+        for (uint i = 0; i < _batch_size; i++) {
+            renew_value();
+            _array[i] = _updated_val;
+        }
+    }
+
+    void renew() override {
+        if (_array.size() > 0) {
+            renew_array();
+        } else {
+            renew_value();
+        }
+    }
     int update(T start, T end) {
         std::unique_lock<std::mutex> lock(_lock);
         if (end < start)
@@ -73,6 +93,13 @@ class UniformRand : public Parameter<T> {
         _end = end;
         return 0;
     }
+
+    void create_array(unsigned batch_size) override {
+        if (_array.size() < batch_size)  // grow-only: renew_array() writes _batch_size entries, so never leave the array smaller
+            _array.resize(batch_size);
+        _batch_size = batch_size;
+    }
+
     bool single_value() const override {
         return (_start == _end);
     }
@@ -81,8 +108,10 @@ class UniformRand : public Parameter<T> {
     T _start;
     T _end;
     T _updated_val;
+    std::vector<T> _array;
     std::mt19937 _generator;
     std::mutex _lock;
+    unsigned _batch_size;
 };
 
 template <typename T>
@@ -142,7 +171,8 @@ struct CustomRand : public Parameter<T> {
     T default_value() const override {
         return static_cast<T>(_mean);
     }
-    void renew() override {
+
+    void renew_value() {
         std::unique_lock<std::mutex> lock(_lock);
         if (single_value()) {
             // If there is only a single value possible for the random variable
@@ -161,10 +191,35 @@ struct CustomRand : public Parameter<T> {
             _updated_val = _values[idx];
         }
     }
+
+    void renew_array() {
+        for (uint i = 0; i < _batch_size; i++) {
+            renew_value();
+            _array[i] = _updated_val;
+        }
+    }
+
+    void renew() override {
+        if (_array.size() > 0) {
+            renew_array();
+        } else {
+            renew_value();
+        }
+    }
     T get() override {
         return _updated_val;
     };
 
+    std::vector<T> get_array() override {
+        return _array;
+    }
+
+    void create_array(unsigned batch_size) override {
+        if (_array.size() < batch_size)  // grow-only: renew_array() writes _batch_size entries, so never leave the array smaller
+            _array.resize(batch_size);
+        _batch_size = batch_size;
+    }
+
     bool single_value() const override {
         return (_values.size() == 1);
     }
@@ -175,6 +230,8 @@ struct CustomRand : public Parameter<T> {
     std::vector<double> _comltv_dist;  //!< commulative probabilities
     double _mean;
     T _updated_val;
+    std::vector<T> _array;
     std::mt19937 _generator;
     std::mutex _lock;
+    unsigned _batch_size;
 };
\ No newline at end of file
diff --git a/rocAL/include/parameters/parameter_simple.h b/rocAL/include/parameters/parameter_simple.h
index d3fb0dc3f..c1ee1d5a2 100644
--- a/rocAL/include/parameters/parameter_simple.h
+++ b/rocAL/include/parameters/parameter_simple.h
@@ -35,11 +35,37 @@ class SimpleParameter : public Parameter<T> {
     T get() override {
         return _val;
     }
-    int update(T new_val) {
+
+    std::vector<T> get_array() override {
+        return _array;
+    }
+
+    void update_single_value(T new_val) {
         _val = new_val;
+    }
+
+    void update_array(T new_val) {
+        for (uint i = 0; i < _batch_size; i++) {
+            update_single_value(new_val);
+            _array[i] = _val;
+        }
+    }
+
+    int update(T new_val) {
+        if (_array.size() > 0)
+            update_array(new_val);
+        else
+            update_single_value(new_val);
         return 0;
     }
 
+    void create_array(unsigned batch_size) override {
+        if (_array.size() < batch_size)  // grow-only: update_array() writes _batch_size entries, so never leave the array smaller
+            _array.resize(batch_size);
+        _batch_size = batch_size;
+        update(_val);  // fill the array with the current value
+    }
+
     ~SimpleParameter() = default;
 
     bool single_value() const override {
@@ -48,6 +74,8 @@ class SimpleParameter : public Parameter<T> {
 
    private:
     T _val;
+    std::vector<T> _array;
+    unsigned _batch_size;
 };
 using pIntParam = std::shared_ptr<SimpleParameter<int>>;
 using pFloatParam = std::shared_ptr<SimpleParameter<float>>;
diff --git a/rocAL/include/parameters/parameter_vx.h b/rocAL/include/parameters/parameter_vx.h
index e63da998f..e71cd48ee 100644
--- a/rocAL/include/parameters/parameter_vx.h
+++ b/rocAL/include/parameters/parameter_vx.h
@@ -52,11 +52,12 @@ class ParameterVX {
             THROW("Reading vx scalar failed" + TOSTR(status));
     }
     void create_array(std::shared_ptr<Graph> graph, vx_enum data_type, unsigned batch_size) {
-        // _arrVal = (T*)malloc(sizeof(T) * _batch_size);
         _batch_size = batch_size;
-        _arrVal.resize(_batch_size);
+        _param->create_array(_batch_size);
         _array = vxCreateArray(vxGetContext((vx_reference)graph->get()), data_type, _batch_size);
-        vxAddArrayItems(_array, _batch_size, _arrVal.data(), sizeof(T));
+        auto status = vxAddArrayItems(_array, _batch_size, get_array().data(), sizeof(T));
+        if (status != 0)
+            THROW(" vxAddArrayItems failed in create_array (ParameterVX): " + TOSTR(status))
         update_array();
     }
     void set_param(Parameter<T>* param) {
@@ -96,11 +97,7 @@ class ParameterVX {
     }
     void update_array() {
         vx_status status;
-        for (uint i = 0; i < _batch_size; i++) {
-            _arrVal[i] = renew();
-            // INFO("update_array: " + TOSTR(i) + "," + TOSTR(_arrVal[i]));
-        }
-        status = vxCopyArrayRange((vx_array)_array, 0, _batch_size, sizeof(T), _arrVal.data(), VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
+        status = vxCopyArrayRange((vx_array)_array, 0, _batch_size, sizeof(T), get_array().data(), VX_WRITE_ONLY, VX_MEMORY_TYPE_HOST);
         if (status != 0)
             THROW(" vxCopyArrayRange failed in update_array (ParameterVX): " + TOSTR(status))
     }
@@ -109,12 +106,15 @@ class ParameterVX {
         return _param->get();
     }
 
+    std::vector<T> get_array() {
+        return _param->get_array();
+    }
+
    private:
     vx_scalar _scalar;
-    vx_array _array;
+    vx_array _array = nullptr;
     Parameter<T>* _param;
     T _val;
-    std::vector<T> _arrVal;
     unsigned _batch_size;
     unsigned OVX_PARAM_IDX;
     const T _DEFAULT_RANGE_START;
diff --git a/rocAL/source/parameters/parameter_factory.cpp b/rocAL/source/parameters/parameter_factory.cpp
index 6f3800bb4..cb31a55b8 100644
--- a/rocAL/source/parameters/parameter_factory.cpp
+++ b/rocAL/source/parameters/parameter_factory.cpp
@@ -104,33 +104,46 @@ void ParameterFactory::generate_seed() {
     _seed = rd();
 }
 
+int64_t ParameterFactory::get_seed_from_seedsequence() {
+    increment_seed_sequence_idx();
+    if (_seed_vector.empty()) return _seed;  // set_seed() is the only visible place that fills _seed_vector; never index an empty vector
+    return _seed_vector[_seed_sequence_idx];
+}
+
+void ParameterFactory::increment_seed_sequence_idx() {
+    _seed_sequence_idx = (_seed_sequence_idx + 1) % MAX_SEEDS;
+}
+
 void ParameterFactory::set_seed(unsigned seed) {
     _seed = seed;
+    _seed_vector.resize(MAX_SEEDS);
+    std::seed_seq ss{seed};
+    ss.generate(_seed_vector.begin(), _seed_vector.end());
 }
 
 IntParam* ParameterFactory::create_uniform_int_rand_param(int start, int end) {
-    auto gen = new UniformRand<int>(start, end, _seed);
+    auto gen = new UniformRand<int>(start, end, get_seed_from_seedsequence());
     auto ret = new IntParam(gen, RocalParameterType::RANDOM_UNIFORM);
     _parameters.insert(gen);
     return ret;
 }
 
 FloatParam* ParameterFactory::create_uniform_float_rand_param(float start, float end) {
-    auto gen = new UniformRand<float>(start, end, _seed);
+    auto gen = new UniformRand<float>(start, end, get_seed_from_seedsequence());
     auto ret = new FloatParam(gen, RocalParameterType::RANDOM_UNIFORM);
     _parameters.insert(gen);
     return ret;
 }
 
 IntParam* ParameterFactory::create_custom_int_rand_param(const int* value, const double* frequencies, size_t size) {
-    auto gen = new CustomRand<int>(value, frequencies, size, _seed);
+    auto gen = new CustomRand<int>(value, frequencies, size, get_seed_from_seedsequence());
     auto ret = new IntParam(gen, RocalParameterType::RANDOM_CUSTOM);
     _parameters.insert(gen);
     return ret;
 }
 
 FloatParam* ParameterFactory::create_custom_float_rand_param(const float* value, const double* frequencies, size_t size) {
-    auto gen = new CustomRand<float>(value, frequencies, size, _seed);
+    auto gen = new CustomRand<float>(value, frequencies, size, get_seed_from_seedsequence());
     auto ret = new FloatParam(gen, RocalParameterType::RANDOM_CUSTOM);
     _parameters.insert(gen);
     return ret;

From d0f9a87d367f883c56062d565187afb3a7594960 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 25 Jan 2024 18:49:40 +0000
Subject: [PATCH 28/33] Adding ROI changes for numpy reader

---
 rocAL/include/pipeline/tensor.h  | 17 +++++++++------
 rocAL/source/pipeline/tensor.cpp | 37 +++++++++++---------------------
 2 files changed, 23 insertions(+), 31 deletions(-)

diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index 244d6563c..86eb8e9be 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -183,17 +183,17 @@ class TensorInfo {
                 _channels = _dims.at(2);
             } else if (_layout == RocalTensorlayout::NDHWC) {
                 _is_image = false;
-                _max_shape.resize(3);
-                _max_shape = {_dims.at(1), _dims.at(2), _dims.at(3)};
+                _max_shape.resize(4);
+                _max_shape.assign(_dims.begin() + 1, _dims.end());
                 _channels = _dims.at(4);
             } else if (_layout == RocalTensorlayout::NCDHW) {
                 _is_image = false;
-                _max_shape.resize(3);
-                _max_shape = {_dims.at(2), _dims.at(3), _dims.at(4)};
+                _max_shape.resize(4);
+                _max_shape.assign(_dims.begin() + 1, _dims.end());
                 _channels = _dims.at(1);
             }
         } else {
-            if (!_max_shape.size()) _max_shape.resize(_num_of_dims - 1, 0);  // Since 2 values will be stored in the vector
+            if (!_max_shape.size()) _max_shape.resize(_num_of_dims - 1, 0);
             _max_shape.assign(_dims.begin() + 1, _dims.end());
         }
         reset_tensor_roi_buffers();
@@ -207,8 +207,11 @@ class TensorInfo {
             modify_strides();
         }
         _layout = layout;
-        if (_layout == RocalTensorlayout::NONE)
-            set_max_shape();
+        if (_layout == RocalTensorlayout::NHWC || _layout == RocalTensorlayout::NDHWC) {
+            _channels = _dims.back();
+        } else if (_layout == RocalTensorlayout::NCHW || _layout == RocalTensorlayout::NCDHW) {
+            _channels = _dims.at(1);
+        }
     }
     void set_dims(std::vector<size_t>& new_dims) {
         if (_num_of_dims == new_dims.size()) {
diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index e35428bf6..69fd8c60f 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -77,6 +77,10 @@ vx_enum interpret_tensor_data_type(RocalTensorDataType data_type) {
             return VX_TYPE_FLOAT16;
         case RocalTensorDataType::UINT8:
             return VX_TYPE_UINT8;
+        case RocalTensorDataType::UINT32:
+            return VX_TYPE_UINT32;
+        case RocalTensorDataType::INT32:
+            return VX_TYPE_INT32;
         default:
             THROW("Unsupported Tensor type " + TOSTR(data_type))
     }
@@ -108,30 +112,21 @@ bool operator==(const TensorInfo &rhs, const TensorInfo &lhs) {
 
 void TensorInfo::reset_tensor_roi_buffers() {
     unsigned *roi_buf;
-    auto roi_no_of_dims = _is_image ? 2 : (_num_of_dims - 2);
+    auto roi_no_of_dims = _is_image ? 2 : (_num_of_dims - 1);
     auto roi_size = (_layout == RocalTensorlayout::NFCHW || _layout == RocalTensorlayout::NFHWC) ? _dims[0] * _dims[1] : _batch_size;  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
     allocate_host_or_pinned_mem((void **)&roi_buf, roi_size * roi_no_of_dims * 2 * sizeof(unsigned), _mem_type);
     _roi.set_ptr(roi_buf, _mem_type, roi_size, roi_no_of_dims);
-    if (_layout == RocalTensorlayout::NCDHW) {
-        for (unsigned i = 0; i < _batch_size; i++) {
-            unsigned *tensor_shape = _roi[i].end;
-            tensor_shape[2] = _max_shape[1];
-            tensor_shape[1] = _max_shape[2];
-            tensor_shape[0] = _max_shape[3];
-        }
-    } else if (_layout == RocalTensorlayout::NDHWC) {
-        for (unsigned i = 0; i < _batch_size; i++) {
-            unsigned *tensor_shape = _roi[i].end;
-            tensor_shape[2] = _max_shape[0];
-            tensor_shape[1] = _max_shape[1];
-            tensor_shape[0] = _max_shape[2];
-        }
-    } else if (_is_image) {
+    if (_is_image) {
         Roi2DCords *roi = _roi.get_2D_roi();
         for (unsigned i = 0; i < _batch_size; i++) {
             roi[i].xywh.w = _max_shape.at(0);
             roi[i].xywh.h = _max_shape.at(1);
         }
+    } else {
+        for (unsigned i = 0; i < _batch_size; i++) {
+            unsigned *tensor_shape = _roi[i].end;
+            for (unsigned j = 0; j < roi_no_of_dims && j < _max_shape.size(); j++) tensor_shape[j] = _max_shape[j];  // fill every ROI dim of sample i; indexing by the batch index i was out of range for large batches
+        }
     }
 }
 
@@ -226,14 +221,8 @@ void Tensor::update_tensor_roi(const std::vector<std::vector<uint32_t>> &shape)
             THROW("The number of dims to be updated and the num of dims of tensor info does not match")
         
         unsigned *tensor_shape = _info.roi()[i].end;
-        if (_info.layout() == RocalTensorlayout::NCDHW) {
-            tensor_shape[2] = shape[i][1] > max_shape[1] ? max_shape[1] : shape[i][1];
-            tensor_shape[1] = shape[i][2] > max_shape[2] ? max_shape[2] : shape[i][2];
-            tensor_shape[0] = shape[i][3] > max_shape[3] ? max_shape[3] : shape[i][3];
-        } else if (_info.layout() == RocalTensorlayout::NDHWC) {
-            tensor_shape[2] = shape[i][0] > max_shape[0] ? max_shape[0] : shape[i][0];
-            tensor_shape[1] = shape[i][1] > max_shape[1] ? max_shape[1] : shape[i][1];
-            tensor_shape[0] = shape[i][2] > max_shape[2] ? max_shape[2] : shape[i][2];
+        for (unsigned j = 0; j < max_shape.size(); j++) {
+            tensor_shape[j] = shape[i][j] > max_shape[j] ? max_shape[j] : shape[i][j];
         }
     }
 }

From 338fe5ea6cd93e872157875a4e8e389439fe9de9 Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 25 Jan 2024 18:50:01 +0000
Subject: [PATCH 29/33] Adding setLayout function for numpy reader

---
 rocAL/include/api/rocal_api_augmentation.h  |  2 ++
 rocAL/source/api/rocal_api_augmentation.cpp | 23 +++++++++++++++++++++
 rocAL_pybind/rocal_pybind.cpp               |  2 ++
 3 files changed, 27 insertions(+)

diff --git a/rocAL/include/api/rocal_api_augmentation.h b/rocAL/include/api/rocal_api_augmentation.h
index e9c9a68b0..0ff3b08f5 100644
--- a/rocAL/include/api/rocal_api_augmentation.h
+++ b/rocAL/include/api/rocal_api_augmentation.h
@@ -1098,4 +1098,6 @@ extern "C" RocalTensor ROCAL_API_CALL rocalSSDRandomCrop(RocalContext context, R
                                                          RocalTensorLayout output_layout = ROCAL_NONE,
                                                          RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+extern "C" RocalTensor ROCAL_API_CALL rocalSetLayout(RocalContext context, RocalTensor input,
+                                                     RocalTensorLayout output_layout = ROCAL_NONE);
 #endif  // MIVISIONX_ROCAL_API_AUGMENTATION_H
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index 33fb5b57a..f746c3c4f 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -2155,3 +2155,26 @@ rocalNop(
     }
     return output;
 }
+
+RocalTensor ROCAL_API_CALL
+rocalSetLayout(
+    RocalContext p_context,
+    RocalTensor p_input,
+    RocalTensorLayout output_layout) {
+    Tensor* output = nullptr;  // only returned on the invalid-argument path below
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        RocalTensorlayout op_tensor_layout = static_cast<RocalTensorlayout>(output_layout);
+        input->set_layout(op_tensor_layout);  // called on the input tensor itself; this function creates no new tensor
+    } catch (const std::exception& e) {  // the error is recorded on the context and logged; the input handle is still returned below
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return input;
+}
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index ae0623f20..45235dcd7 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -641,6 +641,8 @@ PYBIND11_MODULE(rocal_pybind, m) {
     m.def("rocalResetLoaders", &rocalResetLoaders);
     m.def("videoMetaDataReader", &rocalCreateVideoLabelReader, py::return_value_policy::reference);
     // rocal_api_augmentation.h
+    m.def("setLayout", &rocalSetLayout,
+          py::return_value_policy::reference);
     m.def("ssdRandomCrop", &rocalSSDRandomCrop,
           py::return_value_policy::reference);
     m.def("resize", &rocalResize,

From dd6c61660dc741c8baa2ac426bb7f41cac343b2a Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 25 Jan 2024 19:03:12 +0000
Subject: [PATCH 30/33] Fixing numpy header order issue in numpy reader

---
 .../include/readers/image/numpy_data_reader.h |  1 +
 .../readers/image/numpy_data_reader.cpp       | 33 ++++++++++---------
 2 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/rocAL/include/readers/image/numpy_data_reader.h b/rocAL/include/readers/image/numpy_data_reader.h
index cfee3e3c4..70c947a4f 100644
--- a/rocAL/include/readers/image/numpy_data_reader.h
+++ b/rocAL/include/readers/image/numpy_data_reader.h
@@ -83,6 +83,7 @@ class NumpyDataReader : public Reader {
     unsigned _curr_file_idx;
     FILE* _current_fPtr;
     unsigned _current_file_size;
+    NumpyHeaderData _curr_file_header;
     std::string _last_id;
     std::string _last_file_name;
     size_t _shard_id = 0;
diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index aba25c480..8090514a7 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -89,7 +89,8 @@ void NumpyDataReader::incremenet_read_ptr() {
 }
 
 size_t NumpyDataReader::open() {
-    auto file_path = _file_names[_curr_file_idx];  // Get next file name
+    auto file_path = _file_names[_curr_file_idx];  // Get current file name
+    _curr_file_header = _file_headers[_curr_file_idx];  // Get current file header
     incremenet_read_ptr();
     _last_id = file_path;
     auto last_slash_idx = _last_id.find_last_of("\\/");
@@ -97,10 +98,10 @@ size_t NumpyDataReader::open() {
         _last_id.erase(0, last_slash_idx + 1);
     }
 
-    auto ret = GetFromCache(file_path, _file_headers[_curr_file_idx]);
+    auto ret = GetFromCache(file_path, _curr_file_header);
     if (!ret) {
-        ParseHeader(_file_headers[_curr_file_idx], file_path);
-        UpdateCache(file_path, _file_headers[_curr_file_idx]);
+        ParseHeader(_curr_file_header, file_path);
+        UpdateCache(file_path, _curr_file_header);
     } else {
         _current_fPtr = std::fopen(file_path.c_str(), "rb");
         if (_current_fPtr == nullptr)
@@ -108,7 +109,7 @@ size_t NumpyDataReader::open() {
     }
     fseek(_current_fPtr, 0, SEEK_SET);  // Take the file pointer back to the start
 
-    return _file_headers[_curr_file_idx].nbytes();
+    return _curr_file_header.nbytes();
 }
 
 bool NumpyDataReader::GetFromCache(const std::string& file_name, NumpyHeaderData& header) {
@@ -321,10 +322,10 @@ size_t NumpyDataReader::read_numpy_data(void* buf, size_t read_size, std::vector
     // Requested read size bigger than the file size? just read as many bytes as the file size
     read_size = (read_size > _current_file_size) ? _current_file_size : read_size;
 
-    if (std::fseek(_current_fPtr, _file_headers[_curr_file_idx]._data_offset, SEEK_SET))
+    if (std::fseek(_current_fPtr, _curr_file_header._data_offset, SEEK_SET))
         THROW("Seek operation failed: " + std::strerror(errno));
 
-    auto shape = _file_headers[_curr_file_idx].shape();
+    auto shape = _curr_file_header.shape();
     auto num_dims = max_shape.size();
     std::vector<unsigned> strides(num_dims + 1);
     strides[num_dims] = 1;
@@ -333,21 +334,21 @@ size_t NumpyDataReader::read_numpy_data(void* buf, size_t read_size, std::vector
     }
 
     size_t actual_read_size = 0;
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::UINT8)
+    if (_curr_file_header.type() == RocalTensorDataType::UINT8)
         actual_read_size = ParseNumpyData<u_int8_t>((u_int8_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::UINT32)
+    if (_curr_file_header.type() == RocalTensorDataType::UINT32)
         actual_read_size = ParseNumpyData<u_int32_t>((u_int32_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::INT8)
+    if (_curr_file_header.type() == RocalTensorDataType::INT8)
         actual_read_size = ParseNumpyData<int8_t>((int8_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::INT32)
+    if (_curr_file_header.type() == RocalTensorDataType::INT32)
         actual_read_size = ParseNumpyData<int32_t>((int32_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::FP16)
+    if (_curr_file_header.type() == RocalTensorDataType::FP16)
 #if defined(AMD_FP16_SUPPORT)
         actual_read_size = ParseNumpyData<half>((half*)buf, strides, shape);
 #else
         THROW("FLOAT16 type tensor not supported")
 #endif
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::FP32)
+    if (_curr_file_header.type() == RocalTensorDataType::FP32)
         actual_read_size = ParseNumpyData<float>((float*)buf, strides, shape);
 
     return actual_read_size;
@@ -369,7 +370,7 @@ size_t NumpyDataReader::ParseNumpyData(T* buf, std::vector<unsigned> strides, st
 }
 
 const NumpyHeaderData NumpyDataReader::get_numpy_header_data() {
-    return _file_headers[_curr_file_idx];
+    return _curr_file_header;
 }
 
 size_t NumpyDataReader::read_data(unsigned char* buf, size_t read_size) {
@@ -379,10 +380,10 @@ size_t NumpyDataReader::read_data(unsigned char* buf, size_t read_size) {
     // Requested read size bigger than the file size? just read as many bytes as the file size
     read_size = (read_size > _current_file_size) ? _current_file_size : read_size;
 
-    if (std::fseek(_current_fPtr, _file_headers[_curr_file_idx]._data_offset, SEEK_SET))
+    if (std::fseek(_current_fPtr, _curr_file_header._data_offset, SEEK_SET))
         THROW("Seek operation failed: " + std::strerror(errno));
 
-    size_t actual_read_size = std::fread(buf, 1, _file_headers[_curr_file_idx].nbytes(), _current_fPtr);
+    size_t actual_read_size = std::fread(buf, 1, _curr_file_header.nbytes(), _current_fPtr);
     return actual_read_size;
 }
 

From 23b193b2ab8f06057475574132f5db93358e1b8e Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Thu, 25 Jan 2024 19:30:41 +0000
Subject: [PATCH 31/33] Formatting changes for numpy data reader

---
 .../include/readers/image/numpy_data_reader.h |  26 ++---
 .../readers/image/numpy_data_reader.cpp       | 103 +++++++++---------
 2 files changed, 65 insertions(+), 64 deletions(-)

diff --git a/rocAL/include/readers/image/numpy_data_reader.h b/rocAL/include/readers/image/numpy_data_reader.h
index 70c947a4f..bc814a046 100644
--- a/rocAL/include/readers/image/numpy_data_reader.h
+++ b/rocAL/include/readers/image/numpy_data_reader.h
@@ -24,9 +24,9 @@ THE SOFTWARE.
 #include <dirent.h>
 
 #include <memory>
+#include <mutex>
 #include <string>
 #include <vector>
-#include <mutex>
 
 #include "commons.h"
 #include "image_reader.h"
@@ -102,23 +102,23 @@ class NumpyDataReader : public Reader {
     size_t _file_count_all_shards;
     std::mutex _cache_mutex_;
     std::map<std::string, NumpyHeaderData> _header_cache_;
-    const RocalTensorDataType TypeFromNumpyStr(const std::string& format);
-    inline void SkipSpaces(const char*& ptr);
-    void ParseHeaderContents(NumpyHeaderData& target, const std::string& header);
+    const RocalTensorDataType get_numpy_dtype(const std::string& format);
+    inline void ignore_spaces(const char*& ptr);
+    void decode_header(NumpyHeaderData& target, const std::string& header);
     template <size_t N>
-    void Skip(const char*& ptr, const char (&what)[N]);
+    void skip_string(const char*& ptr, const char (&what)[N]);
     template <size_t N>
-    bool TrySkip(const char*& ptr, const char (&what)[N]);
+    bool check_and_skip_string(const char*& ptr, const char (&what)[N]);
     template <size_t N>
-    void SkipFieldName(const char*& ptr, const char (&name)[N]);
+    void skip_field(const char*& ptr, const char (&name)[N]);
     template <typename T = int64_t>
-    T ParseInteger(const char*& ptr);
-    std::string ParseStringValue(const char*& input, char delim_start = '\'', char delim_end = '\'');
-    void ParseHeader(NumpyHeaderData& parsed_header, std::string file_path);
+    T parse_int(const char*& ptr);
+    std::string read_dtype_string(const char*& input, char delim_start = '\'', char delim_end = '\'');
+    void read_header(NumpyHeaderData& parsed_header, std::string file_path);
     template <typename T>
-    size_t ParseNumpyData(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim = 0);
-    bool GetFromCache(const std::string& file_name, NumpyHeaderData& target);
-    void UpdateCache(const std::string& file_name, const NumpyHeaderData& value);   
+    size_t copy_array_data(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim = 0);
+    bool get_cached_header(const std::string& file_name, NumpyHeaderData& target);
+    void update_header_cache(const std::string& file_name, const NumpyHeaderData& value);
     void incremenet_read_ptr();
     int release();
     size_t get_file_shard_id();
diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index 8090514a7..2f2171509 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -25,9 +25,10 @@ THE SOFTWARE.
 #include <commons.h>
 
 #include <algorithm>
+#include <cassert>
 #include <numeric>
 #include <random>
-#include <cassert>
+
 #include "filesystem.h"
 
 NumpyDataReader::NumpyDataReader() : _shuffle_time("shuffle_time", DBG_TIMING) {
@@ -89,7 +90,7 @@ void NumpyDataReader::incremenet_read_ptr() {
 }
 
 size_t NumpyDataReader::open() {
-    auto file_path = _file_names[_curr_file_idx];  // Get current file name
+    auto file_path = _file_names[_curr_file_idx];       // Get current file name
     _curr_file_header = _file_headers[_curr_file_idx];  // Get current file header
     incremenet_read_ptr();
     _last_id = file_path;
@@ -98,10 +99,10 @@ size_t NumpyDataReader::open() {
         _last_id.erase(0, last_slash_idx + 1);
     }
 
-    auto ret = GetFromCache(file_path, _curr_file_header);
+    auto ret = get_cached_header(file_path, _curr_file_header);
     if (!ret) {
-        ParseHeader(_curr_file_header, file_path);
-        UpdateCache(file_path, _curr_file_header);
+        read_header(_curr_file_header, file_path);
+        update_header_cache(file_path, _curr_file_header);
     } else {
         _current_fPtr = std::fopen(file_path.c_str(), "rb");
         if (_current_fPtr == nullptr)
@@ -112,7 +113,7 @@ size_t NumpyDataReader::open() {
     return _curr_file_header.nbytes();
 }
 
-bool NumpyDataReader::GetFromCache(const std::string& file_name, NumpyHeaderData& header) {
+bool NumpyDataReader::get_cached_header(const std::string& file_name, NumpyHeaderData& header) {
     std::unique_lock<std::mutex> cache_lock(_cache_mutex_);
     auto it = _header_cache_.find(file_name);
     if (it == _header_cache_.end()) {
@@ -123,12 +124,12 @@ bool NumpyDataReader::GetFromCache(const std::string& file_name, NumpyHeaderData
     }
 }
 
-void NumpyDataReader::UpdateCache(const std::string& file_name, const NumpyHeaderData& value) {
+void NumpyDataReader::update_header_cache(const std::string& file_name, const NumpyHeaderData& value) {
     std::unique_lock<std::mutex> cache_lock(_cache_mutex_);
     _header_cache_[file_name] = value;
 }
 
-const RocalTensorDataType NumpyDataReader::TypeFromNumpyStr(const std::string& format) {
+const RocalTensorDataType NumpyDataReader::get_numpy_dtype(const std::string& format) {
     if (format == "u1") return RocalTensorDataType::UINT8;
     // if (format == "u2") return TypeTable::GetTypeInfo<uint16_t>();   // Currently not supported in rocAL
     if (format == "u4") return RocalTensorDataType::UINT32;
@@ -148,20 +149,20 @@ const RocalTensorDataType NumpyDataReader::TypeFromNumpyStr(const std::string& f
     THROW("Unknown Numpy type string");
 }
 
-inline void NumpyDataReader::SkipSpaces(const char*& ptr) {
+inline void NumpyDataReader::ignore_spaces(const char*& ptr) {
     while (::isspace(*ptr))
         ptr++;
 }
 
 template <size_t N>
-void NumpyDataReader::Skip(const char*& ptr, const char (&what)[N]) {
+void NumpyDataReader::skip_string(const char*& ptr, const char (&what)[N]) {
     if (strncmp(ptr, what, N - 1))
         THROW("Found wrong symbol during parsing");
     ptr += N - 1;
 }
 
 template <size_t N>
-bool NumpyDataReader::TrySkip(const char*& ptr, const char (&what)[N]) {
+bool NumpyDataReader::check_and_skip_string(const char*& ptr, const char (&what)[N]) {
     if (!strncmp(ptr, what, N - 1)) {
         ptr += N - 1;
         return true;
@@ -171,18 +172,18 @@ bool NumpyDataReader::TrySkip(const char*& ptr, const char (&what)[N]) {
 }
 
 template <size_t N>
-void NumpyDataReader::SkipFieldName(const char*& ptr, const char (&name)[N]) {
-    SkipSpaces(ptr);
-    Skip(ptr, "'");
-    Skip(ptr, name);
-    Skip(ptr, "'");
-    SkipSpaces(ptr);
-    Skip(ptr, ":");
-    SkipSpaces(ptr);
+void NumpyDataReader::skip_field(const char*& ptr, const char (&name)[N]) {
+    ignore_spaces(ptr);
+    skip_string(ptr, "'");
+    skip_string(ptr, name);
+    skip_string(ptr, "'");
+    ignore_spaces(ptr);
+    skip_string(ptr, ":");
+    ignore_spaces(ptr);
 }
 
 template <typename T = int64_t>
-T NumpyDataReader::ParseInteger(const char*& ptr) {
+T NumpyDataReader::parse_int(const char*& ptr) {
     char* out_ptr = const_cast<char*>(ptr);  // strtol takes a non-const pointer
     T value = static_cast<T>(strtol(ptr, &out_ptr, 10));
     if (out_ptr == ptr)
@@ -191,7 +192,7 @@ T NumpyDataReader::ParseInteger(const char*& ptr) {
     return value;
 }
 
-std::string NumpyDataReader::ParseStringValue(const char*& input, char delim_start, char delim_end) {
+std::string NumpyDataReader::read_dtype_string(const char*& input, char delim_start, char delim_end) {
     if (*input++ != delim_start)
         THROW("Expected \'" + std::to_string(delim_start) + "\'");
     std::string out;
@@ -229,39 +230,39 @@ std::string NumpyDataReader::ParseStringValue(const char*& input, char delim_sta
     return out;
 }
 
-void NumpyDataReader::ParseHeaderContents(NumpyHeaderData& target, const std::string& header) {
+void NumpyDataReader::decode_header(NumpyHeaderData& target, const std::string& header) {
     const char* hdr = header.c_str();
-    SkipSpaces(hdr);
-    Skip(hdr, "{");
-    SkipFieldName(hdr, "descr");
-    auto typestr = ParseStringValue(hdr);
+    ignore_spaces(hdr);
+    skip_string(hdr, "{");
+    skip_field(hdr, "descr");
+    auto typestr = read_dtype_string(hdr);
     // < means LE, | means N/A, = means native. In all those cases, we can read
     bool little_endian = (typestr[0] == '<' || typestr[0] == '|' || typestr[0] == '=');
     if (!little_endian)
         THROW("Big Endian files are not supported.");
-    target._type_info = TypeFromNumpyStr(typestr.substr(1));
+    target._type_info = get_numpy_dtype(typestr.substr(1));
 
-    SkipSpaces(hdr);
-    Skip(hdr, ",");
-    SkipFieldName(hdr, "fortran_order");
-    if (TrySkip(hdr, "True")) {
+    ignore_spaces(hdr);
+    skip_string(hdr, ",");
+    skip_field(hdr, "fortran_order");
+    if (check_and_skip_string(hdr, "True")) {
         target._fortran_order = true;
-    } else if (TrySkip(hdr, "False")) {
+    } else if (check_and_skip_string(hdr, "False")) {
         target._fortran_order = false;
     } else {
         THROW("Failed to parse fortran_order field.");
     }
-    SkipSpaces(hdr);
-    Skip(hdr, ",");
-    SkipFieldName(hdr, "shape");
-    Skip(hdr, "(");
-    SkipSpaces(hdr);
+    ignore_spaces(hdr);
+    skip_string(hdr, ",");
+    skip_field(hdr, "shape");
+    skip_string(hdr, "(");
+    ignore_spaces(hdr);
     target._shape.clear();
     while (*hdr != ')') {
-        // ParseInteger already skips the leading spaces (strtol does).
-        target._shape.push_back(static_cast<unsigned>(ParseInteger<int64_t>(hdr)));
-        SkipSpaces(hdr);
-        if (!(TrySkip(hdr, ",")) && (target._shape.size() <= 1))
+        // parse_int already skips the leading spaces (strtol does).
+        target._shape.push_back(static_cast<unsigned>(parse_int<int64_t>(hdr)));
+        ignore_spaces(hdr);
+        if (!(check_and_skip_string(hdr, ",")) && (target._shape.size() <= 1))
             THROW("The first number in a tuple must be followed by a comma.");
     }
     if (target._fortran_order) {
@@ -270,7 +271,7 @@ void NumpyDataReader::ParseHeaderContents(NumpyHeaderData& target, const std::st
     }
 }
 
-void NumpyDataReader::ParseHeader(NumpyHeaderData& parsed_header, std::string file_path) {
+void NumpyDataReader::read_header(NumpyHeaderData& parsed_header, std::string file_path) {
     // check if the file is actually a numpy file
     std::vector<char> token(128);
     _current_fPtr = std::fopen(file_path.c_str(), "rb");
@@ -311,7 +312,7 @@ void NumpyDataReader::ParseHeader(NumpyHeaderData& parsed_header, std::string fi
     if (std::fseek(_current_fPtr, offset, SEEK_SET))
         THROW("Seek operation failed: " + std::strerror(errno));
 
-    ParseHeaderContents(parsed_header, header);
+    decode_header(parsed_header, header);
     parsed_header._data_offset = offset;
 }
 
@@ -335,27 +336,27 @@ size_t NumpyDataReader::read_numpy_data(void* buf, size_t read_size, std::vector
 
     size_t actual_read_size = 0;
     if (_curr_file_header.type() == RocalTensorDataType::UINT8)
-        actual_read_size = ParseNumpyData<u_int8_t>((u_int8_t*)buf, strides, shape);
+        actual_read_size = copy_array_data<u_int8_t>((u_int8_t*)buf, strides, shape);
     if (_curr_file_header.type() == RocalTensorDataType::UINT32)
-        actual_read_size = ParseNumpyData<u_int32_t>((u_int32_t*)buf, strides, shape);
+        actual_read_size = copy_array_data<u_int32_t>((u_int32_t*)buf, strides, shape);
     if (_curr_file_header.type() == RocalTensorDataType::INT8)
-        actual_read_size = ParseNumpyData<int8_t>((int8_t*)buf, strides, shape);
+        actual_read_size = copy_array_data<int8_t>((int8_t*)buf, strides, shape);
     if (_curr_file_header.type() == RocalTensorDataType::INT32)
-        actual_read_size = ParseNumpyData<int32_t>((int32_t*)buf, strides, shape);
+        actual_read_size = copy_array_data<int32_t>((int32_t*)buf, strides, shape);
     if (_curr_file_header.type() == RocalTensorDataType::FP16)
 #if defined(AMD_FP16_SUPPORT)
-        actual_read_size = ParseNumpyData<half>((half*)buf, strides, shape);
+        actual_read_size = copy_array_data<half>((half*)buf, strides, shape);
 #else
         THROW("FLOAT16 type tensor not supported")
 #endif
     if (_curr_file_header.type() == RocalTensorDataType::FP32)
-        actual_read_size = ParseNumpyData<float>((float*)buf, strides, shape);
+        actual_read_size = copy_array_data<float>((float*)buf, strides, shape);
 
     return actual_read_size;
 }
 
 template <typename T>
-size_t NumpyDataReader::ParseNumpyData(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim) {
+size_t NumpyDataReader::copy_array_data(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim) {
     if (dim == (shapes.size() - 1)) {
         auto actual_read_size = std::fread(buf, sizeof(T), shapes[dim], _current_fPtr);
         return actual_read_size;
@@ -363,7 +364,7 @@ size_t NumpyDataReader::ParseNumpyData(T* buf, std::vector<unsigned> strides, st
     T* startPtr = buf;
     size_t read_size = 0;
     for (unsigned d = 0; d < shapes[dim]; d++) {
-        read_size += ParseNumpyData<T>(startPtr, strides, shapes, dim + 1);
+        read_size += copy_array_data<T>(startPtr, strides, shapes, dim + 1);
         startPtr += strides[dim + 1];
     }
     return read_size;

From 14079e031968865484918d55c3b6b677f88677fc Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Mon, 29 Jan 2024 08:43:56 +0000
Subject: [PATCH 32/33] Adding cast augmentation API

---
 rocAL/include/api/rocal_api_augmentation.h  | 11 ++++++++
 rocAL/source/api/rocal_api_augmentation.cpp | 29 +++++++++++++++++++++
 2 files changed, 40 insertions(+)

diff --git a/rocAL/include/api/rocal_api_augmentation.h b/rocAL/include/api/rocal_api_augmentation.h
index 71264e42c..7cb74e75e 100644
--- a/rocAL/include/api/rocal_api_augmentation.h
+++ b/rocAL/include/api/rocal_api_augmentation.h
@@ -1200,6 +1200,17 @@ extern "C" RocalTensor ROCAL_API_CALL rocalSSDRandomCrop(RocalContext context, R
                                                          RocalTensorLayout output_layout = ROCAL_NONE,
                                                          RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/** \brief Cast input tensor from one data type to another
+ * \param [in] context Rocal context
+ * \param [in] input Input tensor
+ * \param [in] is_output Sets if the output is to be given to user or as intermediate buffer
+ * \param [in] output_datatype Datatype of the output tensor (defaults to ROCAL_UINT8)
+ * \return RocalTensor holding the cast result; nullptr when context or input is null
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalCast(RocalContext context, RocalTensor input,
+                                                bool is_output,
+                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
 extern "C" RocalTensor ROCAL_API_CALL rocalSetLayout(RocalContext context, RocalTensor input,
                                                      RocalTensorLayout output_layout = ROCAL_NONE);
 #endif  // MIVISIONX_ROCAL_API_AUGMENTATION_H
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index efd233c93..ea1c3344c 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -2326,6 +2326,35 @@ rocalNop(
     return output;
 }
 
+RocalTensor ROCAL_API_CALL rocalCast(RocalContext p_context, RocalTensor p_input,
+                                     bool is_output,
+                                     RocalTensorOutputType output_datatype) {
+    // Adds a CastNode to the graph, or a CopyNode when the requested type already matches the input.
+    Tensor* output = nullptr;  // returned unchanged when validation fails or tensor creation throws
+    if (!p_context || !p_input) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        const auto target_type = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        const bool needs_cast = (output_info.data_type() != target_type);
+        if (needs_cast)
+            output_info.set_data_type(target_type);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        if (needs_cast)
+            context->master_graph->add_node<CastNode>({input}, {output});
+        else
+            context->master_graph->add_node<CopyNode>({input}, {output});
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalSetLayout(
     RocalContext p_context,

From e56f1622580c0f548d0c4a7fc2271c6120c7fb0b Mon Sep 17 00:00:00 2001
From: SundarRajan98 <svaithiy@amd.com>
Date: Tue, 30 Jan 2024 19:25:25 +0000
Subject: [PATCH 33/33] Modifying vx_roi_handle creation for generic ROI

---
 rocAL/source/pipeline/tensor.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index 2cab4ce79..043c08319 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -333,18 +333,21 @@ void Tensor::create_roi_tensor_from_handle(void **handle) {
         THROW("Empty ROI handle is passed")
     }
 
-    vx_size num_of_dims = 2;
-    vx_size stride[num_of_dims];
-    std::vector<size_t> roi_dims = {_info.batch_size(), 4};
+    const auto is_image_tensor = _info.is_image();
+    constexpr vx_size roi_num_of_dims = 2;  // rank of the ROI tensor itself; compile-time constant so stride[] is a fixed-size array, not a VLA
+    vx_size num_of_dims = is_image_tensor ? 2 : (_info.num_of_dims() - 1);  // dimensions described by each ROI entry
+    std::vector<size_t> roi_dims;
+    roi_dims = {_info.batch_size(), num_of_dims * 2};  // two ROI values per described dimension for every sample in the batch
     if (_info.layout() == RocalTensorlayout::NFCHW || _info.layout() == RocalTensorlayout::NFHWC)
         roi_dims = {_info.dims()[0] * _info.dims()[1], 4};  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions        stride[0] = sizeof(vx_uint32);
+    vx_size stride[roi_num_of_dims];
     stride[0] = sizeof(vx_uint32);
     stride[1] = stride[0] * roi_dims[0];
     vx_enum mem_type = VX_MEMORY_TYPE_HOST;
     if (_info.mem_type() == RocalMemType::HIP)
         mem_type = VX_MEMORY_TYPE_HIP;
 
-    _vx_roi_handle = vxCreateTensorFromHandle(_context, num_of_dims, roi_dims.data(),
+    _vx_roi_handle = vxCreateTensorFromHandle(_context, roi_num_of_dims, roi_dims.data(),
                                               VX_TYPE_UINT32, 0, stride, *handle, mem_type);
     vx_status status;
     if ((status = vxGetStatus((vx_reference)_vx_roi_handle)) != VX_SUCCESS)