diff --git a/apps/image_augmentation/image_augmentation.cpp b/apps/image_augmentation/image_augmentation.cpp
index 1286d1862..a21b89393 100644
--- a/apps/image_augmentation/image_augmentation.cpp
+++ b/apps/image_augmentation/image_augmentation.cpp
@@ -48,12 +48,12 @@ int main(int argc, const char** argv) {
     if (argc < MIN_ARG_COUNT) {
         printf(
             "Usage: image_augmentation <image_dataset_folder/video_file> <processing_device=1/cpu=0>  \
-              decode_width decode_height video_mode gray_scale/rgb display_on_off decode_shard_count  <shuffle:0/1> <jpeg_dec_mode<0(tjpeg)/1(opencv)/2(hwdec)>\n");
+              decode_width decode_height decoder_mode gray_scale/rgb display_on_off decode_shard_count  <shuffle:0/1> <jpeg_dec_mode<0(tjpeg)/1(opencv)/2(hwdec)>\n");
         return -1;
     }
     int argIdx = 0;
     const char* folderPath1 = argv[++argIdx];
-    int video_mode = 0;  // 0 means no video decode, 1 means hardware, 2 means software decoding
+    int decoder_mode = 0;  // 0 = JPEG file loader, 1 = fused JPEG crop decoder, 2 = software video decode, >2 = hardware video decode
     bool display = 1;    // Display the images
     int aug_depth = 1;   // how deep is the augmentation tree
     int rgb = 1;         // process color images
@@ -62,7 +62,7 @@ int main(int argc, const char** argv) {
     bool processing_device = 1;
     size_t shard_count = 2;
     int shuffle = 0;
-    int dec_mode = 0;
+    int decoder_type = 0;
     const char *outName = "image_augmentation_app.png";
 
     if (argc >= argIdx + MIN_ARG_COUNT)
@@ -75,7 +75,7 @@ int main(int argc, const char** argv) {
         decode_height = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
-        video_mode = atoi(argv[++argIdx]);
+        decoder_mode = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
         rgb = atoi(argv[++argIdx]);
@@ -90,7 +90,7 @@ int main(int argc, const char** argv) {
         shuffle = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
-        dec_mode = atoi(argv[++argIdx]);
+        decoder_type = atoi(argv[++argIdx]);
 
     if (argc >= argIdx + MIN_ARG_COUNT)
         outName = argv[++argIdx];
@@ -108,7 +108,7 @@ int main(int argc, const char** argv) {
         return -1;
     }
 
-    RocalDecoderType dec_type = (RocalDecoderType)dec_mode;
+    RocalDecoderType dec_type = (RocalDecoderType)decoder_type;
 
     /*>>>>>>>>>>>>>>>> Creating rocAL parameters  <<<<<<<<<<<<<<<<*/
 
@@ -126,7 +126,7 @@ int main(int argc, const char** argv) {
     /*>>>>>>>>>>>>>>>>>>> Graph description <<<<<<<<<<<<<<<<<<<*/
     RocalTensor input1;
 
-    if (video_mode != 0) {
+    if (decoder_mode >= 2) {
         unsigned sequence_length = 3;
         unsigned frame_step = 3;
         unsigned frame_stride = 1;
@@ -134,7 +134,12 @@ int main(int argc, const char** argv) {
             std::cout << "Output width and height is needed for video decode\n";
             return -1;
         }
-        input1 = rocalVideoFileSource(handle, folderPath1, color_format, ((video_mode == 1) ? RocalDecodeDevice::ROCAL_HW_DECODE : RocalDecodeDevice::ROCAL_SW_DECODE), shard_count, sequence_length, frame_step, frame_stride, shuffle, true, false);
+        input1 = rocalVideoFileSource(handle, folderPath1, color_format, (decoder_mode == 2)? ROCAL_SW_DECODE: ROCAL_HW_DECODE, shard_count, sequence_length, frame_step, frame_stride, shuffle, true, false);
+    } else if (decoder_mode == 1) {
+            std::vector<float> area = {0.08, 1};
+            std::vector<float> aspect_ratio = {3.0f / 4, 4.0f / 3};
+            input1 = rocalFusedJpegCrop(handle, folderPath1, color_format, shard_count, false, area, aspect_ratio, 10, false, false, ROCAL_USE_USER_GIVEN_SIZE_RESTRICTED, decode_width, decode_height);
+
     } else {
         // The jpeg file loader can automatically select the best size to decode all images to that size
         // User can alternatively set the size or change the policy that is used to automatically find the size
@@ -152,7 +157,7 @@ int main(int argc, const char** argv) {
 
     RocalTensor tensor0;
     int resize_w = 112, resize_h = 112;
-    if (video_mode) {
+    if (decoder_mode >= 2) {
         resize_h = decode_height;
         resize_w = decode_width;
         tensor0 = input1;
@@ -214,7 +219,7 @@ int main(int argc, const char** argv) {
     int w = rocalGetOutputWidth(handle);
     int p = ((color_format == RocalImageColor::ROCAL_COLOR_RGB24) ? 3 : 1);
     std::cout << "output width " << w << " output height " << h << " color planes " << p << std::endl;
-    const unsigned number_of_cols = video_mode ? 1 : 10;
+    const unsigned number_of_cols = (decoder_mode >= 2) ? 1 : 10;
     auto cv_color_format = ((color_format == RocalImageColor::ROCAL_COLOR_RGB24) ? CV_8UC3 : CV_8UC1);
     cv::Mat mat_output(h + AMD_ROCm_Black_resize.rows, w * number_of_cols, cv_color_format);
     cv::Mat mat_input(h, w, cv_color_format);
diff --git a/docs/examples/image_processing/decoder.py b/docs/examples/image_processing/decoder.py
index eccce45fd..073fa383c 100644
--- a/docs/examples/image_processing/decoder.py
+++ b/docs/examples/image_processing/decoder.py
@@ -9,7 +9,7 @@
 import cupy as cp
 
 seed = 1549361629
-image_dir = "../../../../data/images/AMD-tinyDataSet/"
+image_dir = "../../../data/images/AMD-tinyDataSet/"
 batch_size = 4
 gpu_id = 0
 
@@ -34,13 +34,13 @@ def show_pipeline_output(pipe, device):
     pipe.build()
     data_loader = ROCALClassificationIterator(pipe, device)
     images = next(iter(data_loader))
-    show_images(images[0], device)
+    show_images(images[0][0], device)
 
 @pipeline_def(seed=seed)
 def image_decoder_pipeline(device="cpu", path=image_dir):
-    jpegs, labels = fn.readers.file(file_root=path, shard_id=0, num_shards=1, random_shuffle=False)
+    jpegs, labels = fn.readers.file(file_root=path)
     images = fn.decoders.image(jpegs, file_root=path, device=device, output_type=types.RGB, shard_id=0, num_shards=1, random_shuffle=False)
-    return fn.resize(images, device=device, resize_x=300, resize_y=300)
+    return fn.resize(images, device=device, resize_width=300, resize_height=300)
 
 def main():
     print ('Optional arguments: <cpu/gpu image_folder>')
@@ -52,9 +52,8 @@ def main():
           rocal_device = "gpu"
     if  len(sys.argv) > 2:
       img_folder = sys.argv[2]
-
-    pipe = image_decoder_pipeline(batch_size=bs, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC,
-                                  reverse_channels=True, mean = [0, 0, 0], std=[255, 255, 255], device=rocal_device, path=img_folder)
+    pipe = image_decoder_pipeline(batch_size=bs, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC, 
+                                reverse_channels=True, mean = [0, 0, 0], std=[255,255,255], device=rocal_device, path=img_folder)
     show_pipeline_output(pipe, device=rocal_device)
 
 if __name__ == '__main__':
diff --git a/docs/examples/image_processing/decoder_examples.ipynb b/docs/examples/image_processing/decoder_examples.ipynb
index 27098f079..cb1bef27e 100644
--- a/docs/examples/image_processing/decoder_examples.ipynb
+++ b/docs/examples/image_processing/decoder_examples.ipynb
@@ -38,7 +38,7 @@
     "%matplotlib inline\n",
     "\n",
     "seed = 1549361629\n",
-    "image_dir = \"../../../../data/images/AMD-tinyDataSet/\"\n",
+    "image_dir = \"../../../data/images/AMD-tinyDataSet/\"\n",
     "batch_size = 4\n",
     "gpu_id = 0\n",
     "\n",
@@ -61,7 +61,7 @@
     "    pipe.build()\n",
     "    data_loader = ROCALClassificationIterator(pipe, device, device_id)\n",
     "    images = next(iter(data_loader))\n",
-    "    show_images(images[0], device)\n"
+    "    show_images(images[0][0], device)\n"
    ]
   },
   {
@@ -82,9 +82,9 @@
    "source": [
     "@pipeline_def(seed=seed)\n",
     "def image_decoder_pipeline(device=\"cpu\"):\n",
-    "    jpegs, labels = fn.readers.file(file_root=image_dir, shard_id=0, num_shards=1, random_shuffle=False)\n",
+    "    jpegs, labels = fn.readers.file(file_root=image_dir)\n",
     "    images = fn.decoders.image(jpegs, file_root=image_dir, device=device, output_type=types.RGB, shard_id=0, num_shards=1, random_shuffle=False)\n",
-    "    return fn.resize(images, device=device, resize_x=300, resize_y=300)\n",
+    "    return fn.resize(images, device=device, resize_width=300, resize_height=300)\n",
     "\n",
     "pipe = image_decoder_pipeline(batch_size=batch_size, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC, \n",
     "                            reverse_channels=True, mean = [0, 0, 0], std=[255,255,255], device=\"cpu\")\n",
@@ -109,12 +109,13 @@
    "source": [
     "@pipeline_def(seed=seed)\n",
     "def image_decoder_random_crop_pipeline(device=\"cpu\"):\n",
-    "    jpegs, labels = fn.readers.file(file_root=image_dir, shard_id=0, num_shards=1, random_shuffle=False)\n",
+    "    jpegs, labels = fn.readers.file(file_root=image_dir)\n",
     "    images = fn.decoders.image_slice(jpegs, file_root=image_dir, \n",
-    "                                     device=device,\n",
     "                                     output_type=types.RGB,\n",
+    "                                     shard_id = 0,\n",
+    "                                     num_shards = 1,\n",
     "                                     random_shuffle=True)\n",
-    "    return fn.resize(images, device=device, resize_x=300, resize_y=300)\n",
+    "    return fn.resize(images, device=device, resize_width=300, resize_height=300)\n",
     "    \n",
     "pipe = image_decoder_random_crop_pipeline(batch_size=batch_size, num_threads=1, device_id=gpu_id, rocal_cpu=True, tensor_layout=types.NHWC, \n",
     "                                          reverse_channels=True, mean=[0,0,0], std = [255,255,255], device=\"cpu\")\n",
@@ -184,7 +185,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.6"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/examples/image_processing/inference_pipeline.py b/docs/examples/image_processing/inference_pipeline.py
index a7db74e16..f97da7b37 100644
--- a/docs/examples/image_processing/inference_pipeline.py
+++ b/docs/examples/image_processing/inference_pipeline.py
@@ -31,7 +31,7 @@
 
 
 seed = 1549361629
-image_dir = "../../../../data/images/AMD-tinyDataSet/"
+image_dir = "../../../data/images/AMD-tinyDataSet/"
 batch_size = 4
 gpu_id = 0
 
diff --git a/rocAL-setup.py b/rocAL-setup.py
index fa6b5de91..1032aef6c 100644
--- a/rocAL-setup.py
+++ b/rocAL-setup.py
@@ -311,11 +311,11 @@
         os.system('sudo '+linuxFlag+' '+linuxSystemInstall+' ' +
                   linuxSystemInstall_check+' install lmdb-devel rapidjson-devel')
 
-    # turbo-JPEG - https://github.com/rrawther/libjpeg-turbo.git -- 2.0.6.2
+    # turbo-JPEG - https://github.com/libjpeg-turbo/libjpeg-turbo.git -- 3.0.1
     os.system(
-        '(cd '+deps_dir+'; git clone -b 2.0.6.2 https://github.com/rrawther/libjpeg-turbo.git )')
+        '(cd '+deps_dir+'; git clone -b 3.0.1 https://github.com/libjpeg-turbo/libjpeg-turbo.git )')
     os.system('(cd '+deps_dir+'/libjpeg-turbo; mkdir build; cd build; '+linuxCMake +
-              ' -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib ..; make -j 4; sudo make install )')
+              ' -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_BUILD_TYPE=RELEASE -DENABLE_STATIC=FALSE -DCMAKE_INSTALL_DEFAULT_LIBDIR=lib -DWITH_JPEG8=TRUE ..; make -j 4; sudo make install )')
     # RPP
     os.system('sudo -v')
     os.system('(cd '+deps_dir+'; git clone -b '+rppVersion+' https://github.com/GPUOpen-ProfessionalCompute-Libraries/rpp.git; cd rpp; mkdir build-'+backend+'; cd build-'+backend+'; ' +
diff --git a/rocAL/CMakeLists.txt b/rocAL/CMakeLists.txt
index 1dc4630e1..c81ed5f99 100644
--- a/rocAL/CMakeLists.txt
+++ b/rocAL/CMakeLists.txt
@@ -42,6 +42,14 @@ find_package(RapidJSON QUIET)
 find_package(StdFilesystem QUIET)
 find_package(HALF QUIET)
 
+if(DEFINED ENV{ROCM_PATH})
+  set(ROCM_PATH $ENV{ROCM_PATH} CACHE PATH "Default ROCm installation path")
+elseif(ROCM_PATH)
+  message("-- INFO:ROCM_PATH Set -- ${ROCM_PATH}")
+else()
+  set(ROCM_PATH /opt/rocm CACHE PATH "Default ROCm installation path")
+endif()
+
 # HIP Backend
 if(GPU_SUPPORT AND "${BACKEND}" STREQUAL "HIP")
     if(NOT DEFINED HIP_PATH)
@@ -225,6 +233,7 @@ if(${BUILD_ROCAL})
                 include/augmentations/geometry_augmentations/
                 include/decoders/image/
                 include/decoders/video/
+                include/decoders/libjpeg/
                 include/device/
                 include/loaders/
                 include/loaders/image/
diff --git a/rocAL/include/api/rocal_api_augmentation.h b/rocAL/include/api/rocal_api_augmentation.h
index d236073fa..7cb74e75e 100644
--- a/rocAL/include/api/rocal_api_augmentation.h
+++ b/rocAL/include/api/rocal_api_augmentation.h
@@ -329,6 +329,20 @@ extern "C" RocalTensor ROCAL_API_CALL rocalFlipFixed(RocalContext context, Rocal
                                                      RocalTensorLayout output_layout = ROCAL_NONE,
                                                      RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Transposes the tensors by reordering the dimensions based on the perm parameter.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] perm Permutation of the dimensions of the input
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalTranspose(RocalContext context, RocalTensor input, std::vector<unsigned> perm, bool is_output,
+                                                RocalTensorLayout output_layout = ROCAL_NONE,
+                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
 /*! \brief Applies blur effect to images.
  * \ingroup group_rocal_augmentations
  * \param [in] context Rocal context
@@ -997,6 +1011,30 @@ extern "C" RocalTensor ROCAL_API_CALL rocalCropMirrorNormalize(RocalContext cont
                                                                RocalTensorLayout output_layout = ROCAL_NONE,
                                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Performs normalization on images.
+ * \ingroup group_rocal_augmentations
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] axes axes list for tensor normalization
+ * \param [in] mean mean value (specified for each channel) for tensor normalization
+ * \param [in] std_dev standard deviation value (specified for each channel) for tensor normalization
+ * \param [in] is_output is the output tensor part of the graph output
+ * \param [in] scale scale value for tensor normalization (single float, defaults to 1.0)
+ * \param [in] shift shift value for tensor normalization (single float, defaults to 0.0)
+ * \param [in] output_layout the layout of the output tensor
+ * \param [in] output_datatype the data type of the output tensor
+ * \note axes, mean and std_dev are taken by non-const reference, so they must be lvalues at the call site
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalNormalize(RocalContext context, RocalTensor input,
+                                                               std::vector<unsigned> &axes,
+                                                               std::vector<float> &mean,
+                                                               std::vector<float> &std_dev,
+                                                               bool is_output,
+                                                               float scale = 1.0, float shift = 0.0,
+                                                               RocalTensorLayout output_layout = ROCAL_NONE,
+                                                               RocalTensorOutputType output_datatype = ROCAL_UINT8);                                                               
+
 /*! \brief Crops images.
  * \ingroup group_rocal_augmentations
  * \param [in] context Rocal context
@@ -1162,7 +1200,17 @@ extern "C" RocalTensor ROCAL_API_CALL rocalSSDRandomCrop(RocalContext context, R
                                                          RocalTensorLayout output_layout = ROCAL_NONE,
                                                          RocalTensorOutputType output_datatype = ROCAL_UINT8);
 
+/*! \brief Casts the input tensor from one data type to another.
+ * \param [in] context Rocal context
+ * \param [in] input Input Rocal tensor
+ * \param [in] is_output Sets if the output is to be given to user or as intermediate buffer
+ * \param [in] output_datatype Datatype of the output tensor
+ * \return RocalTensor
+ */
+extern "C" RocalTensor ROCAL_API_CALL rocalCast(RocalContext context, RocalTensor input,
+                                                bool is_output,
+                                                RocalTensorOutputType output_datatype = ROCAL_UINT8);
+
 extern "C" RocalTensor ROCAL_API_CALL rocalSetLayout(RocalContext context, RocalTensor input,
                                                      RocalTensorLayout output_layout = ROCAL_NONE);
-
 #endif  // MIVISIONX_ROCAL_API_AUGMENTATION_H
diff --git a/rocAL/include/augmentations/augmentations_nodes.h b/rocAL/include/augmentations/augmentations_nodes.h
index ef6beff32..c01fb0691 100644
--- a/rocAL/include/augmentations/augmentations_nodes.h
+++ b/rocAL/include/augmentations/augmentations_nodes.h
@@ -57,3 +57,6 @@ THE SOFTWARE.
 #include "node_sequence_rearrange.h"
 #include "node_gaussian_noise.h"
 #include "node_slice.h"
+#include "node_transpose.h"
+#include "node_normalize.h"
+#include "node_cast.h"
diff --git a/rocAL/include/augmentations/effects_augmentations/node_normalize.h b/rocAL/include/augmentations/effects_augmentations/node_normalize.h
new file mode 100644
index 000000000..6ad49d08f
--- /dev/null
+++ b/rocAL/include/augmentations/effects_augmentations/node_normalize.h
@@ -0,0 +1,46 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "node.h"
+#include "parameter_vx.h"
+
+class NormalizeNode : public Node {  // Node subclass declared for the normalize augmentation
+   public:
+    NormalizeNode(const std::vector<Tensor *> &inputs,
+                            const std::vector<Tensor *> &outputs);
+    NormalizeNode() = delete;  // no default construction: input and output tensors must be supplied
+    void init(std::vector<unsigned> &axes, std::vector<float> &mean, std::vector<float> &std_dev, float scale, float shift);  // vectors are taken by non-const reference
+
+   protected:
+    void create_node() override;
+    void update_node() override {};  // empty override: this class does nothing in update_node()
+
+   private:
+    int _axis_mask = 0;
+    uint _compute_mean, _compute_stddev;  // NOTE(review): `uint` is not a standard C++ type, and these have no initializer
+    vx_array _mean_vx_array, _stddev_vx_array;  // NOTE(review): no initializer here; presumably created in create_node() - confirm
+    std::vector<unsigned> _axes;  // presumably holds the axes passed to init() - confirm in the .cpp
+    std::vector<float> _mean, _std_dev;  // presumably hold the mean/std_dev passed to init() - confirm in the .cpp
+    float _scale, _shift;  // NOTE(review): no default member initializer
+    std::vector<std::vector<uint32_t>> _normalize_roi;
+};
\ No newline at end of file
diff --git a/rocAL/include/augmentations/geometry_augmentations/node_transpose.h b/rocAL/include/augmentations/geometry_augmentations/node_transpose.h
new file mode 100644
index 000000000..d8b6e94c1
--- /dev/null
+++ b/rocAL/include/augmentations/geometry_augmentations/node_transpose.h
@@ -0,0 +1,40 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "node.h"
+#include "parameter_vx.h"
+
+class TransposeNode : public Node {  // Node subclass declared for the transpose augmentation
+   public:
+    TransposeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
+    TransposeNode() = delete;  // no default construction: input and output tensors must be supplied
+    void init(std::vector<unsigned> perm);  // perm is taken by value
+
+   protected:
+    void create_node() override;
+    void update_node() override {};  // empty override: this class does nothing in update_node()
+
+   private:
+    std::vector<unsigned> _perm;  // presumably holds the permutation passed to init() - confirm in the .cpp
+    vx_array _perm_array;  // NOTE(review): no initializer here; presumably created in create_node() - confirm
+};
diff --git a/rocAL/include/augmentations/node_cast.h b/rocAL/include/augmentations/node_cast.h
new file mode 100644
index 000000000..67930261b
--- /dev/null
+++ b/rocAL/include/augmentations/node_cast.h
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#pragma once
+#include "node.h"
+#include "graph.h"
+
+class CastNode : public Node  // Node subclass declared for the cast operation; it declares no data members of its own
+{
+public:
+    CastNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs);
+    CastNode() = delete;  // no default construction: input and output tensors must be supplied
+
+protected:
+    void create_node() override;
+    void update_node() override {};  // empty override: this class does nothing in update_node()
+};
diff --git a/rocAL/include/decoders/image/fused_crop_decoder.h b/rocAL/include/decoders/image/fused_crop_decoder.h
index 718919b90..ae59f6bf1 100644
--- a/rocAL/include/decoders/image/fused_crop_decoder.h
+++ b/rocAL/include/decoders/image/fused_crop_decoder.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/include/decoders/image/turbo_jpeg_decoder.h b/rocAL/include/decoders/image/turbo_jpeg_decoder.h
index ce4dba600..99e67abac 100644
--- a/rocAL/include/decoders/image/turbo_jpeg_decoder.h
+++ b/rocAL/include/decoders/image/turbo_jpeg_decoder.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -64,24 +64,8 @@ class TJDecoder : public Decoder {
 
    private:
     tjhandle m_jpegDecompressor;
-    const static unsigned SCALING_FACTORS_COUNT = 16;
-    const tjscalingfactor SCALING_FACTORS[SCALING_FACTORS_COUNT] = {
-        {2, 1},
-        {15, 8},
-        {7, 4},
-        {13, 8},
-        {3, 2},
-        {11, 8},
-        {5, 4},
-        {9, 8},
-        {1, 1},
-        {7, 8},
-        {3, 4},
-        {5, 8},
-        {1, 2},
-        {3, 8},
-        {1, 4},
-        {1, 8}};
+    tjscalingfactor *_scaling_factors = nullptr;
+    int _num_scaling_factors = 0;
     bool _is_partial_decoder = false;
     std::vector<float> _bbox_coord;
     const static unsigned _max_scaling_factor = 8;
diff --git a/rocAL/include/decoders/libjpeg/libjpeg_extra.h b/rocAL/include/decoders/libjpeg/libjpeg_extra.h
new file mode 100644
index 000000000..69db1028a
--- /dev/null
+++ b/rocAL/include/decoders/libjpeg/libjpeg_extra.h
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+#include <turbojpeg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include "libjpeg_utils.h"
+
+extern "C" {
+
+//! extra apis for rocal to support partial decoding
+
+//! * Helper function to set the source
+//! * This function doesn't scale the decoded image
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function doesn't scale the decoded image
+
+/*!
+  \param handle  TJPeg handle
+  \param jpegBuf compressed jpeg image buffer
+  \param jpegSize Size of the compressed data provided in the input_buffer
+  \param dstBuf user provided output buffer
+  \param width, pitch, height  width, stride and height of the allocated buffer
+  \param flags  TJPEG flags
+  \param pixelFormat  pixel format of the image
+  \param crop_x_diff,  crop_width_diff Actual crop_x and crop_w (adjusted to MB boundary)
+  \param x1, y1, crop_width, crop_height requested crop window
+*/
+
+int tjDecompress2_partial(tjhandle handle, const unsigned char *jpegBuf,
+                                    unsigned long jpegSize, unsigned char *dstBuf,
+                                    int width, int pitch, int height, int pixelFormat,
+                                    int flags, unsigned int *crop_x_diff, unsigned int *crop_width_diff,
+                                    unsigned int x1, unsigned int y1, unsigned int crop_width, unsigned int crop_height);
+
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function scales the decoded image to fit the output dims
+/*!
+  \param handle  TJPeg handle
+  \param jpegBuf compressed jpeg image buffer
+  \param jpegSize Size of the compressed data provided in the input_buffer
+  \param dstBuf user provided output buffer
+  \param width, pitch, height  width, stride and height of the allocated buffer
+  \param flags  TJPEG flags
+  \param crop_width, crop_height requested crop window
+*/
+
+int tjDecompress2_partial_scale(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags, unsigned int crop_width, unsigned int crop_height);
+}
\ No newline at end of file
diff --git a/rocAL/include/decoders/libjpeg/libjpeg_utils.h b/rocAL/include/decoders/libjpeg/libjpeg_utils.h
new file mode 100644
index 000000000..1c588ee0b
--- /dev/null
+++ b/rocAL/include/decoders/libjpeg/libjpeg_utils.h
@@ -0,0 +1,30 @@
+/*
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#pragma once
+
+//! turbojpeg includes
+
+extern "C" {
+#include "jerror.h"  
+#include "jpeglib.h" 
+#include "jpegint.h"
+}
diff --git a/rocAL/include/loaders/image/image_read_and_decode.h b/rocAL/include/loaders/image/image_read_and_decode.h
index 471164b54..6682d85f6 100644
--- a/rocAL/include/loaders/image/image_read_and_decode.h
+++ b/rocAL/include/loaders/image/image_read_and_decode.h
@@ -33,14 +33,6 @@ THE SOFTWARE.
 #include "timing_debug.h"
 #include "turbo_jpeg_decoder.h"
 
-/**
- * Compute the scaled value of <tt>dimension</tt> using the given scaling
- * factor.  This macro performs the integer equivalent of <tt>ceil(dimension *
- * scalingFactor)</tt>.
- */
-#define TJSCALED(dimension, scalingFactor)                       \
-    ((dimension * scalingFactor.num + scalingFactor.denom - 1) / \
-     scalingFactor.denom)
 
 class ImageReadAndDecode {
    public:
diff --git a/rocAL/include/loaders/image/node_numpy_loader.h b/rocAL/include/loaders/image/node_numpy_loader.h
index 91fdd278e..49918e4f5 100644
--- a/rocAL/include/loaders/image/node_numpy_loader.h
+++ b/rocAL/include/loaders/image/node_numpy_loader.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -50,4 +50,4 @@ class NumpyLoaderNode : public Node {
 
    private:
     std::shared_ptr<NumpyLoaderSharded> _loader_module = nullptr;
-};
\ No newline at end of file
+};
diff --git a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
index 4dc19699e..cd3b464e7 100644
--- a/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
+++ b/rocAL/include/loaders/image/node_numpy_loader_single_shard.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/include/loaders/image/numpy_loader.h b/rocAL/include/loaders/image/numpy_loader.h
index 2c3285561..0ff053da2 100644
--- a/rocAL/include/loaders/image/numpy_loader.h
+++ b/rocAL/include/loaders/image/numpy_loader.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,7 +29,6 @@ THE SOFTWARE.
 #include "circular_buffer.h"
 #include "commons.h"
 #include "image_read_and_decode.h"
-// #include "numpy_data_reader.h"
 //
 // NumpyLoader runs an internal thread for loading an decoding of numpy arrays asynchronously
 // it uses a circular buffer to store decoded numpy arrays for the user
@@ -54,7 +53,7 @@ class NumpyLoader : public LoaderModule {
     void set_prefetch_queue_depth(size_t prefetch_queue_depth) override;
     void shut_down() override;
     void feed_external_input(const std::vector<std::string>& input_images_names, const std::vector<unsigned char*>& input_buffer,
-                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, int channels, ExternalSourceFileMode mode, bool eos) override {}
+                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, unsigned int channels, ExternalSourceFileMode mode, bool eos) override {}
 
    private:
     bool is_out_of_data();
diff --git a/rocAL/include/loaders/image/numpy_loader_sharded.h b/rocAL/include/loaders/image/numpy_loader_sharded.h
index ada22c06b..744cfc716 100644
--- a/rocAL/include/loaders/image/numpy_loader_sharded.h
+++ b/rocAL/include/loaders/image/numpy_loader_sharded.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,6 @@ THE SOFTWARE.
 */
 
 #pragma once
-#include <vector>
 
 #include "numpy_loader.h"
 //
@@ -46,7 +45,7 @@ class NumpyLoaderSharded : public LoaderModule {
     void set_prefetch_queue_depth(size_t prefetch_queue_depth) override;
     void shut_down() override;
     void feed_external_input(const std::vector<std::string>& input_images_names, const std::vector<unsigned char*>& input_buffer,
-                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, int channels, ExternalSourceFileMode mode, bool eos) override {}
+                             const std::vector<ROIxywh>& roi_xywh, unsigned int max_width, unsigned int max_height, unsigned int channels, ExternalSourceFileMode mode, bool eos) override {}
 
    private:
     void increment_loader_idx();
diff --git a/rocAL/include/parameters/parameter.h b/rocAL/include/parameters/parameter.h
index 8ce731a5a..1bec7b334 100644
--- a/rocAL/include/parameters/parameter.h
+++ b/rocAL/include/parameters/parameter.h
@@ -33,9 +33,9 @@ class Parameter {
     /// used to internally renew state of the parameter if needed (for random parameters)
     virtual void renew(){};
 
-    virtual void create_array(unsigned batch_size) {};
+    virtual void create_array(unsigned batch_size){};
 
-    virtual std::vector<T> get_array() { return{};};
+    virtual std::vector<T> get_array() { return {}; };
 
     virtual ~Parameter() {}
     ///
diff --git a/rocAL/include/parameters/parameter_random.h b/rocAL/include/parameters/parameter_random.h
index 07500636b..54414ae07 100644
--- a/rocAL/include/parameters/parameter_random.h
+++ b/rocAL/include/parameters/parameter_random.h
@@ -56,8 +56,6 @@ class UniformRand : public Parameter<T> {
         return _array;
     }
 
-
-
     void renew_value() {
         std::unique_lock<std::mutex> lock(_lock);
         auto val = _generator();
@@ -70,7 +68,6 @@ class UniformRand : public Parameter<T> {
             _updated_val = static_cast<T>(
                 ((double)val / (double)_generator.max()) * ((double)_end - (double)_start) + (double)_start);
         }
-
     }
 
     void renew_array() {
@@ -81,10 +78,9 @@ class UniformRand : public Parameter<T> {
     }
 
     void renew() override {
-        if (_array.size()>0) {
+        if (_array.size() > 0) {
             renew_array();
-        }
-        else {
+        } else {
             renew_value();
         }
     }
@@ -206,8 +202,7 @@ struct CustomRand : public Parameter<T> {
     void renew() override {
         if (_array.size() > 0) {
             renew_array();
-        }
-        else {
+        } else {
             renew_value();
         }
     }
diff --git a/rocAL/include/parameters/parameter_vx.h b/rocAL/include/parameters/parameter_vx.h
index 5fa59116f..e71cd48ee 100644
--- a/rocAL/include/parameters/parameter_vx.h
+++ b/rocAL/include/parameters/parameter_vx.h
@@ -55,7 +55,7 @@ class ParameterVX {
         _batch_size = batch_size;
         _param->create_array(_batch_size);
         _array = vxCreateArray(vxGetContext((vx_reference)graph->get()), data_type, _batch_size);
-        auto status  = vxAddArrayItems(_array, _batch_size, get_array().data(), sizeof(T));
+        auto status = vxAddArrayItems(_array, _batch_size, get_array().data(), sizeof(T));
         if (status != 0)
             THROW(" vxAddArrayItems failed in create_array (ParameterVX): " + TOSTR(status))
         update_array();
diff --git a/rocAL/include/pipeline/tensor.h b/rocAL/include/pipeline/tensor.h
index 0ccd15770..9c300702b 100644
--- a/rocAL/include/pipeline/tensor.h
+++ b/rocAL/include/pipeline/tensor.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -207,8 +207,11 @@ class TensorInfo {
             modify_strides();
         }
         _layout = layout;
-        if (_layout == RocalTensorlayout::NONE)
-            set_max_shape();
+        if (_layout == RocalTensorlayout::NHWC || _layout == RocalTensorlayout::NDHWC) {
+            _channels = _dims.back();
+        } else if (_layout == RocalTensorlayout::NCHW || _layout == RocalTensorlayout::NCDHW) {
+            _channels = _dims.at(1);
+        }
     }
     void set_dims(std::vector<size_t>& new_dims) {
         if (_num_of_dims == new_dims.size()) {
diff --git a/rocAL/include/readers/image/numpy_data_reader.h b/rocAL/include/readers/image/numpy_data_reader.h
index 201eb4fa0..48115c165 100644
--- a/rocAL/include/readers/image/numpy_data_reader.h
+++ b/rocAL/include/readers/image/numpy_data_reader.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -24,6 +24,7 @@ THE SOFTWARE.
 #include <dirent.h>
 
 #include <memory>
+#include <mutex>
 #include <string>
 #include <vector>
 #include <mutex>
@@ -83,6 +84,7 @@ class NumpyDataReader : public Reader {
     unsigned _curr_file_idx;
     FILE* _current_fPtr;
     unsigned _current_file_size;
+    NumpyHeaderData _curr_file_header;
     std::string _last_id;
     std::string _last_file_name;
     size_t _shard_id = 0;
@@ -101,23 +103,23 @@ class NumpyDataReader : public Reader {
     size_t _file_count_all_shards;
     std::mutex _cache_mutex_;
     std::map<std::string, NumpyHeaderData> _header_cache_;
-    const RocalTensorDataType TypeFromNumpyStr(const std::string& format);
-    inline void SkipSpaces(const char*& ptr);
-    void ParseHeaderContents(NumpyHeaderData& target, const std::string& header);
+    const RocalTensorDataType get_numpy_dtype(const std::string& format);
+    inline void ignore_spaces(const char*& ptr);
+    void decode_header(NumpyHeaderData& target, const std::string& header);
     template <size_t N>
-    void Skip(const char*& ptr, const char (&what)[N]);
+    void skip_string(const char*& ptr, const char (&what)[N]);
     template <size_t N>
-    bool TrySkip(const char*& ptr, const char (&what)[N]);
+    bool check_and_skip_string(const char*& ptr, const char (&what)[N]);
     template <size_t N>
-    void SkipFieldName(const char*& ptr, const char (&name)[N]);
+    void skip_field(const char*& ptr, const char (&name)[N]);
     template <typename T = int64_t>
-    T ParseInteger(const char*& ptr);
-    std::string ParseStringValue(const char*& input, char delim_start = '\'', char delim_end = '\'');
-    void ParseHeader(NumpyHeaderData& parsed_header, std::string file_path);
+    T parse_int(const char*& ptr);
+    std::string read_dtype_string(const char*& input, char delim_start = '\'', char delim_end = '\'');
+    void read_header(NumpyHeaderData& parsed_header, std::string file_path);
     template <typename T>
-    size_t ParseNumpyData(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim = 0);
-    bool GetFromCache(const std::string& file_name, NumpyHeaderData& target);
-    void UpdateCache(const std::string& file_name, const NumpyHeaderData& value);   
+    size_t copy_array_data(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim = 0);
+    bool get_cached_header(const std::string& file_name, NumpyHeaderData& target);
+    void update_header_cache(const std::string& file_name, const NumpyHeaderData& value);
     void incremenet_read_ptr();
     int release();
     size_t get_file_shard_id();
diff --git a/rocAL/source/api/rocal_api_augmentation.cpp b/rocAL/source/api/rocal_api_augmentation.cpp
index c740eadc5..ea1c3344c 100644
--- a/rocAL/source/api/rocal_api_augmentation.cpp
+++ b/rocAL/source/api/rocal_api_augmentation.cpp
@@ -1262,6 +1262,37 @@ rocalSlice(
     return output;
 }
 
+// Adds a TransposeNode for p_input and forwards perm to it.
+// Returns nullptr if the context or input handle is null; exceptions are logged and recorded on the context.
+RocalTensor ROCAL_API_CALL
+rocalTranspose(
+    RocalContext p_context,
+    RocalTensor p_input,
+    std::vector<unsigned> perm,
+    bool is_output,
+    RocalTensorLayout output_layout,
+    RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        // The output reuses the input's tensor info; only layout and data type are overridden here.
+        TensorInfo output_info = input->info();
+        output_info.set_tensor_layout(static_cast<RocalTensorlayout>(output_layout));
+        output_info.set_data_type(static_cast<RocalTensorDataType>(output_datatype));
+        output = context->master_graph->create_tensor(output_info, is_output);
+        context->master_graph->add_node<TransposeNode>({input}, {output})->init(perm);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalFlip(
     RocalContext p_context,
@@ -1887,6 +1918,35 @@ rocalColorTwistFixed(
     return output;
 }
 
+// Adds a NormalizeNode on p_input and returns its output tensor (nullptr when a handle is null).
+RocalTensor ROCAL_API_CALL
+rocalNormalize(RocalContext p_context, RocalTensor p_input, std::vector<unsigned>& axes,
+               std::vector<float>& mean, std::vector<float>& std_dev, bool is_output,
+               float scale, float shift,
+               RocalTensorLayout output_layout,
+               RocalTensorOutputType output_datatype) {
+    Tensor* result = nullptr;
+    if (!p_context || !p_input) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return result;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        // Output info is cloned from the input, then layout and data type are replaced.
+        TensorInfo out_info = input->info();
+        out_info.set_tensor_layout(static_cast<RocalTensorlayout>(output_layout));
+        out_info.set_data_type(static_cast<RocalTensorDataType>(output_datatype));
+        result = context->master_graph->create_tensor(out_info, is_output);
+        auto node = context->master_graph->add_node<NormalizeNode>({input}, {result});
+        node->init(axes, mean, std_dev, scale, shift);
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return result;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalCropMirrorNormalize(RocalContext p_context, RocalTensor p_input, unsigned crop_height,
                          unsigned crop_width, float start_x, float start_y, std::vector<float>& mean,
@@ -2266,6 +2326,35 @@ rocalNop(
     return output;
 }
 
+// Adds a CastNode producing output_datatype, or a CopyNode when the input already has that type; nullptr on null handles.
+RocalTensor ROCAL_API_CALL rocalCast(RocalContext p_context, RocalTensor p_input,
+                                     bool is_output, RocalTensorOutputType output_datatype) {
+    Tensor* output = nullptr;
+    if ((p_context == nullptr) || (p_input == nullptr)) {
+        ERR("Invalid ROCAL context or invalid input tensor")
+        return output;
+    }
+    auto context = static_cast<Context*>(p_context);
+    auto input = static_cast<Tensor*>(p_input);
+    try {
+        const auto target_type = static_cast<RocalTensorDataType>(output_datatype);
+        TensorInfo output_info = input->info();
+        const bool needs_cast = (output_info.data_type() != target_type);
+        if (needs_cast)
+            output_info.set_data_type(target_type);
+        output = context->master_graph->create_tensor(output_info, is_output);
+        if (needs_cast) {
+            context->master_graph->add_node<CastNode>({input}, {output});
+        } else {
+            context->master_graph->add_node<CopyNode>({input}, {output});
+        }
+    } catch (const std::exception& e) {
+        context->capture_error(e.what());
+        ERR(e.what())
+    }
+    return output;
+}
+
 RocalTensor ROCAL_API_CALL
 rocalSetLayout(
     RocalContext p_context,
diff --git a/rocAL/source/augmentations/effects_augmentations/node_normalize.cpp b/rocAL/source/augmentations/effects_augmentations/node_normalize.cpp
new file mode 100644
index 000000000..16bb59798
--- /dev/null
+++ b/rocAL/source/augmentations/effects_augmentations/node_normalize.cpp
@@ -0,0 +1,90 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "node_normalize.h"
+
+#include <graph.h>
+#include <vx_ext_rpp.h>
+
+#include "exception.h"
+
+NormalizeNode::NormalizeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs) {}
+
+// Creates the vxExtRppNormalize node. The compute_mean / compute_stddev flags handed to the kernel are
+// set for whichever of _mean / _std_dev was left empty in init().
+void NormalizeNode::create_node() {
+    if (_node)
+        return;
+
+    _compute_mean = _mean.empty() ? 1 : 0;
+    _compute_stddev = _std_dev.empty() ? 1 : 0;
+
+    // Replicate the user-provided values once per sample. Each vector is built from its own source so an
+    // empty _mean/_std_dev is never indexed and a mean-only or stddev-only setup keeps all its values.
+    std::vector<float> mean_vec, stddev_vec;
+    mean_vec.reserve(_batch_size * _mean.size());
+    stddev_vec.reserve(_batch_size * _std_dev.size());
+    for (uint i = 0; i < _batch_size; i++) {
+        mean_vec.insert(mean_vec.end(), _mean.begin(), _mean.end());
+        stddev_vec.insert(stddev_vec.end(), _std_dev.begin(), _std_dev.end());
+    }
+
+    vx_status status = VX_SUCCESS;
+    if (!_compute_mean) {
+        _mean_vx_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, mean_vec.size());
+        status |= vxAddArrayItems(_mean_vx_array, mean_vec.size(), mean_vec.data(), sizeof(vx_float32));
+        if (status != 0)
+            THROW(" vxAddArrayItems failed for the mean array in the normalize (vxExtRppNormalize) node: " + TOSTR(status))
+    }
+
+    if (!_compute_stddev) {
+        _stddev_vx_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, stddev_vec.size());
+        status |= vxAddArrayItems(_stddev_vx_array, stddev_vec.size(), stddev_vec.data(), sizeof(vx_float32));
+        if (status != 0)
+            THROW(" vxAddArrayItems failed for the stddev array in the normalize (vxExtRppNormalize) node: " + TOSTR(status))
+    }
+    vx_scalar axis_mask = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &_axis_mask);
+    vx_scalar scale = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, &_scale);
+    vx_scalar shift = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_FLOAT32, &_shift);
+    vx_scalar compute_mean = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, &_compute_mean);
+    vx_scalar compute_stddev = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, &_compute_stddev);
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int output_layout = static_cast<int>(_outputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar output_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &output_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+
+    _node = vxExtRppNormalize(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), axis_mask,
+                              _mean_vx_array, _stddev_vx_array, compute_mean, compute_stddev, scale, shift, input_layout_vx, output_layout_vx, roi_type_vx);
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the normalize (vxExtRppNormalize) node failed: " + TOSTR(status))
+}
+
+void NormalizeNode::init(std::vector<unsigned> &axes, std::vector<float> &mean, std::vector<float> &std_dev, float scale, float shift) {
+    _mean = mean;
+    _std_dev = std_dev;
+    _scale = scale;
+    _shift = shift;
+    for (const unsigned axis : axes)  // one bit of _axis_mask per listed axis
+        _axis_mask |= (1 << axis);
+}
\ No newline at end of file
diff --git a/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp b/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp
index 6574ea1bf..6ca4cbd2c 100644
--- a/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp
+++ b/rocAL/source/augmentations/geometry_augmentations/node_crop.cpp
@@ -102,8 +102,8 @@ void CropNode::create_crop_tensor() {
     vx_size num_of_dims = 2;
     vx_size stride[num_of_dims];
     std::vector<size_t> _crop_tensor_dims = {_batch_size, 4};
-    if (_inputs[0]->info().layout() == RocalTensorlayout::NFCHW || _inputs[0]->info().layout() == RocalTensorlayout::NFHWC)
-        _crop_tensor_dims = {_inputs[0]->info().dims()[0] * _inputs[0]->info().dims()[1], 4};  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
+    if(_inputs[0]->info().layout() == RocalTensorlayout::NFCHW || _inputs[0]->info().layout() == RocalTensorlayout::NFHWC)
+        _crop_tensor_dims = {_inputs[0]->info().dims()[0] * _inputs[0]->info().dims()[1], 4}; // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
     stride[0] = sizeof(vx_uint32);
     stride[1] = stride[0] * _crop_tensor_dims[0];
     vx_enum mem_type = VX_MEMORY_TYPE_HOST;
@@ -111,8 +111,8 @@ void CropNode::create_crop_tensor() {
         mem_type = VX_MEMORY_TYPE_HIP;
     allocate_host_or_pinned_mem(&_crop_coordinates, stride[1] * 4, _inputs[0]->info().mem_type());
 
-    _crop_tensor = vxCreateTensorFromHandle(vxGetContext((vx_reference)_graph->get()), num_of_dims, _crop_tensor_dims.data(), VX_TYPE_UINT32, 0,
-                                            stride, reinterpret_cast<void *>(_crop_coordinates), mem_type);
+    _crop_tensor = vxCreateTensorFromHandle(vxGetContext((vx_reference) _graph->get()), num_of_dims, _crop_tensor_dims.data(), VX_TYPE_UINT32, 0, 
+                                                                  stride, reinterpret_cast<void *>(_crop_coordinates), mem_type);
     vx_status status;
     if ((status = vxGetStatus((vx_reference)_crop_tensor)) != VX_SUCCESS)
         THROW("Error: vxCreateTensorFromHandle(_crop_tensor: failed " + TOSTR(status))
diff --git a/rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp b/rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp
new file mode 100644
index 000000000..9e4376e4b
--- /dev/null
+++ b/rocAL/source/augmentations/geometry_augmentations/node_transpose.cpp
@@ -0,0 +1,51 @@
+/*
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <vx_ext_rpp.h>
+#include "node_transpose.h"
+#include "exception.h"
+
+TransposeNode::TransposeNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) : Node(inputs, outputs) {}
+
+// Creates the vxExtRppTranspose node; _perm is handed to the kernel as a VX_TYPE_UINT32 array.
+void TransposeNode::create_node() {
+    if (_node)
+        return;
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int output_layout = static_cast<int>(_outputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar output_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &output_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+    _perm_array = vxCreateArray(vxGetContext((vx_reference)_graph->get()), VX_TYPE_UINT32, _perm.size());
+    vx_status status = vxAddArrayItems(_perm_array, _perm.size(), _perm.data(), sizeof(vx_uint32));
+    if (status != VX_SUCCESS)  // fail early: status is reassigned by vxGetStatus below
+        THROW("vxAddArrayItems failed in the transpose (vxExtRppTranspose) node: " + TOSTR(status))
+    _node = vxExtRppTranspose(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(),
+                         _perm_array, input_layout_vx, output_layout_vx, roi_type_vx);
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the transpose (vxExtRppTranspose) node failed: " + TOSTR(status))
+}
+
+void TransposeNode::init(std::vector<unsigned> perm) {
+    _perm = perm;
+}
\ No newline at end of file
diff --git a/rocAL/source/augmentations/node_cast.cpp b/rocAL/source/augmentations/node_cast.cpp
new file mode 100644
index 000000000..cff54c5c2
--- /dev/null
+++ b/rocAL/source/augmentations/node_cast.cpp
@@ -0,0 +1,44 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include <vx_ext_rpp.h>
+#include "node_cast.h"
+#include "exception.h"
+
+CastNode::CastNode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) :
+        Node(inputs, outputs) {}
+
+// Creates the vxExtRppCast node for the first input/output tensor pair.
+void CastNode::create_node() {
+    if (_node)
+        return;
+
+    int input_layout = static_cast<int>(_inputs[0]->info().layout());
+    int roi_type = static_cast<int>(_inputs[0]->info().roi_type());
+    vx_scalar input_layout_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &input_layout);
+    vx_scalar roi_type_vx = vxCreateScalar(vxGetContext((vx_reference)_graph->get()), VX_TYPE_INT32, &roi_type);
+    _node = vxExtRppCast(_graph->get(), _inputs[0]->handle(), _inputs[0]->get_roi_tensor(), _outputs[0]->handle(), input_layout_vx, roi_type_vx);
+
+    vx_status status;
+    if ((status = vxGetStatus((vx_reference)_node)) != VX_SUCCESS)
+        THROW("Adding the cast (vxExtRppCast) node failed: " + TOSTR(status))
+}
diff --git a/rocAL/source/decoders/image/fused_crop_decoder.cpp b/rocAL/source/decoders/image/fused_crop_decoder.cpp
index 2522bca4e..ee14c0f11 100644
--- a/rocAL/source/decoders/image/fused_crop_decoder.cpp
+++ b/rocAL/source/decoders/image/fused_crop_decoder.cpp
@@ -20,11 +20,13 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#include "fused_crop_decoder.h"
 
 #include <commons.h>
 #include <stdio.h>
 #include <string.h>
+#include "fused_crop_decoder.h"
+#include "libjpeg_extra.h"
+
 
 FusedCropTJDecoder::FusedCropTJDecoder() {
     m_jpegDecompressor = tjInitDecompress();
diff --git a/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp b/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp
index 772fc8535..b285e891d 100644
--- a/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp
+++ b/rocAL/source/decoders/image/turbo_jpeg_decoder.cpp
@@ -20,24 +20,21 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#include "turbo_jpeg_decoder.h"
 
-#include <commons.h>
 #include <stdio.h>
+#include <commons.h>
+#include "turbo_jpeg_decoder.h"
+#include "libjpeg_extra.h"
 
 TJDecoder::TJDecoder() {
     m_jpegDecompressor = tjInitDecompress();
-
-#if 0
-    int num_avail_scalings = 0;
-    auto scaling_factors = tjGetScalingFactors	(&num_avail_scalings);	
-    for(int i = 0; i < num_avail_scalings; i++) {
-        if(scaling_factors[i].num < scaling_factors[i].denom) {
-
-            printf("%d / %d  - ",scaling_factors[i].num, scaling_factors[i].denom );
+    if ((_scaling_factors = tj3GetScalingFactors(&_num_scaling_factors)) == NULL)
+        THROW("tjDecompress2_partial_scale(): error getting scaling factors");
+    for(int i = 0; i < _num_scaling_factors; i++) {
+        if(_scaling_factors[i].num < _scaling_factors[i].denom) {
+            INFO(STR(_scaling_factors[i].num) + "/" + STR(_scaling_factors[i].denom));
         }
     }
-#endif
 };
 
 Decoder::Status TJDecoder::decode_info(unsigned char* input_buffer, size_t input_size, int* width, int* height, int* color_comps) {
@@ -90,7 +87,7 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
                 crop_width = _max_scaling_factor * max_decoded_width;
                 if (crop_width > original_image_width) crop_width = original_image_width;
                 crop_height = crop_width * (1.0 / in_ratio);
-                if (crop_height > _max_scaling_factor * max_decoded_width) crop_height = _max_scaling_factor * max_decoded_width;
+                if (crop_height > _max_scaling_factor * max_decoded_height) crop_height = _max_scaling_factor * max_decoded_height;
             } else if (original_image_height > (_max_scaling_factor * max_decoded_height)) {
                 crop_height = _max_scaling_factor * max_decoded_height;
                 if (crop_height > original_image_height) crop_height = original_image_height;
@@ -114,9 +111,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             }
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             uint scaledw = max_decoded_width, scaledh = max_decoded_height;
-            for (auto scaling_factor : SCALING_FACTORS) {
-                scaledw = TJSCALED(crop_width, scaling_factor);
-                scaledh = TJSCALED(crop_height, scaling_factor);
+            for (int j=0; j < _num_scaling_factors; j++) {
+                scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                 if (scaledw <= max_decoded_width && scaledh <= max_decoded_height) {
                     break;
                 }
@@ -142,9 +139,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             }
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             uint scaledw = max_decoded_width, scaledh = max_decoded_height;
-            for (auto scaling_factor : SCALING_FACTORS) {
-                scaledw = TJSCALED(original_image_width, scaling_factor);
-                scaledh = TJSCALED(original_image_height, scaling_factor);
+            for (int j=0; j < _num_scaling_factors; j++) {
+                scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                 if (scaledw <= max_decoded_width && scaledh <= max_decoded_height)
                     break;
             }
@@ -168,7 +165,7 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
                 crop_width = _max_scaling_factor * max_decoded_width;
                 if (crop_width > original_image_width) crop_width = original_image_width;
                 crop_height = crop_width * (1.0 / in_ratio);
-                if (crop_height > _max_scaling_factor * max_decoded_width) crop_height = _max_scaling_factor * max_decoded_width;
+                if (crop_height > _max_scaling_factor * max_decoded_height) crop_height = _max_scaling_factor * max_decoded_height;
             } else if (original_image_height > (_max_scaling_factor * max_decoded_height)) {
                 crop_height = _max_scaling_factor * max_decoded_height;
                 if (crop_height > original_image_height) crop_height = original_image_height;
@@ -192,9 +189,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             }
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             uint scaledw = max_decoded_width, scaledh = max_decoded_height;
-            for (auto scaling_factor : SCALING_FACTORS) {
-                scaledw = TJSCALED(crop_width, scaling_factor);
-                scaledh = TJSCALED(crop_height, scaling_factor);
+            for (int j=0; j < _num_scaling_factors; j++) {
+                scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                 if (scaledw <= max_decoded_width && scaledh <= max_decoded_height) {
                     break;
                 }
@@ -219,9 +216,9 @@ Decoder::Status TJDecoder::decode(unsigned char* input_buffer, size_t input_size
             // Find the decoded image size using the predefined scaling factors in the turbo jpeg decoder
             if ((actual_decoded_width != original_image_width) || (actual_decoded_height != original_image_height)) {
                 uint scaledw = actual_decoded_width, scaledh = actual_decoded_height;
-                for (auto scaling_factor : SCALING_FACTORS) {
-                    scaledw = TJSCALED(original_image_width, scaling_factor);
-                    scaledh = TJSCALED(original_image_height, scaling_factor);
+                for (int j=0; j < _num_scaling_factors; j++) {
+                    scaledw = TJSCALED(original_image_width, _scaling_factors[j]);
+                    scaledh = TJSCALED(original_image_height, _scaling_factors[j]);
                     if (scaledw <= max_decoded_width && scaledh <= max_decoded_height)
                         break;
                 }
diff --git a/rocAL/source/decoders/libjpeg/libjpeg_extra.cpp b/rocAL/source/decoders/libjpeg/libjpeg_extra.cpp
new file mode 100644
index 000000000..ca86f644a
--- /dev/null
+++ b/rocAL/source/decoders/libjpeg/libjpeg_extra.cpp
@@ -0,0 +1,266 @@
+/*
+Copyright (c) 2019 - 2024 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include "libjpeg_extra.h"
+#include <setjmp.h>
+#include <string.h>
+#include "commons.h"
+
+enum { COMPRESS = 1, DECOMPRESS = 2 };  /* neither enumerator is referenced anywhere else in this file */
+static J_COLOR_SPACE pf2cs[TJ_NUMPF] = {  /* libjpeg colorspace per pixelFormat index; callers bound-check against TJ_NUMPF first */
+  JCS_EXT_RGB, JCS_EXT_BGR, JCS_EXT_RGBX, JCS_EXT_BGRX, JCS_EXT_XBGR,
+  JCS_EXT_XRGB, JCS_GRAYSCALE, JCS_EXT_RGBA, JCS_EXT_BGRA, JCS_EXT_ABGR,
+  JCS_EXT_ARGB, JCS_CMYK
+};
+
+struct my_error_mgr {
+  struct jpeg_error_mgr pub;  /* must stay the first member: my_error_exit() casts the cinfo->err pointer to my_error_mgr* */
+  jmp_buf setjmp_buffer;  /* jump target that my_error_exit() longjmp()s to */
+  void (*emit_message) (j_common_ptr, int);  /* never assigned or called in this file */
+  boolean warning, stopOnWarning;  /* never read or written in this file */
+};
+typedef struct my_error_mgr *my_error_ptr;
+
+/*
+ * Fatal-error hook installed in place of libjpeg's standard error_exit:
+ * it reports the message and then jumps back to the decoder's setjmp() point.
+ */
+METHODDEF(void)
+my_error_exit(j_common_ptr cinfo)
+{
+  /* Report the error first through whichever output_message routine is installed. */
+  cinfo->err->output_message(cinfo);
+
+  /* cinfo->err is really the `pub` member at the start of a my_error_mgr,
+   * so the cast recovers the enclosing struct that holds the jump buffer. */
+  my_error_ptr error_state = (my_error_ptr)cinfo->err;
+
+  /* Hand control back to the setjmp() call, which then returns 1. */
+  longjmp(error_state->setjmp_buffer, 1);
+}
+
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function doesn't scale the decoded image
+int tjDecompress2_partial(tjhandle handle, const unsigned char *jpegBuf,
+                                    unsigned long jpegSize, unsigned char *dstBuf,
+                                    int width, int pitch, int height, int pixelFormat,
+                                    int flags, unsigned int *crop_x_diff, unsigned int *crop_width_diff,
+                                    unsigned int crop_x, unsigned int crop_y,
+                                    unsigned int crop_width, unsigned int crop_height)
+{
+    // Decodes the crop_width x crop_height region at (crop_x, crop_y) of the JPEG in jpegBuf into dstBuf,
+    // unscaled. jpeg_crop_scanline() may adjust crop_x/crop_width; the values it leaves are reported via
+    // crop_x_diff/crop_width_diff. Returns 0 on success, -1 on a libjpeg error or an out-of-range crop.
+    JSAMPROW *row_pointer = NULL;
+    int i, retval = 0;
+    if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
+        pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF)
+        THROW("tjDecompress2_partial(): Invalid argument");
+
+    struct jpeg_decompress_struct cinfo;
+    // Route fatal libjpeg errors through my_error_exit(), which longjmp()s back to the setjmp below.
+    struct my_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr.pub);
+    jerr.pub.error_exit = my_error_exit;
+    if (setjmp(jerr.setjmp_buffer)) {
+      /* If we get here, the JPEG code has signaled an error. */
+      retval = -1;  goto bailout;
+    }
+
+    // set up, read header, set image parameters, save size
+    jpeg_create_decompress(&cinfo);
+    jpeg_mem_src(&cinfo, jpegBuf, jpegSize);
+    jpeg_read_header(&cinfo, TRUE);
+    cinfo.out_color_space = pf2cs[pixelFormat];
+    if (flags & TJFLAG_FASTDCT) cinfo.dct_method = JDCT_FASTEST;
+    if (flags & TJFLAG_FASTUPSAMPLE) cinfo.do_fancy_upsampling = FALSE;
+
+    jpeg_start_decompress(&cinfo);
+    /* Check for valid crop dimensions.  We cannot check these values until after
+    * jpeg_start_decompress() is called.  Subtraction form avoids unsigned wrap-around of x + width. */
+    if (crop_x > cinfo.output_width || crop_width > cinfo.output_width - crop_x ||
+        crop_y > cinfo.output_height || crop_height > cinfo.output_height - crop_y) {
+        ERR("crop dimensions:" << crop_width << " x " << crop_height << " exceed image dimensions" <<
+            cinfo.output_width << " x " << cinfo.output_height);
+        retval = -1;  goto bailout;
+    }
+
+    jpeg_crop_scanline(&cinfo, &crop_x, &crop_width);
+    *crop_x_diff = crop_x;
+    *crop_width_diff = crop_width;
+
+    if (pitch == 0) pitch = cinfo.output_width * tjPixelSize[pixelFormat];
+    if ((row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo.output_height)) == NULL) {
+      jpeg_destroy_decompress(&cinfo);  // release the decoder before THROW() leaves the function
+      THROW("tjDecompress2_partial(): Memory allocation failure");
+    }
+    // Re-arm the jump target now that row_pointer is set, so bailout observes its final value.
+    if (setjmp(jerr.setjmp_buffer)) {
+      retval = -1;  goto bailout;
+    }
+    // set row pointer for destination
+    for (i = 0; i < (int)cinfo.output_height; i++) {
+      if (flags & TJFLAG_BOTTOMUP)
+        row_pointer[i] = &dstBuf[(cinfo.output_height - i - 1) * (size_t)pitch];
+      else
+        row_pointer[i] = &dstBuf[i * (size_t)pitch];
+    }
+    /* Process data */
+    JDIMENSION num_scanlines;
+    jpeg_skip_scanlines(&cinfo, crop_y);
+    while (cinfo.output_scanline <  crop_y + crop_height) {
+        if (cinfo.output_scanline < crop_y)
+          num_scanlines = jpeg_read_scanlines(&cinfo,  &row_pointer[cinfo.output_scanline],
+                                          crop_y + crop_height - cinfo.output_scanline);
+        else
+          num_scanlines = jpeg_read_scanlines(&cinfo,  &row_pointer[cinfo.output_scanline - crop_y],
+                                          crop_y + crop_height - cinfo.output_scanline);
+        if (num_scanlines == 0){
+          ERR("Premature end of Jpeg data. Stopped at " << cinfo.output_scanline - crop_y << "/"
+              << cinfo.output_height);
+          retval = -1;  goto bailout;  // no progress is possible; looping again would never terminate
+        }
+    }
+    jpeg_skip_scanlines(&cinfo, cinfo.output_height - crop_y - crop_height);
+    jpeg_finish_decompress(&cinfo);
+
+    bailout:
+    jpeg_destroy_decompress(&cinfo);  // frees everything owned by cinfo, including an unfinished decode
+    if (row_pointer) free(row_pointer);
+    return retval;
+}
+
+//! * Decompress a subregion of JPEG image to an RGB, grayscale, or CMYK image.
+//! * This function scales the decoded image to fit the output dims
+
+int tjDecompress2_partial_scale(tjhandle handle, const unsigned char *jpegBuf,
+                            unsigned long jpegSize, unsigned char *dstBuf,
+                            int width, int pitch, int height, int pixelFormat,
+                            int flags, unsigned int crop_width, unsigned int crop_height)
+{
+    // Picks the first TurboJPEG scaling factor at which crop_width x crop_height fits in width x height
+    // (0 means the JPEG's own size), decodes at that scale and requests the scaledw x scaledh region that
+    // ends at the last column/row into dstBuf. Returns 0, or -1 on a libjpeg error / out-of-range crop.
+    JSAMPROW *row_pointer = NULL;
+    int i, retval = 0, jpegwidth, jpegheight;
+    unsigned int scaledw, scaledh, crop_x, crop_y, max_crop_width;
+    tjscalingfactor *scalingFactors = NULL;
+    int numScalingFactors = 0;
+    unsigned char *tmp_row = NULL;
+
+    if (jpegBuf == NULL || jpegSize <= 0 || dstBuf == NULL || width < 0 ||
+          pitch < 0 || height < 0 || pixelFormat < 0 || pixelFormat >= TJ_NUMPF) {
+        THROW("tjDecompress2_partial_scale(): Invalid argument");
+    }
+    // Fetched before any libjpeg state exists, so this THROW has nothing to clean up.
+    if ((scalingFactors = tj3GetScalingFactors(&numScalingFactors)) == NULL)
+        THROW("tjDecompress2_partial_scale(): error getting scaling factors");
+
+    struct jpeg_decompress_struct cinfo;
+    // Route fatal libjpeg errors through my_error_exit(), which longjmp()s back to the setjmp below.
+    struct my_error_mgr jerr;
+    cinfo.err = jpeg_std_error(&jerr.pub);
+    jerr.pub.error_exit = my_error_exit;
+    if (setjmp(jerr.setjmp_buffer)) {
+        /* If we get here, the JPEG code has signaled an error. */
+        retval = -1;  goto bailout;
+    }
+
+    jpeg_create_decompress(&cinfo);  // must precede jpeg_mem_src(): cinfo is otherwise uninitialized
+    jpeg_mem_src(&cinfo, jpegBuf, jpegSize);
+    jpeg_read_header(&cinfo, TRUE);
+    cinfo.out_color_space = pf2cs[pixelFormat];
+    if (flags & TJFLAG_FASTDCT) cinfo.dct_method = JDCT_FASTEST;
+    if (flags & TJFLAG_FASTUPSAMPLE) cinfo.do_fancy_upsampling = FALSE;
+
+    jpegwidth = cinfo.image_width;  jpegheight = cinfo.image_height;
+    if (width == 0) width = jpegwidth;
+    if (height == 0) height = jpegheight;
+    for (i = 0; i < numScalingFactors; i++) {
+      scaledw = TJSCALED(crop_width, scalingFactors[i]);
+      scaledh = TJSCALED(crop_height, scalingFactors[i]);
+      if (scaledw <= (unsigned int)width && scaledh <= (unsigned int)height)
+        break;
+    }
+    if (i >= numScalingFactors) {
+      jpeg_destroy_decompress(&cinfo);  // release the decoder before THROW() leaves the function
+      THROW("tjDecompress2_partial_scale(): Could not scale down to desired image dimensions");
+    }
+    if (cinfo.num_components > 3) {
+      jpeg_destroy_decompress(&cinfo);
+      THROW("tjDecompress2_partial_scale(): JPEG image must have 3 or fewer components");
+    }
+    cinfo.scale_num = scalingFactors[i].num;
+    cinfo.scale_denom = scalingFactors[i].denom;
+
+    jpeg_start_decompress(&cinfo);
+    /* Output dimensions are only known after jpeg_start_decompress(). Validate before deriving the
+    * offsets: output_width - scaledw is unsigned and would wrap, making a later check always pass. */
+    if (scaledw > cinfo.output_width || scaledh > cinfo.output_height) {
+        ERR("crop dimensions:" << scaledw << " x " << scaledh << " exceed image dimensions" <<
+            cinfo.output_width << " x " << cinfo.output_height);
+        retval = -1;  goto bailout;
+    }
+    crop_x = cinfo.output_width - scaledw;
+    crop_y = cinfo.output_height - scaledh;
+
+    if (pitch == 0) pitch = cinfo.output_width * tjPixelSize[pixelFormat];
+    row_pointer = (JSAMPROW *)malloc(sizeof(JSAMPROW) * cinfo.output_height);
+    tmp_row = (unsigned char *)malloc((size_t)pitch);  // one row of tmp storage for discarded data
+    if (row_pointer == NULL || tmp_row == NULL) {
+        free(row_pointer);  free(tmp_row);  // free(NULL) is a no-op
+        jpeg_destroy_decompress(&cinfo);
+        THROW("tjDecompress2_partial_scale(): Memory allocation failure");
+    }
+    // Re-arm the jump target so bailout observes the final row_pointer/tmp_row values.
+    if (setjmp(jerr.setjmp_buffer)) {
+      retval = -1;  goto bailout;
+    }
+    for (i = 0; i < (int)cinfo.output_height; i++) {
+        if (i < height) {
+            if (flags & TJFLAG_BOTTOMUP)
+                row_pointer[i] = &dstBuf[(cinfo.output_height - i - 1) * (size_t)pitch];
+            else
+                row_pointer[i] = &dstBuf[i * (size_t)pitch];
+        } else {
+            row_pointer[i] = tmp_row;
+        }
+    }
+    // the width for the crop shouldn't exceed output_width
+    max_crop_width = scaledw;
+    jpeg_crop_scanline(&cinfo, &crop_x, &max_crop_width);
+    jpeg_skip_scanlines(&cinfo, crop_y);
+    while (cinfo.output_scanline <  cinfo.output_height) {
+      if (cinfo.output_scanline < crop_y)
+          jpeg_read_scanlines(&cinfo,  &row_pointer[cinfo.output_scanline], cinfo.output_height - cinfo.output_scanline);
+      else
+          jpeg_read_scanlines(&cinfo,  &row_pointer[cinfo.output_scanline- crop_y], cinfo.output_height - cinfo.output_scanline);
+    }
+    jpeg_finish_decompress(&cinfo);
+
+  bailout:
+    jpeg_destroy_decompress(&cinfo);  // frees everything owned by cinfo, including an unfinished decode
+    if (row_pointer) free(row_pointer);
+    if (tmp_row) free(tmp_row);
+    return retval;
+}
diff --git a/rocAL/source/loaders/image/node_numpy_loader.cpp b/rocAL/source/loaders/image/node_numpy_loader.cpp
index b29339c81..3f5319490 100644
--- a/rocAL/source/loaders/image/node_numpy_loader.cpp
+++ b/rocAL/source/loaders/image/node_numpy_loader.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
index 061fe18f7..ed9d3730a 100644
--- a/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
+++ b/rocAL/source/loaders/image/node_numpy_loader_single_shard.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/numpy_loader.cpp b/rocAL/source/loaders/image/numpy_loader.cpp
index f9d658dae..4e614dca3 100644
--- a/rocAL/source/loaders/image/numpy_loader.cpp
+++ b/rocAL/source/loaders/image/numpy_loader.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/loaders/image/numpy_loader_sharded.cpp b/rocAL/source/loaders/image/numpy_loader_sharded.cpp
index 916c24989..b514baf91 100644
--- a/rocAL/source/loaders/image/numpy_loader_sharded.cpp
+++ b/rocAL/source/loaders/image/numpy_loader_sharded.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/rocAL/source/pipeline/tensor.cpp b/rocAL/source/pipeline/tensor.cpp
index 5f0a53a42..043c08319 100644
--- a/rocAL/source/pipeline/tensor.cpp
+++ b/rocAL/source/pipeline/tensor.cpp
@@ -22,7 +22,7 @@ THE SOFTWARE.
 */
 
 #include <cstdio>
-#if !ENABLE_HIP
+#if ENABLE_OPENCL
 #include <CL/cl.h>
 #endif
 #include <vx_ext_amd.h>
@@ -116,17 +116,17 @@ void TensorInfo::reset_tensor_roi_buffers() {
     auto roi_size = (_layout == RocalTensorlayout::NFCHW || _layout == RocalTensorlayout::NFHWC) ? _dims[0] * _dims[1] : _batch_size;  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions
     allocate_host_or_pinned_mem((void **)&roi_buf, roi_size * roi_no_of_dims * 2 * sizeof(unsigned), _mem_type);
     _roi.set_ptr(roi_buf, _mem_type, roi_size, roi_no_of_dims);
-    if (_layout == RocalTensorlayout::NCDHW || _layout == RocalTensorlayout::NDHWC) {
-        for (unsigned i = 0; i < _batch_size; i++) {
-            unsigned *tensor_shape = _roi[i].end;
-            tensor_shape[i] = _max_shape[i];
-        }
-    } else if (_is_image) {
+    if (_is_image) {
         Roi2DCords *roi = _roi.get_2D_roi();
         for (unsigned i = 0; i < _batch_size; i++) {
             roi[i].xywh.w = _max_shape.at(0);
             roi[i].xywh.h = _max_shape.at(1);
         }
+    } else {
+        for (unsigned i = 0; i < _batch_size; i++) {
+            unsigned *tensor_shape = _roi[i].end;
+            tensor_shape[i] = _max_shape[i];
+        }
     }
 }
 
@@ -221,10 +221,8 @@ void Tensor::update_tensor_roi(const std::vector<std::vector<uint32_t>> &shape)
             THROW("The number of dims to be updated and the num of dims of tensor info does not match")
         
         unsigned *tensor_shape = _info.roi()[i].end;
-        if (_info.layout() == RocalTensorlayout::NCDHW || _info.layout() == RocalTensorlayout::NDHWC) {
-            for (unsigned j = 0; j < max_shape.size(); j++) {
-                tensor_shape[j] = shape[i][j] > max_shape[j] ? max_shape[j] : shape[i][j];
-            }
+        for (unsigned j = 0; j < max_shape.size(); j++) {
+            tensor_shape[j] = shape[i][j] > max_shape[j] ? max_shape[j] : shape[i][j];
         }
     }
 }
@@ -335,18 +333,21 @@ void Tensor::create_roi_tensor_from_handle(void **handle) {
         THROW("Empty ROI handle is passed")
     }
 
-    vx_size num_of_dims = 2;
-    vx_size stride[num_of_dims];
-    std::vector<size_t> roi_dims = {_info.batch_size(), 4};
+    auto _is_image = _info.is_image();
+    vx_size roi_num_of_dims = 2;
+    vx_size num_of_dims = _is_image ? 2 : (_info.num_of_dims() - 1);
+    std::vector<size_t> roi_dims;
+    roi_dims = {_info.batch_size(), num_of_dims * 2};
     if (_info.layout() == RocalTensorlayout::NFCHW || _info.layout() == RocalTensorlayout::NFHWC)
         roi_dims = {_info.dims()[0] * _info.dims()[1], 4};  // For Sequences pre allocating the ROI to N * F to replicate in OpenVX extensions        stride[0] = sizeof(vx_uint32);
+    vx_size stride[roi_num_of_dims];
     stride[0] = sizeof(vx_uint32);
     stride[1] = stride[0] * roi_dims[0];
     vx_enum mem_type = VX_MEMORY_TYPE_HOST;
     if (_info.mem_type() == RocalMemType::HIP)
         mem_type = VX_MEMORY_TYPE_HIP;
 
-    _vx_roi_handle = vxCreateTensorFromHandle(_context, num_of_dims, roi_dims.data(),
+    _vx_roi_handle = vxCreateTensorFromHandle(_context, roi_num_of_dims, roi_dims.data(),
                                               VX_TYPE_UINT32, 0, stride, *handle, mem_type);
     vx_status status;
     if ((status = vxGetStatus((vx_reference)_vx_roi_handle)) != VX_SUCCESS)
diff --git a/rocAL/source/readers/image/numpy_data_reader.cpp b/rocAL/source/readers/image/numpy_data_reader.cpp
index 94f0445dd..2f2171509 100644
--- a/rocAL/source/readers/image/numpy_data_reader.cpp
+++ b/rocAL/source/readers/image/numpy_data_reader.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2019 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,9 +25,10 @@ THE SOFTWARE.
 #include <commons.h>
 
 #include <algorithm>
+#include <cassert>
 #include <numeric>
 #include <random>
-#include <cassert>
+
 #include "filesystem.h"
 
 NumpyDataReader::NumpyDataReader() : _shuffle_time("shuffle_time", DBG_TIMING) {
@@ -89,7 +90,8 @@ void NumpyDataReader::incremenet_read_ptr() {
 }
 
 size_t NumpyDataReader::open() {
-    auto file_path = _file_names[_curr_file_idx];  // Get next file name
+    auto file_path = _file_names[_curr_file_idx];       // Get current file name
+    _curr_file_header = _file_headers[_curr_file_idx];  // Get current file header
     incremenet_read_ptr();
     _last_id = file_path;
     auto last_slash_idx = _last_id.find_last_of("\\/");
@@ -97,10 +99,10 @@ size_t NumpyDataReader::open() {
         _last_id.erase(0, last_slash_idx + 1);
     }
 
-    auto ret = GetFromCache(file_path, _file_headers[_curr_file_idx]);
+    auto ret = get_cached_header(file_path, _curr_file_header);
     if (!ret) {
-        ParseHeader(_file_headers[_curr_file_idx], file_path);
-        UpdateCache(file_path, _file_headers[_curr_file_idx]);
+        read_header(_curr_file_header, file_path);
+        update_header_cache(file_path, _curr_file_header);
     } else {
         _current_fPtr = std::fopen(file_path.c_str(), "rb");
         if (_current_fPtr == nullptr)
@@ -108,10 +110,10 @@ size_t NumpyDataReader::open() {
     }
     fseek(_current_fPtr, 0, SEEK_SET);  // Take the file pointer back to the start
 
-    return _file_headers[_curr_file_idx].nbytes();
+    return _curr_file_header.nbytes();
 }
 
-bool NumpyDataReader::GetFromCache(const std::string& file_name, NumpyHeaderData& header) {
+bool NumpyDataReader::get_cached_header(const std::string& file_name, NumpyHeaderData& header) {
     std::unique_lock<std::mutex> cache_lock(_cache_mutex_);
     auto it = _header_cache_.find(file_name);
     if (it == _header_cache_.end()) {
@@ -122,12 +124,12 @@ bool NumpyDataReader::GetFromCache(const std::string& file_name, NumpyHeaderData
     }
 }
 
-void NumpyDataReader::UpdateCache(const std::string& file_name, const NumpyHeaderData& value) {
+void NumpyDataReader::update_header_cache(const std::string& file_name, const NumpyHeaderData& value) {
     std::unique_lock<std::mutex> cache_lock(_cache_mutex_);
     _header_cache_[file_name] = value;
 }
 
-const RocalTensorDataType NumpyDataReader::TypeFromNumpyStr(const std::string& format) {
+const RocalTensorDataType NumpyDataReader::get_numpy_dtype(const std::string& format) {
     if (format == "u1") return RocalTensorDataType::UINT8;
     // if (format == "u2") return TypeTable::GetTypeInfo<uint16_t>();   // Currently not supported in rocAL
     if (format == "u4") return RocalTensorDataType::UINT32;
@@ -147,20 +149,20 @@ const RocalTensorDataType NumpyDataReader::TypeFromNumpyStr(const std::string& f
     THROW("Unknown Numpy type string");
 }
 
-inline void NumpyDataReader::SkipSpaces(const char*& ptr) {
+inline void NumpyDataReader::ignore_spaces(const char*& ptr) {
     while (::isspace(*ptr))
         ptr++;
 }
 
 template <size_t N>
-void NumpyDataReader::Skip(const char*& ptr, const char (&what)[N]) {
+void NumpyDataReader::skip_string(const char*& ptr, const char (&what)[N]) {
     if (strncmp(ptr, what, N - 1))
         THROW("Found wrong symbol during parsing");
     ptr += N - 1;
 }
 
 template <size_t N>
-bool NumpyDataReader::TrySkip(const char*& ptr, const char (&what)[N]) {
+bool NumpyDataReader::check_and_skip_string(const char*& ptr, const char (&what)[N]) {
     if (!strncmp(ptr, what, N - 1)) {
         ptr += N - 1;
         return true;
@@ -170,18 +172,18 @@ bool NumpyDataReader::TrySkip(const char*& ptr, const char (&what)[N]) {
 }
 
 template <size_t N>
-void NumpyDataReader::SkipFieldName(const char*& ptr, const char (&name)[N]) {
-    SkipSpaces(ptr);
-    Skip(ptr, "'");
-    Skip(ptr, name);
-    Skip(ptr, "'");
-    SkipSpaces(ptr);
-    Skip(ptr, ":");
-    SkipSpaces(ptr);
+void NumpyDataReader::skip_field(const char*& ptr, const char (&name)[N]) {
+    ignore_spaces(ptr);
+    skip_string(ptr, "'");
+    skip_string(ptr, name);
+    skip_string(ptr, "'");
+    ignore_spaces(ptr);
+    skip_string(ptr, ":");
+    ignore_spaces(ptr);
 }
 
 template <typename T = int64_t>
-T NumpyDataReader::ParseInteger(const char*& ptr) {
+T NumpyDataReader::parse_int(const char*& ptr) {
     char* out_ptr = const_cast<char*>(ptr);  // strtol takes a non-const pointer
     T value = static_cast<T>(strtol(ptr, &out_ptr, 10));
     if (out_ptr == ptr)
@@ -190,7 +192,7 @@ T NumpyDataReader::ParseInteger(const char*& ptr) {
     return value;
 }
 
-std::string NumpyDataReader::ParseStringValue(const char*& input, char delim_start, char delim_end) {
+std::string NumpyDataReader::read_dtype_string(const char*& input, char delim_start, char delim_end) {
     if (*input++ != delim_start)
         THROW("Expected \'" + std::to_string(delim_start) + "\'");
     std::string out;
@@ -228,39 +230,39 @@ std::string NumpyDataReader::ParseStringValue(const char*& input, char delim_sta
     return out;
 }
 
-void NumpyDataReader::ParseHeaderContents(NumpyHeaderData& target, const std::string& header) {
+void NumpyDataReader::decode_header(NumpyHeaderData& target, const std::string& header) {
     const char* hdr = header.c_str();
-    SkipSpaces(hdr);
-    Skip(hdr, "{");
-    SkipFieldName(hdr, "descr");
-    auto typestr = ParseStringValue(hdr);
+    ignore_spaces(hdr);
+    skip_string(hdr, "{");
+    skip_field(hdr, "descr");
+    auto typestr = read_dtype_string(hdr);
     // < means LE, | means N/A, = means native. In all those cases, we can read
     bool little_endian = (typestr[0] == '<' || typestr[0] == '|' || typestr[0] == '=');
     if (!little_endian)
         THROW("Big Endian files are not supported.");
-    target._type_info = TypeFromNumpyStr(typestr.substr(1));
+    target._type_info = get_numpy_dtype(typestr.substr(1));
 
-    SkipSpaces(hdr);
-    Skip(hdr, ",");
-    SkipFieldName(hdr, "fortran_order");
-    if (TrySkip(hdr, "True")) {
+    ignore_spaces(hdr);
+    skip_string(hdr, ",");
+    skip_field(hdr, "fortran_order");
+    if (check_and_skip_string(hdr, "True")) {
         target._fortran_order = true;
-    } else if (TrySkip(hdr, "False")) {
+    } else if (check_and_skip_string(hdr, "False")) {
         target._fortran_order = false;
     } else {
         THROW("Failed to parse fortran_order field.");
     }
-    SkipSpaces(hdr);
-    Skip(hdr, ",");
-    SkipFieldName(hdr, "shape");
-    Skip(hdr, "(");
-    SkipSpaces(hdr);
+    ignore_spaces(hdr);
+    skip_string(hdr, ",");
+    skip_field(hdr, "shape");
+    skip_string(hdr, "(");
+    ignore_spaces(hdr);
     target._shape.clear();
     while (*hdr != ')') {
-        // ParseInteger already skips the leading spaces (strtol does).
-        target._shape.push_back(static_cast<unsigned>(ParseInteger<int64_t>(hdr)));
-        SkipSpaces(hdr);
-        if (!(TrySkip(hdr, ",")) && (target._shape.size() <= 1))
+        // parse_int already skips the leading spaces (strtol does).
+        target._shape.push_back(static_cast<unsigned>(parse_int<int64_t>(hdr)));
+        ignore_spaces(hdr);
+        if (!(check_and_skip_string(hdr, ",")) && (target._shape.size() <= 1))
             THROW("The first number in a tuple must be followed by a comma.");
     }
     if (target._fortran_order) {
@@ -269,7 +271,7 @@ void NumpyDataReader::ParseHeaderContents(NumpyHeaderData& target, const std::st
     }
 }
 
-void NumpyDataReader::ParseHeader(NumpyHeaderData& parsed_header, std::string file_path) {
+void NumpyDataReader::read_header(NumpyHeaderData& parsed_header, std::string file_path) {
     // check if the file is actually a numpy file
     std::vector<char> token(128);
     _current_fPtr = std::fopen(file_path.c_str(), "rb");
@@ -310,7 +312,7 @@ void NumpyDataReader::ParseHeader(NumpyHeaderData& parsed_header, std::string fi
     if (std::fseek(_current_fPtr, offset, SEEK_SET))
         THROW("Seek operation failed: " + std::strerror(errno));
 
-    ParseHeaderContents(parsed_header, header);
+    decode_header(parsed_header, header);
     parsed_header._data_offset = offset;
 }
 
@@ -321,10 +323,10 @@ size_t NumpyDataReader::read_numpy_data(void* buf, size_t read_size, std::vector
     // Requested read size bigger than the file size? just read as many bytes as the file size
     read_size = (read_size > _current_file_size) ? _current_file_size : read_size;
 
-    if (std::fseek(_current_fPtr, _file_headers[_curr_file_idx]._data_offset, SEEK_SET))
+    if (std::fseek(_current_fPtr, _curr_file_header._data_offset, SEEK_SET))
         THROW("Seek operation failed: " + std::strerror(errno));
 
-    auto shape = _file_headers[_curr_file_idx].shape();
+    auto shape = _curr_file_header.shape();
     auto num_dims = max_shape.size();
     std::vector<unsigned> strides(num_dims + 1);
     strides[num_dims] = 1;
@@ -333,28 +335,28 @@ size_t NumpyDataReader::read_numpy_data(void* buf, size_t read_size, std::vector
     }
 
     size_t actual_read_size = 0;
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::UINT8)
-        actual_read_size = ParseNumpyData<u_int8_t>((u_int8_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::UINT32)
-        actual_read_size = ParseNumpyData<u_int32_t>((u_int32_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::INT8)
-        actual_read_size = ParseNumpyData<int8_t>((int8_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::INT32)
-        actual_read_size = ParseNumpyData<int32_t>((int32_t*)buf, strides, shape);
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::FP16)
+    if (_curr_file_header.type() == RocalTensorDataType::UINT8)
+        actual_read_size = copy_array_data<u_int8_t>((u_int8_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::UINT32)
+        actual_read_size = copy_array_data<u_int32_t>((u_int32_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::INT8)
+        actual_read_size = copy_array_data<int8_t>((int8_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::INT32)
+        actual_read_size = copy_array_data<int32_t>((int32_t*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::FP16)
 #if defined(AMD_FP16_SUPPORT)
-        actual_read_size = ParseNumpyData<half>((half*)buf, strides, shape);
+        actual_read_size = copy_array_data<half>((half*)buf, strides, shape);
 #else
         THROW("FLOAT16 type tensor not supported")
 #endif
-    if (_file_headers[_curr_file_idx].type() == RocalTensorDataType::FP32)
-        actual_read_size = ParseNumpyData<float>((float*)buf, strides, shape);
+    if (_curr_file_header.type() == RocalTensorDataType::FP32)
+        actual_read_size = copy_array_data<float>((float*)buf, strides, shape);
 
     return actual_read_size;
 }
 
 template <typename T>
-size_t NumpyDataReader::ParseNumpyData(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim) {
+size_t NumpyDataReader::copy_array_data(T* buf, std::vector<unsigned> strides, std::vector<unsigned> shapes, unsigned dim) {
     if (dim == (shapes.size() - 1)) {
         auto actual_read_size = std::fread(buf, sizeof(T), shapes[dim], _current_fPtr);
         return actual_read_size;
@@ -362,14 +364,14 @@ size_t NumpyDataReader::ParseNumpyData(T* buf, std::vector<unsigned> strides, st
     T* startPtr = buf;
     size_t read_size = 0;
     for (unsigned d = 0; d < shapes[dim]; d++) {
-        read_size += ParseNumpyData<T>(startPtr, strides, shapes, dim + 1);
+        read_size += copy_array_data<T>(startPtr, strides, shapes, dim + 1);
         startPtr += strides[dim + 1];
     }
     return read_size;
 }
 
 const NumpyHeaderData NumpyDataReader::get_numpy_header_data() {
-    return _file_headers[_curr_file_idx];
+    return _curr_file_header;
 }
 
 size_t NumpyDataReader::read_data(unsigned char* buf, size_t read_size) {
@@ -379,10 +381,10 @@ size_t NumpyDataReader::read_data(unsigned char* buf, size_t read_size) {
     // Requested read size bigger than the file size? just read as many bytes as the file size
     read_size = (read_size > _current_file_size) ? _current_file_size : read_size;
 
-    if (std::fseek(_current_fPtr, _file_headers[_curr_file_idx]._data_offset, SEEK_SET))
+    if (std::fseek(_current_fPtr, _curr_file_header._data_offset, SEEK_SET))
         THROW("Seek operation failed: " + std::strerror(errno));
 
-    size_t actual_read_size = std::fread(buf, 1, _file_headers[_curr_file_idx].nbytes(), _current_fPtr);
+    size_t actual_read_size = std::fread(buf, 1, _curr_file_header.nbytes(), _current_fPtr);
     return actual_read_size;
 }
 
diff --git a/rocAL_pybind/amd/rocal/fn.py b/rocAL_pybind/amd/rocal/fn.py
index 1ec5289c2..8623dea2a 100644
--- a/rocAL_pybind/amd/rocal/fn.py
+++ b/rocAL_pybind/amd/rocal/fn.py
@@ -1161,3 +1161,22 @@ def random_object_bbox(*inputs, format='anchor_shape', background=0, cache_objec
     else:
         print('Wrong format passed to random_object_bbox')
         return ()
+
+def transpose(*inputs, perm=[], output_layout=types.NHWC, output_dtype=types.UINT8):
+    """Forward inputs[0] to the b.transpose binding on the current pipeline and return its result."""
+    # Positional order passed to the binding: input, perm, is_output (always False), output_layout, output_dtype.
+    call_args = (inputs[0], perm, False, output_layout, output_dtype)
+    return b.transpose(Pipeline._current_pipeline._handle, *call_args)
+
+def normalize(*inputs, axes=[], mean=[], stddev=[], scale=1.0, shift=0.0, output_layout=types.NHWC, output_dtype=types.UINT8):
+    """Forward inputs[0] and the normalization settings to the b.normalize binding and return its result."""
+    # Positional order passed to the binding: input, axes, mean, stddev, is_output (always False), scale, shift, output_layout, output_dtype.
+    call_args = (inputs[0], axes, mean, stddev, False,
+                 scale, shift, output_layout, output_dtype)
+    return b.normalize(Pipeline._current_pipeline._handle, *call_args)
+
+def cast(*inputs, output_dtype=types.UINT8):
+    """Forward inputs[0] to the b.cast binding on the current pipeline and return its result."""
+    # pybind call arguments, passed positionally in this order: input, is_output (always False), output_dtype
+    kwargs_pybind = {"input_image": inputs[0], "is_output": False, "output_dtype": output_dtype}
+    return b.cast(Pipeline._current_pipeline._handle, *(kwargs_pybind.values()))
diff --git a/rocAL_pybind/amd/rocal/readers.py b/rocAL_pybind/amd/rocal/readers.py
index d45cc0201..b115a3d92 100644
--- a/rocAL_pybind/amd/rocal/readers.py
+++ b/rocAL_pybind/amd/rocal/readers.py
@@ -352,8 +352,8 @@ def mxnet(path, stick_to_shard=False, pad_last_batch=False):
     return mxnet_metadata
 
 
-def numpy(*inputs, file_root='', num_shards=1,
-          random_shuffle=False, shard_id=0, files=[], stick_to_shard=False, pad_last_batch=False, seed=0):
+def numpy(*inputs, file_root='', files=[], num_shards=1,
+          random_shuffle=False, shard_id=0, stick_to_shard=False, pad_last_batch=False, seed=0):
 
     Pipeline._current_pipeline._reader = "NumpyReader"
     # Output
diff --git a/rocAL_pybind/examples/rocAL_api_numpy_reader.py b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
index e2961eddc..09e50a7f6 100644
--- a/rocAL_pybind/examples/rocAL_api_numpy_reader.py
+++ b/rocAL_pybind/examples/rocAL_api_numpy_reader.py
@@ -10,8 +10,9 @@
 import sys
 import os, glob
 
-val_cases_list = ['00000', '00003', '00005', '00006', '00012', '00024', '00034', '00041', '00044', '00049', '00052', '00056', '00061', '00065', '00066', '00070', '00076', '00078', '00080', '00084',
-                  '00086', '00087', '00092', '00111', '00112', '00125', '00128', '00138', '00157', '00160', '00161', '00162', '00169', '00171', '00176', '00185', '00187', '00189', '00198', '00203', '00206', '00207']
+# MEAN / STDDEV (16 values each) are passed as mean= / stddev= to fn.normalize in both pipelines built in main().
+MEAN = [0.026144592091441154, -88.3379898071289, -84.62094116210938, -78.56366729736328, -77.72217559814453, 7.33015557974337e-12, 48330.79296875, 87595.4296875, 183.57638549804688, 208.38265991210938, -7.185957863625792e-19, 109.64270782470703, 94.19403076171875, -0.37584438920021057, 9952.041015625, 20.362579345703125]
+STDDEV = [108.9710922241211, 174.1948699951172, 173.99221801757812, 155.323486328125, 158.25418090820312, 0.14563894271850586, 58919.42578125, 24443.921875, 64.71000671386719, 77.63092041015625, 3.7348792830016464e-05, 242.97598266601562, 237.60250854492188, 5726.51611328125, 2953.1953125, 51.31494903564453]
 
 def load_data(path, files_pattern):
     data = sorted(glob.glob(os.path.join(path, files_pattern)))
@@ -19,19 +20,10 @@ def load_data(path, files_pattern):
     return data
 
 def get_data_split(path: str):
-    imgs = load_data(path, "*_x.npy")
-    lbls = load_data(path, "*_y.npy")
+    imgs = load_data(path, "data-*.npy")
+    lbls = load_data(path, "label-*.npy")
     assert len(imgs) == len(lbls), f"Found {len(imgs)} volumes but {len(lbls)} corresponding masks"
-    imgs_train, lbls_train, imgs_val, lbls_val = [], [], [], []
-    for (case_img, case_lbl) in zip(imgs, lbls):
-        if case_img.split("_")[-2] in val_cases_list:
-            imgs_val.append(case_img)
-            lbls_val.append(case_lbl)
-        else:
-            imgs_train.append(case_img)
-            lbls_train.append(case_lbl)
-
-    return imgs_train, imgs_val, lbls_train, lbls_val
+    return imgs, lbls
 
 def main():
     if  len(sys.argv) < 3:
@@ -45,63 +37,52 @@ def main():
     except OSError as error:
         print(error)
     data_path = sys.argv[1]
-    if(sys.argv[2] == "cpu"):
+    data_path1 = sys.argv[2]
+    if(sys.argv[3] == "cpu"):
         rocal_cpu = True
     else:
         rocal_cpu = False
-    batch_size = int(sys.argv[3])
+    batch_size = int(sys.argv[4])
     num_threads = 8
     device_id = 0
     local_rank = 0
     world_size = 1
     random_seed = random.SystemRandom().randint(0, 2**32 - 1)
-    x_train, x_val, y_train, y_val = get_data_split(data_path)
+    x_train, y_train = get_data_split(data_path)
+    x_val, y_val = get_data_split(data_path1)
 
     import time
     start = time.time()
-    pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=2)
+    pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
 
     with pipeline:
-        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
-        numpy_reader_output1 = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size, random_shuffle=True, seed=random_seed+local_rank)
-        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NCDHW)
-        label_output = fn.set_layout(numpy_reader_output1, output_layout=types.NCDHW)
-        [roi_start, roi_end] = fn.random_object_bbox(label_output, format="start_end", k_largest=2, foreground_prob=0.4)
-        anchor = fn.roi_random_crop(label_output, roi_start=roi_start, roi_end=roi_end, crop_shape=(1, 128, 128, 128))
-        data_sliced_output = fn.slice(data_output, anchor=anchor, shape=(1,128,128,128), output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        label_sliced_output = fn.slice(label_output, anchor=anchor, shape=(1,128,128,128), output_layout=types.NCDHW, output_dtype=types.UINT8)       
-        hflip = fn.random.coin_flip(probability=0.33)
-        vflip = fn.random.coin_flip(probability=0.33)
-        dflip = fn.random.coin_flip(probability=0.33)
-        data_flip_output = fn.flip(data_sliced_output, horizontal=hflip, vertical=vflip, depth=dflip, output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        label_flip_output = fn.flip(label_sliced_output, horizontal=hflip, vertical=vflip, depth=dflip, output_layout=types.NCDHW, output_dtype=types.UINT8)
-        brightness = fn.random.uniform(range=[0.7, 1.3])
-        add_brightness = fn.random.coin_flip(probability=0.1)
-        brightness_output = fn.brightness(data_flip_output, brightness=brightness, brightness_shift=0.0, conditional_execution=add_brightness, output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        add_noise = fn.random.coin_flip(probability=0.5)
-        std_dev = fn.random.uniform(range=[0.0, 0.1])
-        noise_output = fn.gaussian_noise(brightness_output, mean=0.0, std_dev=std_dev, conditional_execution=add_noise, output_layout=types.NCDHW, output_dtype=types.FLOAT)
-        pipeline.set_outputs(noise_output, label_flip_output)
+        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_train, shard_id=local_rank, num_shards=world_size)
+        label_output = fn.readers.numpy(file_root=data_path, files=y_train, shard_id=local_rank, num_shards=world_size)
+        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NHWC)
+        normalized_output = fn.normalize(data_output, axes=[0,1], mean=MEAN, stddev=STDDEV, output_layout=types.NHWC, output_dtype=types.FLOAT)
+        transposed_output = fn.transpose(normalized_output, perm=[2,0,1], output_layout=types.NCHW, output_dtype=types.FLOAT)
+        pipeline.set_outputs(transposed_output, label_output)
 
     pipeline.build()
 
-    pipeline1 = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
+    val_pipeline = Pipeline(batch_size=batch_size, num_threads=num_threads, device_id=device_id, seed=random_seed, rocal_cpu=rocal_cpu, prefetch_queue_depth=6)
 
-    with pipeline1:
-        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_val, shard_id=local_rank, num_shards=world_size)
-        numpy_reader_output1 = fn.readers.numpy(file_root=data_path, files=y_val, shard_id=local_rank, num_shards=world_size)
-        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NCDHW)
-        label_output = fn.set_layout(numpy_reader_output1, output_layout=types.NCDHW)
-        pipeline1.set_outputs(data_output, label_output)
+    with val_pipeline:
+        numpy_reader_output = fn.readers.numpy(file_root=data_path, files=x_val, shard_id=local_rank, num_shards=world_size, seed=random_seed+local_rank)
+        label_output = fn.readers.numpy(file_root=data_path, files=y_val, shard_id=local_rank, num_shards=world_size, seed=random_seed+local_rank)
+        data_output = fn.set_layout(numpy_reader_output, output_layout=types.NHWC)
+        normalized_output = fn.normalize(data_output, axes=[0,1], mean=MEAN, stddev=STDDEV, output_layout=types.NHWC, output_dtype=types.FLOAT)
+        transposed_output = fn.transpose(normalized_output, perm=[2,0,1], output_layout=types.NCHW, output_dtype=types.FLOAT)
+        val_pipeline.set_outputs(transposed_output, label_output)
 
-    pipeline1.build()
+    val_pipeline.build()
     
     numpyIteratorPipeline = ROCALNumpyIterator(pipeline, device='cpu' if rocal_cpu else 'gpu')
     print(len(numpyIteratorPipeline))
-    valNumpyIteratorPipeline = ROCALNumpyIterator(pipeline1, device='cpu' if rocal_cpu else 'gpu', return_roi=True)
+    valNumpyIteratorPipeline = ROCALNumpyIterator(val_pipeline, device='cpu' if rocal_cpu else 'gpu')
     print(len(valNumpyIteratorPipeline))
     cnt = 0
-    for epoch in range(100):
+    for epoch in range(2):
         print("+++++++++++++++++++++++++++++EPOCH+++++++++++++++++++++++++++++++++++++",epoch)
         for i , it in enumerate(numpyIteratorPipeline):
             print(i, it[0].shape, it[1].shape)
diff --git a/rocAL_pybind/rocal_pybind.cpp b/rocAL_pybind/rocal_pybind.cpp
index 1562575dd..578c5e572 100644
--- a/rocAL_pybind/rocal_pybind.cpp
+++ b/rocAL_pybind/rocal_pybind.cpp
@@ -725,5 +725,11 @@ PYBIND11_MODULE(rocal_pybind, m) {
           py::return_value_policy::reference);
     m.def("slice", &rocalSlice,
           py::return_value_policy::reference);
+    m.def("transpose", &rocalTranspose,
+          py::return_value_policy::reference);
+    m.def("normalize", &rocalNormalize,
+          py::return_value_policy::reference);
+    m.def("cast", &rocalCast,
+          py::return_value_policy::reference);
 }
 }  // namespace rocal
diff --git a/rocAL_pybind/setup.py b/rocAL_pybind/setup.py
index 7d3598d35..9ee8e57ea 100644
--- a/rocAL_pybind/setup.py
+++ b/rocAL_pybind/setup.py
@@ -36,7 +36,7 @@ def has_ext_modules(self):
 setup(
     name='amd-rocal',
     description='AMD ROCm Augmentation Library',
-    url='https://github.com/GPUOpen-ProfessionalCompute-Libraries/MIVisionX/rocAL',
+    url='https://github.com/ROCm/rocAL',
     version='1.0.0',
     author='AMD',
     license='Apache License 2.0',
diff --git a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
index e48fe1d78..51265859f 100644
--- a/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
+++ b/tests/cpp_api_tests/rocAL_unittests/rocAL_unittests.cpp
@@ -319,6 +319,12 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
             rocalCreateMXNetReader(handle, path, true);
             decoded_output = rocalMXNetRecordSource(handle, path, color_format, num_threads, false, false, false, ROCAL_USE_USER_GIVEN_SIZE_RESTRICTED, decode_max_width, decode_max_height);
         } break;
+        case 12:  // Numpy reader: creates the numpy file source from `path` and selects pipeline_type 4
+        {
+            std::cout << ">>>>>>> Running Numpy reader" << std::endl;
+            pipeline_type = 4;
+            decoded_output = rocalNumpyFileSource(handle, path, num_threads, {}, false, false, false, ROCAL_USE_MAX_SIZE);
+        } break;
         default: {
             std::cout << ">>>>>>> Running IMAGE READER" << std::endl;
             pipeline_type = 1;
@@ -766,6 +772,53 @@ int test(int test_case, int reader_type, const char *path, const char *outName,
                     }
                 }
             } break;
+            case 4: {  // numpy reader pipeline
+                // Makes every output tensor of the numpy pipeline readable from the host.
+                // Tensors on the GPU backend are copied into a temporary host allocation
+                // that is released at the end of each iteration; tensors on the CPU
+                // backend are read in place through buffer(). No conversion to uchar is
+                // performed and nothing is written to disk in this branch.
+                RocalTensorList output_tensor_list = rocalGetOutputTensors(handle);
+                for (size_t idx = 0; idx < output_tensor_list->size(); idx++) {
+                    auto tensor = output_tensor_list->at(idx);
+
+                    // Pick the label used in the log line from the tensor's element type.
+                    const char *type_label = "uchar";
+                    if (tensor->data_type() == RocalTensorOutputType::ROCAL_FP32)
+                        type_label = "float";
+                    else if (tensor->data_type() == RocalTensorOutputType::ROCAL_FP16)
+                        type_label = "float16";
+
+                    std::cout << "Creating " << type_label << " buffer of ";
+                    for (auto x : tensor->shape())
+                        std::cout << x << " x ";
+                    std::cout << "shape\n";
+
+                    // host_copy owns memory only for GPU tensors; out_buffer is the
+                    // non-owning pointer through which the data can be read.
+                    unsigned char *host_copy = nullptr;
+                    unsigned char *out_buffer = nullptr;
+                    if (tensor->backend() == RocalTensorBackend::ROCAL_GPU) {
+                        host_copy = (unsigned char *)malloc(tensor->data_size());
+                        if (host_copy == nullptr) {
+                            std::cout << "Failed to allocate host buffer for numpy output\n";
+                            return -1;
+                        }
+                        tensor->copy_data(host_copy);
+                        out_buffer = host_copy;
+                    } else if (tensor->backend() == RocalTensorBackend::ROCAL_CPU) {
+                        out_buffer = (unsigned char *)(tensor->buffer());
+                    }
+
+                    // TODO: convert float/half data to uchar here if an image dump is needed.
+                    // The data is only staged, not consumed, so silence the unused
+                    // variable warning and release the staging copy (free(nullptr) is
+                    // a no-op for CPU tensors).
+                    (void)out_buffer;
+                    free(host_copy);
+                }
+                std::cout << "Copied numpy data to buffers\n";
+            } break;
             default: {
                 std::cout << "Not a valid pipeline type ! Exiting!\n";
                 return -1;