diff --git a/.github/workflows/test_packages.yml b/.github/workflows/test_packages.yml index 3a13dd4521..2c3b67b07f 100644 --- a/.github/workflows/test_packages.yml +++ b/.github/workflows/test_packages.yml @@ -33,6 +33,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -93,6 +94,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn diff --git a/.github/workflows/tests_suite.yml b/.github/workflows/tests_suite.yml index 13c5c6521c..cc438c34a0 100644 --- a/.github/workflows/tests_suite.yml +++ b/.github/workflows/tests_suite.yml @@ -61,6 +61,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -172,6 +173,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -258,6 +260,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -362,6 +365,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn diff --git a/.github/workflows/tests_suite_develop.yml b/.github/workflows/tests_suite_develop.yml index cc1e35266f..4820986c9d 100644 --- a/.github/workflows/tests_suite_develop.yml +++ b/.github/workflows/tests_suite_develop.yml @@ -62,6 +62,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -176,6 +177,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -266,6 +268,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - perception/skeleton_based_action_recognition/pstgcn @@ -376,6 +379,7 @@ jobs: - perception/multimodal_human_centric - perception/pose_estimation - perception/fall_detection + - perception/gesture_recognition - perception/speech_recognition - perception/skeleton_based_action_recognition/costgcn - 
perception/skeleton_based_action_recognition/pstgcn
diff --git a/docs/reference/gesture-recognition-learner.md b/docs/reference/gesture-recognition-learner.md
new file mode 100644
index 0000000000..8bd167761b
--- /dev/null
+++ b/docs/reference/gesture-recognition-learner.md
@@ -0,0 +1,217 @@
+## gesture_recognition module
+
+The *gesture_recognition* module contains the *GestureRecognitionLearner* class and can be used to recognize and localize 18 hand gestures.
+The module relies on the NanoDet object detection module.
+We provide data processing scripts and a pre-trained model for the [HaGRID dataset](https://github.com/hukenovs/hagrid/tree/master).
+
+### Class GestureRecognitionLearner
+Bases: `object_detection_2d.nanodet.NanodetLearner`
+
+The learner has the following public methods:
+
+#### `GestureRecognitionLearner` constructor
+```python
+GestureRecognitionLearner(self, model_to_use, iters, lr, batch_size, checkpoint_after_iter, checkpoint_load_iter, temp_path, device,
+                          weight_decay, warmup_steps, warmup_ratio, lr_schedule_T_max, lr_schedule_eta_min, grad_clip)
+```
+
+Constructor parameters:
+
+- **model_to_use**: *{"plus_m_1.5x_416"}, default=plus_m_1.5x_416*\
+  Specifies the model and config file to use. Currently only plus_m_1.5x_416 is supported; additional models can be added by providing a corresponding config file.
+- **iters**: *int, default=None*\
+  Specifies the number of epochs the training should run for.
+- **lr**: *float, default=None*\
+  Specifies the initial learning rate to be used during training.
+- **batch_size**: *int, default=None*\
+  Specifies the number of images to be bundled up in a batch during training.
+  This heavily affects memory usage; adjust according to your system.
+- **checkpoint_after_iter**: *int, default=None*\
+  Specifies the number of training iterations between saved checkpoints.
+  If it is set to 0, no checkpoints will be saved.
+- **checkpoint_load_iter**: *int, default=None*\
+  Specifies which checkpoint should be loaded.
+  If it is set to 0, no checkpoints will be loaded.
+- **temp_path**: *str, default=''*\
+  Specifies the path where checkpoints and logging files are saved. If *''*, `cfg.save_dir` is used instead.
+- **device**: *{'cpu', 'cuda'}, default='cuda'*\
+  Specifies the device to be used.
+- **weight_decay**: *float, default=None*
+- **warmup_steps**: *int, default=None*
+- **warmup_ratio**: *float, default=None*
+- **lr_schedule_T_max**: *int, default=None*
+- **lr_schedule_eta_min**: *float, default=None*
+- **grad_clip**: *int, default=None*
+
+#### `GestureRecognitionLearner.preprocess_data`
+```python
+GestureRecognitionLearner.preprocess_data(self, preprocess, download, verbose, save_path)
+```
+
+This method is used for downloading the [gesture recognition dataset](https://github.com/hukenovs/hagrid/tree/master) and preprocessing it to COCO format.
+
+Parameters:
+
+- **preprocess**: *bool, default=True*\
+  Indicates whether to preprocess the data located in save_path to COCO format.
+- **download** : *bool, default=False*\
+  Indicates whether to download the data to save_path.
+- **verbose** : *bool, default=True*\
+  Enables verbosity.
+- **save_path** : *str, default='./data'*\
+  Path where downloaded data is saved, or where already downloaded data that needs to be preprocessed is located.
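+
+For example, the following minimal sketch (the save path is a placeholder) downloads HaGRID and converts it to COCO format in a single call; the returned objects are of `ExternalDataset` type and can be passed directly to `fit()` and `eval()`, as in the training example further below:
+
+```python
+from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner
+
+learner = GestureRecognitionLearner(model_to_use='plus_m_1.5x_416', device='cpu')
+# Download the dataset to ./data and convert the train/val/test splits to COCO format
+train_set, val_set, test_set = learner.preprocess_data(preprocess=True, download=True, verbose=True, save_path='./data')
+```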
+
+#### `GestureRecognitionLearner.fit`
+```python
+GestureRecognitionLearner.fit(self, dataset, val_dataset, logging_path, verbose, logging, seed, local_rank)
+```
+
+This method is used for training the algorithm on a training dataset and validating it on a validation dataset.
+
+Parameters:
+
+- **dataset**: *object*\
+  Object that holds the training dataset of `ExternalDataset` type.
+- **val_dataset** : *object, default=None*\
+  Object that holds the validation dataset of `ExternalDataset` type.
+- **logging_path** : *str, default=''*\
+  Subdirectory of temp_path in which log files and TensorBoard data are saved.
+- **verbose** : *bool, default=True*\
+  Enables verbosity.
+- **logging** : *bool, default=False*\
+  Enables the maximum verbosity and the logger.
+- **seed** : *int, default=123*\
+  Seed for repeatability.
+- **local_rank** : *int, default=1*\
+  Needed if training on multiple machines.
+
+#### `GestureRecognitionLearner.eval`
+```python
+GestureRecognitionLearner.eval(self, dataset, verbose, logging, local_rank)
+```
+
+This method is used to evaluate a trained model on an evaluation dataset.
+It saves a txt log file containing evaluation statistics.
+
+Parameters:
+
+- **dataset** : *object*\
+  Object that holds the evaluation dataset of type `ExternalDataset`.
+- **verbose**: *bool, default=True*\
+  Enables verbosity.
+- **logging**: *bool, default=False*\
+  Enables the maximum verbosity and logger.
+- **local_rank** : *int, default=1*\
+  Needed if evaluating on multiple machines.
+
+#### `GestureRecognitionLearner.infer`
+```python
+GestureRecognitionLearner.infer(self, input, conf_threshold, iou_threshold, nms_max_num)
+```
+
+This method is used to perform gesture recognition (detection) on an image.
+It returns an `engine.target.BoundingBoxList` object, which contains bounding boxes described by their top-left corner and
+their width and height, or an empty list if no detections were made on the input image.
+
+Parameters:
+- **input** : *object*\
+  Image of type `engine.data.Image` to perform inference on.
+- **conf_threshold**: *float, default=0.35*\
+  Specifies the confidence threshold for gesture detection inference.
+  An object is detected if the confidence of the output is higher than the specified threshold.
+- **iou_threshold**: *float, default=0.6*\
+  Specifies the IoU threshold used for NMS during inference.
+- **nms_max_num**: *int, default=100*\
+  Determines the maximum number of bounding boxes that will be retained after NMS.
+
+#### `GestureRecognitionLearner.save`
+```python
+GestureRecognitionLearner.save(self, path, verbose)
+```
+
+This method is used to save a trained model with its metadata.
+Provided with the path, it creates the *path* directory, if it does not already exist.
+Inside this folder, the model is saved as *nanodet_{model_name}.pth* and a metadata file *nanodet_{model_name}.json*.
+If the directory already exists, the *nanodet_{model_name}.pth* and *nanodet_{model_name}.json* files are overwritten.
+If optimization is performed, the optimized model is saved instead.
+
+Parameters:
+
+- **path**: *str, default=None*\
+  Path to save the model to. If None, the learner's `temp_path` or `cfg.save_dir` is used instead.
+- **verbose**: *bool, default=True*\
+  Enables the maximum verbosity and logger.
+
+#### `GestureRecognitionLearner.load`
+```python
+GestureRecognitionLearner.load(self, path, verbose)
+```
+
+This method is used to load a previously saved model from its folder.
+Loads the model from inside the directory of the path provided, using the metadata .json file included.
+If optimization is performed, the optimized model is loaded instead.
+
+Parameters:
+
+- **path**: *str, default=None*\
+  Path of the model to be loaded.
+- **verbose**: *bool, default=True*\
+  Enables the maximum verbosity.
+
+#### `GestureRecognitionLearner.download`
+```python
+GestureRecognitionLearner.download(self, path, verbose, url)
+```
+
+Downloads the pretrained model.
+
+Parameters:
+
+- **path**: *str, default=None*\
+  Specifies the folder where the model will be downloaded. If *None*, the *self.temp_path* directory is used instead.
+- **verbose**: *bool, default=True*\
+  Enables the maximum verbosity.
+- **url**: *str, default=OpenDR FTP URL*\
+  URL of the FTP server.
+
+#### Examples
+
+* **Training example**
+
+  ```python
+  from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner
+
+
+  if __name__ == '__main__':
+      model_save_dir = './save_dir/'
+      data_save_dir = './data/'
+
+      gesture_model = GestureRecognitionLearner(model_to_use='plus_m_1.5x_416', iters=100, lr=1e-3, batch_size=32, checkpoint_after_iter=1, checkpoint_load_iter=0, device="cuda", temp_path=model_save_dir)
+
+      dataset, val_dataset, test_dataset = gesture_model.preprocess_data(preprocess=True, download=True, verbose=True, save_path=data_save_dir)
+
+      gesture_model.fit(dataset, val_dataset, logging_path='./logs', logging=True)
+      gesture_model.save()
+
+  ```
+
+* **Inference and result drawing example on a test image**
+
+  This example shows how to perform inference on an image and draw the resulting bounding boxes.
+
+  ```python
+  from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner
+  from opendr.engine.data import Image
+  from opendr.perception.object_detection_2d import draw_bounding_boxes
+
+  if __name__ == '__main__':
+      gesture_model = GestureRecognitionLearner(model_to_use='plus_m_1.5x_416')
+      gesture_model.download("./")
+      gesture_model.load("./nanodet_plus_m_1.5x_416", verbose=True)
+      img = Image.open("./test_image.jpg")
+      boxes = gesture_model.infer(input=img)
+
+      draw_bounding_boxes(img.opencv(), boxes, class_names=gesture_model.classes, show=True)
+  ```
+
diff --git a/docs/reference/index.md b/docs/reference/index.md
index 300a9693c2..971e5118e2 100644
--- a/docs/reference/index.md
+++ b/docs/reference/index.md
@@ -32,6 +32,8 @@ Neither the copyright holder nor any applicable licensor will be liable for any
   - pose estimation:
     - [lightweight_open_pose Module](lightweight-open-pose.md)
     - [high_resolution_pose_estimation Module](high-resolution-pose-estimation.md)
+  - gesture recognition:
+    - [gesture_recognition Module](gesture-recognition-learner.md)
   - activity recognition:
     - [skeleton-based action recognition](skeleton-based-action-recognition.md)
     - [continual skeleton-based action recognition Module](skeleton-based-action-recognition.md#class-costgcnlearner)
diff --git a/projects/opendr_ws/README.md b/projects/opendr_ws/README.md
index 9a44e25852..fc987e3f65 100644
--- a/projects/opendr_ws/README.md
+++ b/projects/opendr_ws/README.md
@@ -82,6 +82,7 @@ Currently, apart from tools, opendr_ws contains the following ROS nodes (categor
14. [Landmark-based Facial Expression Recognition](src/opendr_perception/README.md#landmark-based-facial-expression-recognition-ros-node)
15. 
[Skeleton-based Human Action Recognition](src/opendr_perception/README.md#skeleton-based-human-action-recognition-ros-nodes) 16. [Video Human Activity Recognition](src/opendr_perception/README.md#video-human-activity-recognition-ros-node) +17. [RGB Hand Gesture Recognition](src/opendr_perception/README.md#rgb-gesture-recognition-ros-node) ## RGB + Infrared input 1. [End-to-End Multi-Modal Object Detection (GEM)](src/opendr_perception/README.md#2d-object-detection-gem-ros-node) diff --git a/projects/opendr_ws/src/opendr_perception/README.md b/projects/opendr_ws/src/opendr_perception/README.md index c194781e78..45ff8eeed2 100644 --- a/projects/opendr_ws/src/opendr_perception/README.md +++ b/projects/opendr_ws/src/opendr_perception/README.md @@ -746,6 +746,31 @@ The node makes use of the toolkit's video human activity recognition tools which You can find the corresponding IDs regarding activity recognition [here](https://github.com/opendr-eu/opendr/blob/master/src/opendr/perception/activity_recognition/datasets/kinetics400_classes.csv). +### RGB Gesture Recognition ROS Node + +For gesture recognition, the ROS [node](./scripts/gesture_recognition_node.py) is based on the gesture recognition learner defined [here](../../../../src/opendr/perception/gesture_recognition/gesture_recognition_learner.py), and the documentation of the learner can be found [here](../../../../docs/reference/gesture-recognition-learner.md). + +#### Instructions for basic usage: + +1. Start the node responsible for publishing images. If you have a USB camera, then you can use the `usb_cam_node` as explained in the [prerequisites above](#prerequisites). + +2. Start the gesture recognition node: + ```shell + rosrun opendr_perception gesture_recognition_node.py + ``` + The following arguments are available: + - `-i or --input_rgb_image_topic INPUT_RGB_IMAGE_TOPIC`: topic name for input RGB image (default=`/usb_cam/image_raw`) + - `-o or --output_rgb_image_topic OUTPUT_RGB_IMAGE_TOPIC`: topic name for output annotated RGB image (default=`/opendr/image_gesture_annotated`) + - `-d or --detections_topic DETECTIONS_TOPIC`: topic name for detection messages (default=`/opendr/gestures`) + - `--performance_topic PERFORMANCE_TOPIC`: topic name for performance messages (default=`None`, disabled) + - `--device DEVICE`: Device to use, either `cpu` or `cuda`, falls back to `cpu` if GPU or CUDA is not found (default=`cuda`) + - `--threshold THRESHOLD`: Confidence threshold for predictions (default=0.5) + - `--model MODEL`: Config file name of the model that will be used (default=`plus_m_1.5x_416)` + +3. Default output topics: + - Output images: `/opendr/image_gesture_annotated` + - Detection messages: `/opendr/gestures` + ## RGB + Infrared input ### 2D Object Detection GEM ROS Node diff --git a/projects/opendr_ws/src/opendr_perception/scripts/gesture_recognition_node.py b/projects/opendr_ws/src/opendr_perception/scripts/gesture_recognition_node.py new file mode 100755 index 0000000000..09ffb2ffcc --- /dev/null +++ b/projects/opendr_ws/src/opendr_perception/scripts/gesture_recognition_node.py @@ -0,0 +1,160 @@ +#!/usr/bin/env python3 +# Copyright 2020-2023 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import torch +from time import perf_counter +import rospy +from vision_msgs.msg import Detection2DArray +from sensor_msgs.msg import Image as ROS_Image +from std_msgs.msg import Float32 +from opendr_bridge import ROSBridge + +from opendr.engine.data import Image +from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner +from opendr.perception.object_detection_2d import draw_bounding_boxes + + +class GestureRecognitionNode: + + def __init__(self, input_rgb_image_topic="/usb_cam/image_raw", + output_rgb_image_topic="/opendr/image_gesture_annotated", detections_topic="/opendr/gestures", + performance_topic=None, device="cuda", model="plus_m_1.5x_416", threshold=0.5): + """ + :param input_rgb_image_topic: Topic from which we are reading the input image + :type input_rgb_image_topic: str + :param output_rgb_image_topic: Topic to which we are publishing the annotated image (if None, no annotated + image is published) + :type output_rgb_image_topic: str + :param detections_topic: Topic to which we are publishing the annotations (if None, no object detection message + is published) + :type detections_topic: str + :param performance_topic: Topic to which performance information is published + :type performance_topic: str + :param device: device on which we are running inference ('cpu' or 'cuda') + :type device: str + :param model: the name of the model of which we want to load the config file + :type model: str + """ + self.input_rgb_image_topic = input_rgb_image_topic + + if output_rgb_image_topic is not None: + self.image_publisher = rospy.Publisher(output_rgb_image_topic, ROS_Image, queue_size=1) + else: + self.image_publisher = None + + if detections_topic is not None: + self.object_publisher = rospy.Publisher(detections_topic, Detection2DArray, queue_size=1) + else: + self.object_publisher = None + + if performance_topic is not None: + self.performance_publisher = rospy.Publisher(performance_topic, Float32, queue_size=1) + else: + self.performance_publisher = None + + self.bridge = ROSBridge() + + # Initialize the gesture model + self.gesture_model = GestureRecognitionLearner(model_to_use=model, device=device) + self.gesture_model.download(path=".", verbose=True) + self.gesture_model.load("./nanodet_{}".format(model)) + + self.threshold = threshold + + def listen(self): + """ + Start the node and begin processing input data. + """ + rospy.init_node('opendr_gesture_recognition_node', anonymous=True) + rospy.Subscriber(self.input_rgb_image_topic, ROS_Image, self.callback, queue_size=1, buff_size=10000000) + rospy.loginfo("Gesture recognition node started.") + rospy.spin() + + def callback(self, data): + """ + Callback that processes the input data and publishes to the corresponding topics. 
+ :param data: input message + :type data: sensor_msgs.msg.Image + """ + if self.performance_publisher: + start_time = perf_counter() + # Convert sensor_msgs.msg.Image into OpenDR Image + image = self.bridge.from_ros_image(data, encoding='bgr8') + + # Run object detection + boxes = self.gesture_model.infer(image, conf_threshold=self.threshold, nms_max_num=2) + # Get an OpenCV image back + image = image.opencv() + + # Publish detections in ROS message + ros_boxes = self.bridge.to_ros_bounding_box_list(boxes) # Convert to ROS boxes + + if self.performance_publisher is not None: + end_time = perf_counter() + fps = 1.0 / (end_time - start_time) + fps_msg = Float32() + fps_msg.data = fps + self.performance_publisher.publish(fps_msg) + + if self.object_publisher is not None: + self.object_publisher.publish(ros_boxes) + + if self.image_publisher is not None: + # Annotate image with object detection boxes + image = draw_bounding_boxes(image, boxes, class_names=self.gesture_model.classes) + # Convert the annotated OpenDR image to ROS2 image message using bridge and publish it + self.image_publisher.publish(self.bridge.to_ros_image(Image(image), encoding='bgr8')) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input_rgb_image_topic", help="Topic name for input rgb image", + type=str, default="/usb_cam/image_raw") + parser.add_argument("-o", "--output_rgb_image_topic", help="Topic name for output annotated rgb image", + type=lambda value: value if value.lower() != "none" else None, + default="/opendr/image_gesture_annotated") + parser.add_argument("-d", "--detections_topic", help="Topic name for detection messages", + type=lambda value: value if value.lower() != "none" else None, + default="/opendr/gestures") + parser.add_argument("--performance_topic", help="Topic name for performance messages", type=str, default=None) + parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cpu", choices=["cuda", "cpu"]) + parser.add_argument("--threshold", help="Confidence threshold for prediction", type=float, default=0.5) + parser.add_argument("--model", help="Model that config file will be used", type=str, default="plus_m_1.5x_416") + args = parser.parse_args() + + try: + if args.device == "cuda" and torch.cuda.is_available(): + device = "cuda" + elif args.device == "cuda": + print("GPU not found. Using CPU instead.") + device = "cpu" + else: + print("Using CPU.") + device = "cpu" + except: + print("Using CPU.") + device = "cpu" + + gesture_recognition_node = GestureRecognitionNode(device=device, model=args.model, + input_rgb_image_topic=args.input_rgb_image_topic, + output_rgb_image_topic=args.output_rgb_image_topic, + detections_topic=args.detections_topic, + performance_topic=args.performance_topic, threshold=args.threshold) + gesture_recognition_node.listen() + + +if __name__ == '__main__': + main() diff --git a/projects/opendr_ws_2/README.md b/projects/opendr_ws_2/README.md index 3870394fb3..999f6b6ce7 100644 --- a/projects/opendr_ws_2/README.md +++ b/projects/opendr_ws_2/README.md @@ -75,6 +75,7 @@ Currently, apart from tools, opendr_ws_2 contains the following ROS2 nodes (cate 14. [Landmark-based Facial Expression Recognition](src/opendr_perception/README.md#landmark-based-facial-expression-recognition-ros2-node) 15. [Skeleton-based Human Action Recognition](src/opendr_perception/README.md#skeleton-based-human-action-recognition-ros2-nodes) 16. 
[Video Human Activity Recognition](src/opendr_perception/README.md#video-human-activity-recognition-ros2-node) +17. [RGB Hand Gesture Recognition](src/opendr_perception/README.md#rgb-gesture-recognition-ros2-node) ## RGB + Infrared input 1. [End-to-End Multi-Modal Object Detection (GEM)](src/opendr_perception/README.md#2d-object-detection-gem-ros2-node) diff --git a/projects/opendr_ws_2/src/opendr_bridge/opendr_bridge/bridge.py b/projects/opendr_ws_2/src/opendr_bridge/opendr_bridge/bridge.py index 58db1e8942..74254cfad0 100644 --- a/projects/opendr_ws_2/src/opendr_bridge/opendr_bridge/bridge.py +++ b/projects/opendr_ws_2/src/opendr_bridge/opendr_bridge/bridge.py @@ -167,8 +167,14 @@ def to_ros_boxes(self, box_list): ros_box.bbox = BoundingBox2D() ros_box.results.append(ObjectHypothesisWithPose()) ros_box.bbox.center = Pose2D() - ros_box.bbox.center.x = box.left + box.width / 2. - ros_box.bbox.center.y = box.top + box.height / 2. + try: + ros_box.bbox.center.x = box.left + box.width / 2. + except: + ros_box.bbox.center.x = float(box.left + box.width / 2.) + try: + ros_box.bbox.center.y = box.top + box.height / 2. + except: + ros_box.bbox.center.y = float(box.top + box.height / 2.) ros_box.bbox.size_x = float(box.width) ros_box.bbox.size_y = float(box.height) ros_box.results[0].id = str(box.name) diff --git a/projects/opendr_ws_2/src/opendr_perception/README.md b/projects/opendr_ws_2/src/opendr_perception/README.md index 7c717501bf..b6937d9e4e 100755 --- a/projects/opendr_ws_2/src/opendr_perception/README.md +++ b/projects/opendr_ws_2/src/opendr_perception/README.md @@ -689,7 +689,7 @@ Their documentation can be found [here](../../../../docs/reference/skeleton-base 2. You are then ready to start the skeleton-based human action recognition node: 1. Skeleton-based action recognition node ```shell - ros2 run opendr_perception skeleton_based_action_recognition_node.py + ros2 run opendr_perception skeleton_based_action_recognition ``` The following optional argument is available for the skeleton-based action recognition node: - `--model` MODEL: model to use, options are `stgcn` or `pstgcn`, (default=`stgcn`) @@ -698,7 +698,7 @@ Their documentation can be found [here](../../../../docs/reference/skeleton-base 2. Continual skeleton-based action recognition node ```shell - ros2 run opendr_perception continual_skeleton_based_action_recognition_node.py + ros2 run opendr_perception continual_skeleton_based_action_recognition ``` The following optional argument is available for the continual skeleton-based action recognition node: - `--model` MODEL: model to use, options are `costgcn`, (default=`costgcn`) @@ -758,6 +758,31 @@ The node makes use of the toolkit's video human activity recognition tools which You can find the corresponding IDs regarding activity recognition [here](https://github.com/opendr-eu/opendr/blob/master/src/opendr/perception/activity_recognition/datasets/kinetics400_classes.csv). +### RGB Gesture Recognition ROS2 Node + +For gesture recognition, the ROS2 [node](./opendr_perception/gesture_recognition_node.py) is based on the gesture recognition learner defined [here](../../../../src/opendr/perception/gesture_recognition/gesture_recognition_learner.py), and the documentation of the learner can be found [here](../../../../docs/reference/gesture-recognition-learner.md). + +#### Instructions for basic usage: + +1. Start the node responsible for publishing images. If you have a USB camera, then you can use the `usb_cam_node` as explained in the [prerequisites above](#prerequisites). + +2. 
Start the gesture recognition node: + ```shell + ros2 run opendr_perception gesture_recognition + ``` + The following arguments are available: + - `-i or --input_rgb_image_topic INPUT_RGB_IMAGE_TOPIC`: topic name for input RGB image (default=`/usb_cam/image_raw`) + - `-o or --output_rgb_image_topic OUTPUT_RGB_IMAGE_TOPIC`: topic name for output annotated RGB image (default=`/opendr/image_gesture_annotated`) + - `-d or --detections_topic DETECTIONS_TOPIC`: topic name for detection messages (default=`/opendr/gestures`) + - `--performance_topic PERFORMANCE_TOPIC`: topic name for performance messages (default=`None`, disabled) + - `--device DEVICE`: Device to use, either `cpu` or `cuda`, falls back to `cpu` if GPU or CUDA is not found (default=`cuda`) + - `--threshold THRESHOLD`: Confidence threshold for predictions (default=0.5) + - `--model MODEL`: Config file name of the model that will be used (default=`plus_m_1.5x_416)` + +3. Default output topics: + - Output images: `/opendr/image_gesture_annotated` + - Detection messages: `/opendr/gestures` + ## RGB + Infrared input ### 2D Object Detection GEM ROS2 Node diff --git a/projects/opendr_ws_2/src/opendr_perception/opendr_perception/gesture_recognition_node.py b/projects/opendr_ws_2/src/opendr_perception/opendr_perception/gesture_recognition_node.py new file mode 100755 index 0000000000..a9d9910d6c --- /dev/null +++ b/projects/opendr_ws_2/src/opendr_perception/opendr_perception/gesture_recognition_node.py @@ -0,0 +1,167 @@ +#!/usr/bin/env python +# Copyright 2020-2023 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import torch +from time import perf_counter + +import rclpy +from rclpy.node import Node + +from std_msgs.msg import Float32 +from sensor_msgs.msg import Image as ROS_Image +from vision_msgs.msg import Detection2DArray +from opendr_bridge import ROS2Bridge + +from opendr.engine.data import Image +from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner +from opendr.perception.object_detection_2d import draw_bounding_boxes + + +class GestureRecognitionNode(Node): + + def __init__(self, input_rgb_image_topic="image_raw", output_rgb_image_topic="/opendr/image_gesture_annotated", + detections_topic="/opendr/gestures", performance_topic=None, device="cuda", + model="plus_m_1.5x_416", threshold=0.5): + """ + Creates a ROS2 Node for gesture recognition. 
+ :param input_rgb_image_topic: Topic from which we are reading the input image + :type input_rgb_image_topic: str + :param output_rgb_image_topic: Topic to which we are publishing the annotated image (if None, no annotated + image is published) + :type output_rgb_image_topic: str + :param detections_topic: Topic to which we are publishing the predictions (if None, no object detection message + is published) + :type detections_topic: str + :param performance_topic: Topic to which we are publishing performance information (if None, no performance + message is published) + :type performance_topic: str + :param device: device on which we are running inference ('cpu' or 'cuda') + :type device: str + :param model: the name of the model of which we want to load the config file + :type model: str + """ + super().__init__('gesture_recognition_node') + + self.image_subscriber = self.create_subscription(ROS_Image, input_rgb_image_topic, self.callback, 1) + + if output_rgb_image_topic is not None: + self.image_publisher = self.create_publisher(ROS_Image, output_rgb_image_topic, 1) + else: + self.image_publisher = None + + if detections_topic is not None: + self.object_publisher = self.create_publisher(Detection2DArray, detections_topic, 1) + else: + self.object_publisher = None + + if performance_topic is not None: + self.performance_publisher = self.create_publisher(Float32, performance_topic, 1) + else: + self.performance_publisher = None + + self.bridge = ROS2Bridge() + + # Initialize the object detector + self.gesture_model = GestureRecognitionLearner(model_to_use=model, device=device) + self.gesture_model.download(path=".", verbose=True) + self.gesture_model.load("./nanodet_{}".format(model)) + self.threshold = threshold + self.get_logger().info("Gesture recognition node initialized.") + + def callback(self, data): + """ + Callback that processes the input data and publishes to the corresponding topics. 
+        :param data: input message
+        :type data: sensor_msgs.msg.Image
+        """
+        if self.performance_publisher:
+            start_time = perf_counter()
+        # Convert sensor_msgs.msg.Image into OpenDR Image
+        image = self.bridge.from_ros_image(data, encoding='bgr8')
+
+        # Run gesture recognition
+        boxes = self.gesture_model.infer(image, conf_threshold=self.threshold)
+
+        if self.performance_publisher:
+            end_time = perf_counter()
+            fps = 1.0 / (end_time - start_time)  # NOQA
+            fps_msg = Float32()
+            fps_msg.data = fps
+            self.performance_publisher.publish(fps_msg)
+
+        # Publish gesture detections in ROS message
+        if self.object_publisher is not None:
+            self.object_publisher.publish(self.bridge.to_ros_boxes(boxes))
+
+        if self.image_publisher is not None:
+            # Get an OpenCV image back
+            image = image.opencv()
+            # Annotate image with gesture boxes
+            image = draw_bounding_boxes(image, boxes, class_names=self.gesture_model.classes)
+            # Convert the annotated OpenDR image to ROS2 image message using bridge and publish it
+            self.image_publisher.publish(self.bridge.to_ros_image(Image(image), encoding='bgr8'))
+
+
+def main(args=None):
+    rclpy.init(args=args)
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_rgb_image_topic", help="Topic name for input rgb image",
+                        type=str, default="image_raw")
+    parser.add_argument("-o", "--output_rgb_image_topic", help="Topic name for output annotated rgb image",
+                        type=lambda value: value if value.lower() != "none" else None,
+                        default="/opendr/image_gesture_annotated")
+    parser.add_argument("-d", "--detections_topic", help="Topic name for detection messages",
+                        type=lambda value: value if value.lower() != "none" else None,
+                        default="/opendr/gestures")
+    parser.add_argument("--performance_topic", help="Topic name for performance messages, disabled (None) by default",
+                        type=str, default=None)
+    parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cuda", choices=["cuda", "cpu"])
+    parser.add_argument("--model", help="Model whose config file will be used", type=str, default="plus_m_1.5x_416")
+    parser.add_argument("--threshold", help="Confidence threshold for inference", type=float, default=0.5)
+    args = parser.parse_args()
+
+    try:
+        if args.device == "cuda" and torch.cuda.is_available():
+            device = "cuda"
+        elif args.device == "cuda":
+            print("GPU not found. Using CPU instead.")
+            device = "cpu"
+        else:
+            print("Using CPU.")
+            device = "cpu"
+    except Exception:
+        print("Using CPU.")
+        device = "cpu"
+
+    gesture_recognition_node = GestureRecognitionNode(device=device, model=args.model,
+                                                      input_rgb_image_topic=args.input_rgb_image_topic,
+                                                      output_rgb_image_topic=args.output_rgb_image_topic,
+                                                      detections_topic=args.detections_topic,
+                                                      performance_topic=args.performance_topic,
+                                                      threshold=args.threshold)
+
+    rclpy.spin(gesture_recognition_node)
+
+    # Destroy the node explicitly
+    # (optional - otherwise it will be done automatically
+    # when the garbage collector destroys the node object)
+    gesture_recognition_node.destroy_node()
+    rclpy.shutdown()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/projects/opendr_ws_2/src/opendr_perception/setup.py b/projects/opendr_ws_2/src/opendr_perception/setup.py
index 0225d99a62..0e4d11a18b 100644
--- a/projects/opendr_ws_2/src/opendr_perception/setup.py
+++ b/projects/opendr_ws_2/src/opendr_perception/setup.py
@@ -56,6 +56,7 @@
             'binary_high_resolution = opendr_perception.binary_high_resolution_node:main',
             'continual_skeleton_based_action_recognition = \
                 opendr_perception.continual_skeleton_based_action_recognition_node:main',
+            'gesture_recognition = opendr_perception.gesture_recognition_node:main',
             'performance = opendr_perception.performance_node:main',
         ],
     },
diff --git a/projects/python/perception/gesture_recognition/README.md b/projects/python/perception/gesture_recognition/README.md
new file mode 100644
index 0000000000..d9f0e40985
--- /dev/null
+++ b/projects/python/perception/gesture_recognition/README.md
@@ -0,0 +1,11 @@
+# Hand Gesture Recognition
+
+This demo performs hand gesture recognition from a webcam feed.
+The list of gestures is: 'call', 'dislike', 'fist', 'four', 'like', 'mute', 'ok', 'one', 'palm', 'peace', 'peace inv', 'rock', 'stop', 'stop inv', 'three', 'three 2', 'two up', 'two up inv', 'no gesture'; examples can be seen [here](https://github.com/hukenovs/hagrid/tree/master).
+By default a single person (two hands) is assumed, but this can be changed through the `--max-hands` argument.
+The model is trained on images captured at a distance of 1-4 meters from the camera, so similar distances are expected to work best.
+
+The demo can be run as follows:
+```bash
+python3 webcam_demo.py --max-hands 2
+```
diff --git a/projects/python/perception/gesture_recognition/webcam_demo.py b/projects/python/perception/gesture_recognition/webcam_demo.py
new file mode 100644
index 0000000000..390b76c067
--- /dev/null
+++ b/projects/python/perception/gesture_recognition/webcam_demo.py
@@ -0,0 +1,92 @@
+# Copyright 2020-2023 OpenDR European Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
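+
+# Webcam demo: reads frames from the first available camera, runs the NanoDet-based gesture
+# recognition model on each frame and displays the annotated frames together with a running FPS estimate.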
+ +import argparse + +import cv2 +import time +from opendr.engine.data import Image +from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner +from opendr.perception.object_detection_2d import draw_bounding_boxes + + +class VideoReader(object): + def __init__(self, file_name): + self.file_name = file_name + self.cap = cv2.VideoCapture(self.file_name) + if not self.cap.isOpened(): + raise IOError('Video {} cannot be opened'.format(self.file_name)) + try: # OpenCV needs int to read from webcam + self.file_name = int(file_name) + except ValueError: + pass + + def __iter__(self): + if not self.cap.isOpened(): + raise IOError('Video {} cannot be opened'.format(self.file_name)) + return self + + def __next__(self): + was_read, img = self.cap.read() + if not was_read: + raise StopIteration + return img + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument("--device", help="Device to use (cpu, cuda)", type=str, default="cpu", choices=["cuda", "cpu"]) + parser.add_argument("--model", help="Model for which a config file will be used", type=str, default="plus_m_1.5x_416") + parser.add_argument("--max-hands", type=int, default=2) + args = parser.parse_args() + + device, model = args.device, args.model + + gesture_model = GestureRecognitionLearner(model_to_use=model, device=device) + gesture_model.download("./predefined_examples") + gesture_model.load("./predefined_examples/nanodet_{}".format(args.model), verbose=True) + + # Use the first camera available on the system + image_provider = VideoReader(0) + + while True: + counter, avg_fps = 0, 0 + for img in image_provider: + + img = Image(img) + + start_time = time.perf_counter() + + # Perform inference + boxes = gesture_model.infer(img, conf_threshold=0.35, iou_threshold=0.6, nms_max_num=args.max_hands) + end_time = time.perf_counter() + fps = 1.0 / (end_time - start_time) + + # Calculate a running average on FPS + avg_fps = 0.8 * fps + 0.2 * avg_fps + + img = img.opencv() + + if boxes: + draw_bounding_boxes(img, boxes, class_names=gesture_model.classes, line_thickness=3) + + # Wait a few frames for FPS to stabilize + if counter < 5: + counter += 1 + else: + img = cv2.putText(img, "FPS: %.2f" % (avg_fps,), (50, 50), cv2.FONT_HERSHEY_SIMPLEX, + 1, (255, 0, 0), 2, cv2.LINE_AA) + + cv2.imshow('Result', img) + cv2.waitKey(1) diff --git a/src/opendr/perception/gesture_recognition/algorithm/config/nanodet_plus_m_1.5x_416.yml b/src/opendr/perception/gesture_recognition/algorithm/config/nanodet_plus_m_1.5x_416.yml new file mode 100644 index 0000000000..5b767d17ac --- /dev/null +++ b/src/opendr/perception/gesture_recognition/algorithm/config/nanodet_plus_m_1.5x_416.yml @@ -0,0 +1,111 @@ +# nanodet-plus-m-1.5x_416 +# Hagrid +# mAP: 0.8259552204046907 +# AP_50: 0.9856925561494753 +# AP_75: 0.9544670343699412 +# AP_small: 0.8283787128712872 +# AP_m: 0.6140567858343431 +# AP_l: 0.8365065829188417 +save_dir: ./temp/hagrid-nanodet_plus_m_1.5x_416 +check_point_name: plus_m_1.5x_416 +model: + arch: + name: NanoDetPlus + detach_epoch: 10 + backbone: + name: ShuffleNetV2 + model_size: 1.5x + out_stages: [2,3,4] + activation: LeakyReLU + fpn: + name: GhostPAN + in_channels: [176, 352, 704] + out_channels: 128 + kernel_size: 5 + num_extra_level: 1 + use_depthwise: True + activation: LeakyReLU + head: + name: NanoDetPlusHead + num_classes: 19 + input_channel: 128 + feat_channels: 128 + stacked_convs: 2 + kernel_size: 5 + strides: [8, 16, 32, 64] + activation: LeakyReLU + reg_max: 7 + 
norm_cfg: + type: BN + loss: + loss_qfl: + name: QualityFocalLoss + use_sigmoid: True + beta: 2.0 + loss_weight: 1.0 + loss_dfl: + name: DistributionFocalLoss + loss_weight: 0.25 + loss_bbox: + name: GIoULoss + loss_weight: 2.0 + # Auxiliary head, only use in training time. + aux_head: + name: SimpleConvHead + num_classes: 19 + input_channel: 256 + feat_channels: 256 + stacked_convs: 4 + strides: [8, 16, 32, 64] + activation: LeakyReLU + reg_max: 7 +data: + train: + input_size: [416,416] + keep_ratio: False + pipeline: + perspective: 0.0 + scale: [0.6, 1.4] + stretch: [[0.8, 1.2], [0.8, 1.2]] + rotation: 0 + shear: 0 + translate: 0.2 + flip: 0.5 + brightness: 0.2 + contrast: [0.6, 1.4] + saturation: [0.5, 1.2] + normalize: [[0,0,0],[1,1,1]] + val: + input_size: [416,416] + keep_ratio: False + pipeline: + normalize: [[0,0,0],[1,1,1]] +device: + gpu_ids: [0] + workers_per_gpu: 4 + batchsize_per_gpu: 96 +schedule: + resume: 0 + optimizer: + name: AdamW + lr: 0.001 + weight_decay: 0.05 + warmup: + name: linear + steps: 500 + ratio: 0.0001 + total_epochs: 300 + lr_schedule: + name: CosineAnnealingLR + T_max: 300 + eta_min: 0.00005 + val_intervals: 10 +grad_clip: 35 +evaluator: + name: CocoDetectionEvaluator + save_key: mAP +log: + interval: 50 +class_names: ['call', 'dislike', 'fist', 'four', 'like', 'mute', 'ok', 'one', 'palm', 'peace', 'rock', 'stop', 'stop_inverted', 'three', 'two_up', 'two_up_inverted', 'three2', 'peace_inverted', 'no_gesture'] + + diff --git a/src/opendr/perception/gesture_recognition/algorithm/data/dataset/__init__.py b/src/opendr/perception/gesture_recognition/algorithm/data/dataset/__init__.py new file mode 100644 index 0000000000..637b98e06d --- /dev/null +++ b/src/opendr/perception/gesture_recognition/algorithm/data/dataset/__init__.py @@ -0,0 +1,40 @@ +# Modifications Copyright 2023 - present, OpenDR European Project +# +# Copyright 2021 RangiLyu. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
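+
+# build_dataset (defined below) constructs a NanoDet CocoDataset for the requested split (train/val/test)
+# from a directory containing the COCO-format HaGRID annotations produced by hagrid2coco.py.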
+ + +import copy +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.dataset.coco import CocoDataset + + +def build_dataset(cfg, dataset, class_names, mode, verbose=True, preprocess=True, download=False): + dataset_cfg = copy.deepcopy(cfg) + if verbose: + print("Loading type dataset from {}".format(dataset.path)) + + if mode == "train": + img_path = "{}/train".format(dataset.path) + ann_path = "{}/train.json".format(dataset.path) + elif mode == "val": + img_path = "{}/val".format(dataset.path) + ann_path = "{}/val.json".format(dataset.path) + else: + img_path = "{}/test".format(dataset.path) + ann_path = "{}/test.json".format(dataset.path) + dataset = CocoDataset(img_path=img_path, ann_path=ann_path, mode=mode, **dataset_cfg) + + if verbose: + print("ExternalDataset loaded.") + return dataset diff --git a/src/opendr/perception/gesture_recognition/algorithm/data/dataset/hagrid2coco.py b/src/opendr/perception/gesture_recognition/algorithm/data/dataset/hagrid2coco.py new file mode 100644 index 0000000000..eaab310722 --- /dev/null +++ b/src/opendr/perception/gesture_recognition/algorithm/data/dataset/hagrid2coco.py @@ -0,0 +1,219 @@ +''' +Modifications Copyright 2023 - present, OpenDR European Project + +This code is modified from +https://github.com/hukenovs/hagrid/blob/master/converters/hagrid_to_coco.py +under public license at https://github.com/hukenovs/hagrid/blob/master/license/en_us.pdf +''' + +import json +import logging +import os +from typing import Tuple + +import numpy as np +import pandas as pd +from PIL import Image +from tqdm import tqdm + +IMAGES = (".jpeg", ".jpg", ".jp2", ".png", ".tiff", ".jfif", ".bmp", ".webp", ".heic") + +tqdm.pandas() + +logging.getLogger().setLevel(logging.INFO) + + +def get_area(bboxes: list) -> list: + """ + Get area of bboxes + Parameters + ---------- + bboxes: list + list of bboxes + + Returns + ------- + list + """ + bboxes = np.array(bboxes) + area = bboxes[:, 2] * bboxes[:, 3] + return area + + +def get_w_h(img_path: str) -> Tuple[int, int]: + """ + Get width and height of image + Parameters + ---------- + img_path: str + path to image + + Returns + ------- + Tuple[int, int] + """ + img = Image.open(img_path) + img_w, img_h = img.size + return img_w, img_h + + +def get_abs_bboxes(bboxes: list, im_size: tuple) -> list: + """ + Get absolute bboxes in format [xmin, ymin, w, h] + Parameters + ---------- + bboxes: list + list of bboxes + im_size: tuple + image size + + Returns + ------- + list + """ + width, height = im_size + bboxes_out = [] + for box in bboxes: + x1, y1, w, h = box + bbox_abs = [x1 * width, y1 * height, w * width, h * height] + bboxes_out.append(bbox_abs) + return bboxes_out + + +def get_poly(bboxes: list) -> list: + """ + Get polygon from bboxes + Parameters + ---------- + bboxes: list + list of bboxes + + Returns + ------- + list + """ + poly = [] + for box in bboxes: + xmin, ymin, w, h = box + xmax = xmin + w + ymax = ymin + h + poly.append([xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]) + + return poly + + +def get_files_from_dir(pth: str, extns: Tuple) -> list: + """ + Get files from directory + Parameters + ---------- + pth: str + path to directory + extns: Tuple + extensions of files + + Returns + ------- + list + """ + if not os.path.exists(pth): + logging.error(f"Dataset directory doesn't exist {pth}") + return [] + files = [f for f in os.listdir(pth) if f.endswith(extns)] + return files + + +def get_dataframe(dataset_annotations, dataset_folder, targets, phase): + annotations_all = None + 
exists_images = [] + + for target in tqdm(targets): + ps_phase = 'train_val' if (phase == 'train' or phase == 'val') else phase + target_json = os.path.join(dataset_annotations, f"ann_{ps_phase}", f"{target}.json") + print('target json: ', target_json) + if os.path.exists(target_json): + json_annotation = json.load(open(os.path.join(target_json))) + + json_annotation = [ + dict(annotation, **{"name": f"{name}.jpg"}) + for name, annotation in zip(json_annotation, json_annotation.values()) + ] + + annotation = pd.DataFrame(json_annotation) + + annotation["target"] = target + annotations_all = pd.concat([annotations_all, annotation], ignore_index=True) + exists_images.extend(get_files_from_dir(os.path.join(dataset_folder, phase, target), IMAGES)) + else: + logging.warning(f"Database for {phase}/{target} not found") + + annotations_all["exists"] = annotations_all["name"].isin(exists_images) + annotations = annotations_all[annotations_all["exists"]] + + return annotations + + +def convert_to_coco(out='./hagrid_coco_format', dataset_folder='./data/', dataset_annotations='./data/annotations/'): + targets = ['call', 'dislike', 'fist', 'four', 'like', 'mute', 'ok', 'one', 'palm', 'peace', + 'rock', 'stop', 'stop_inverted', 'three', 'two_up', 'two_up_inverted', + 'three2', 'peace_inverted', 'no_gesture'] + labels = {label: num for (label, num) in zip(targets, range(len(targets)))} + if not os.path.exists(out): + os.makedirs(out) + phases = ['train', 'val', 'test'] + for phase in phases: + + logging.info(f"Run convert {phase}") + logging.info("Create Dataframe") + annotations = get_dataframe(dataset_annotations, dataset_folder, targets, phase) + + logging.info("Create image_path") + annotations["image_path"] = annotations.progress_apply( + lambda row: os.path.join(dataset_folder, phase, row["target"], row["name"]), axis=1 + ) + + logging.info("Create width, height") + w_h = annotations["image_path"].progress_apply(lambda x: get_w_h(x)) + annotations["width"] = np.array(w_h.to_list())[:, 0] + annotations["height"] = np.array(w_h.to_list())[:, 1] + + logging.info("Create id") + annotations["id"] = annotations.index + + logging.info("Create abs_bboxes") + annotations["abs_bboxes"] = annotations.progress_apply( + lambda row: get_abs_bboxes(row["bboxes"], (row["width"], row["height"])), axis=1 + ) + logging.info("Create area") + annotations["area"] = annotations["abs_bboxes"].progress_apply(lambda bboxes: get_area(bboxes)) + logging.info("Create segmentation") + annotations["segmentation"] = annotations["abs_bboxes"].progress_apply(lambda bboxes: get_poly(bboxes)) + logging.info("Create category_id") + annotations["category_id"] = annotations["labels"].progress_apply(lambda x: [labels[label] for label in x]) + + categories = [{"supercategory": "none", "name": k, "id": v} for k, v in labels.items()] + logging.info(f"Save to {phase}.json") + res_file = {"categories": categories, "images": [], "annotations": []} + annot_count = 0 + for index, row in tqdm(annotations.iterrows()): + img_elem = {"file_name": row["image_path"], "height": row["height"], "width": row["width"], "id": row["id"]} + + res_file["images"].append(img_elem) + + num_boxes = len(row["bboxes"]) + for i in range(num_boxes): + annot_elem = { + "id": annot_count, + "bbox": row["abs_bboxes"][i], + "segmentation": [row["segmentation"][i]], + "image_id": row["id"], + "category_id": row["category_id"][i], + "iscrowd": 0, + "area": row["area"][i], + } + res_file["annotations"].append(annot_elem) + annot_count += 1 + + with 
open(f"{out}/{phase}.json", "w") as f: + json_str = json.dumps(res_file) + f.write(json_str) diff --git a/src/opendr/perception/gesture_recognition/dependencies.ini b/src/opendr/perception/gesture_recognition/dependencies.ini new file mode 100644 index 0000000000..aac7b3d043 --- /dev/null +++ b/src/opendr/perception/gesture_recognition/dependencies.ini @@ -0,0 +1,18 @@ +[runtime] +# 'python' key expects a value using the Python requirements file format +# https://pip.pypa.io/en/stable/reference/pip_install/#requirements-file-format +python=torch>=1.9.0 + pytorch-lightning==1.2.3 + protobuf<=3.20.0 + omegaconf>=2.0.1 + torchvision + numpy<=1.23.5 + opencv-python + pycocotools + Cython + onnx + pyaml + tabulate + tensorboard + +opendr=opendr-toolkit-engine diff --git a/src/opendr/perception/gesture_recognition/gesture_recognition_learner.py b/src/opendr/perception/gesture_recognition/gesture_recognition_learner.py new file mode 100644 index 0000000000..bb967a417e --- /dev/null +++ b/src/opendr/perception/gesture_recognition/gesture_recognition_learner.py @@ -0,0 +1,370 @@ +# Copyright 2020-2023 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import datetime +import json +import zipfile +from pathlib import Path +import random +import pytorch_lightning as pl +import torch +from pytorch_lightning.callbacks import ProgressBar + +from opendr.perception.object_detection_2d.nanodet.nanodet_learner import NanodetLearner +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.model.arch import build_model +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.data.collate import naive_collate + +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.evaluator import build_evaluator +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.trainer.task import TrainingTask + +from opendr.perception.gesture_recognition.algorithm.data.dataset import build_dataset +from opendr.perception.gesture_recognition.algorithm.data.dataset.hagrid2coco import convert_to_coco + +from opendr.engine.constants import OPENDR_SERVER_URL +from opendr.engine.datasets import ExternalDataset +from urllib.request import urlretrieve + +from opendr.perception.object_detection_2d.nanodet.algorithm.nanodet.util import ( + cfg, + load_config, + mkdir, + NanoDetLightningLogger, +) +_MODEL_NAMES = {"plus_m_1.5x_416"} + + +class GestureRecognitionLearner(NanodetLearner): + def __init__(self, **kwargs): + + super(GestureRecognitionLearner, self).__init__(**kwargs) + + self.model = build_model(self.cfg.model) + + def preprocess_data(self, preprocess=True, download=False, verbose=True, save_path='./data/'): + if download: + if verbose: + print('Downloading hagrid dataset....') + main_url = "https://n-usr-2uzac.s3pd02.sbercloud.ru/b-usr-2uzac-mv4/hagrid/" + test_urls = {"test": f"{main_url}test.zip", + "ann_train_val": f"{main_url}ann_train_val.zip", "ann_test": f"{main_url}ann_test.zip"} + + gestures = ["call", "dislike", "fist", "four", "like", 
"mute", "ok", + "one", "palm", "peace_inverted", "peace", "rock", "stop_inverted", + "stop", "three", "three2", "two_up_inverted", "two_up"] + if verbose: + print('Downloading annotations....') + save_path_test = os.path.join(save_path, 'test') + os.makedirs(save_path_test, exist_ok=True) + + os.system(f"wget {test_urls['ann_test']} -O {save_path}/ann_test.zip") + os.system(f"wget {test_urls['ann_train_val']} -O {save_path}/ann_train_val.zip") + + with zipfile.ZipFile(os.path.join(save_path, "ann_test.zip"), 'r') as zip_ref: + zip_ref.extractall(save_path) + os.remove(os.path.join(save_path, "ann_test.zip")) + with zipfile.ZipFile(os.path.join(save_path, "ann_train_val.zip"), 'r') as zip_ref: + zip_ref.extractall(save_path) + os.remove(os.path.join(save_path, "ann_train_val.zip")) + + if verbose: + print('Downloading test data....') + os.system(f"wget {test_urls['test']} -O {save_path_test}/test.zip") + with zipfile.ZipFile(os.path.join(save_path_test, "test.zip"), 'r') as zip_ref: + zip_ref.extractall(save_path_test) + os.remove(os.path.join(save_path_test, "test.zip")) + + save_train = os.path.join(save_path, 'train') + os.makedirs(save_train, exist_ok=True) + + save_val = os.path.join(save_path, 'val') + os.makedirs(save_val, exist_ok=True) + for target in gestures: + if verbose: + print('Downloading {} class....'.format(target)) + target_url = main_url+"train_val_{}.zip".format(target) + os.system(f"wget {target_url} -O {save_train}/{target}.zip") + with zipfile.ZipFile(os.path.join(save_train, "{}.zip".format(target)), 'r') as zip_ref: + zip_ref.extractall(os.path.join(save_train, target)) + n_val = int(0.2*len(os.listdir(os.path.join(save_train, target)))) + filenames = os.listdir(os.path.join(save_train, target)) + random.shuffle(filenames) + val_files = filenames[:n_val] + os.makedirs(os.path.join(save_val, target), exist_ok=True) + for filename in val_files: + os.rename(os.path.join(save_train, target, filename), os.path.join(save_val, target, filename)) + + os.remove(os.path.join(save_train, "{}.zip".format(target))) + if preprocess: + convert_to_coco(out=save_path, dataset_folder=save_path, dataset_annotations=save_path) + dataset = ExternalDataset(save_path, 'coco') + val_dataset = ExternalDataset(save_path, 'coco') + test_dataset = ExternalDataset(save_path, 'coco') + return dataset, val_dataset, test_dataset + + def fit(self, dataset, val_dataset, logging_path='', verbose=True, logging=False, seed=123, local_rank=1): + """ + This method is used to train the gesture recognition model. 
+        :param dataset: training data
+        :type dataset: ExternalDataset
+        :param val_dataset: validation data
+        :type val_dataset: ExternalDataset
+        :param logging_path: subdirectory in temp_path to save logger outputs
+        :type logging_path: str
+        :param verbose: if set to True, additional information is printed to STDOUT
+        :type verbose: bool
+        :param logging: if set to True, text and STDOUT logging will be used
+        :type logging: bool
+        :param seed: seed for reproducibility
+        :type seed: int
+        :param local_rank: process rank for distributed training
+        :type local_rank: int
+        """
+
+        mkdir(local_rank, self.cfg.save_dir)
+
+        if logging:
+            self.logger = NanoDetLightningLogger(self.temp_path + "/" + logging_path)
+            self.logger.dump_cfg(self.cfg)
+
+        if seed is not None and seed != '':
+            if logging:
+                self.logger.info("Set random seed to {}".format(seed))
+            pl.seed_everything(seed)
+
+        if logging:
+            self.logger.info("Setting up data...")
+        elif verbose:
+            print("Setting up data...")
+
+        train_dataset = build_dataset(self.cfg.data.train, dataset, self.cfg.class_names, "train")
+        val_dataset = train_dataset if val_dataset is None else \
+            build_dataset(self.cfg.data.val, val_dataset, self.cfg.class_names, "val")
+
+        evaluator = build_evaluator(self.cfg.evaluator, val_dataset)
+
+        train_dataloader = torch.utils.data.DataLoader(
+            train_dataset,
+            batch_size=self.batch_size,
+            shuffle=True,
+            num_workers=self.cfg.device.workers_per_gpu,
+            pin_memory=False,
+            collate_fn=naive_collate,
+            drop_last=True,
+        )
+        val_dataloader = torch.utils.data.DataLoader(
+            val_dataset,
+            batch_size=self.batch_size,
+            shuffle=False,
+            num_workers=self.cfg.device.workers_per_gpu,
+            pin_memory=False,
+            collate_fn=naive_collate,
+            drop_last=False,
+        )
+
+        # Load state dictionary
+        model_resume_path = (
+            os.path.join(self.temp_path, "checkpoints", "model_iter_{}.ckpt".format(self.checkpoint_load_iter))
+            if self.checkpoint_load_iter > 0 else None
+        )
+
+        if logging:
+            self.logger.info("Creating task...")
+        elif verbose:
+            print("Creating task...")
+        self.task = TrainingTask(self.cfg, self.model, evaluator)
+
+        gpu_ids = None
+        accelerator = None
+        if self.device == "cuda":
+            gpu_ids = self.cfg.device.gpu_ids
+            accelerator = None if len(gpu_ids) <= 1 else "ddp"
+
+        trainer = pl.Trainer(
+            default_root_dir=self.temp_path,
+            max_epochs=self.iters,
+            gpus=gpu_ids,
+            check_val_every_n_epoch=self.checkpoint_after_iter,
+            accelerator=accelerator,
+            log_every_n_steps=self.cfg.log.interval,
+            num_sanity_val_steps=0,
+            resume_from_checkpoint=model_resume_path,
+            callbacks=[ProgressBar(refresh_rate=0)],
+            logger=self.logger,
+            benchmark=True,
+            gradient_clip_val=self.cfg.get("grad_clip", 0.0),
+        )
+
+        trainer.fit(self.task, train_dataloader, val_dataloader)
+
+    def eval(self, dataset, verbose=True, logging=False, local_rank=1):
+        """
+        This method performs evaluation on a given dataset and returns the evaluation results.
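+        The results are the metric dictionaries produced by the underlying pytorch_lightning test run.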
+ :param dataset: test data + :type dataset_path: ExternalDataset + :param verbose: if set to True, additional information is printed to STDOUT + :type verbose: bool + :param logging: if set to True, text and STDOUT logging will be used + :type logging: bool + :param local_rank: for distribution learning + :type local_rank: int + """ + + timestr = datetime.datetime.now().__format__("%Y_%m_%d_%H:%M:%S") + save_dir = os.path.join(self.cfg.save_dir, timestr) + mkdir(local_rank, save_dir) + + if logging: + self.logger = NanoDetLightningLogger(save_dir) + + self.cfg.update({"test_mode": "val"}) + + if logging: + self.logger.info("Setting up data...") + elif verbose: + print("Setting up data...") + + val_dataset = build_dataset(self.cfg.data.val, dataset, self.cfg.class_names, "test") + + val_dataloader = torch.utils.data.DataLoader( + val_dataset, + batch_size=self.batch_size, + shuffle=False, + num_workers=self.cfg.device.workers_per_gpu, + pin_memory=False, + collate_fn=naive_collate, + drop_last=False, + ) + evaluator = build_evaluator(self.cfg.evaluator, val_dataset) + + if logging: + self.logger.info("Creating task...") + elif verbose: + print("Creating task...") + + self.task = TrainingTask(self.cfg, self.model, evaluator) + + gpu_ids = None + accelerator = None + if self.device == "cuda": + gpu_ids = self.cfg.device.gpu_ids + accelerator = None if len(gpu_ids) <= 1 else "ddp" + + trainer = pl.Trainer( + default_root_dir=save_dir, + gpus=gpu_ids, + accelerator=accelerator, + log_every_n_steps=self.cfg.log.interval, + num_sanity_val_steps=0, + logger=self.logger, + ) + if self.logger: + self.logger.info("Starting testing...") + elif verbose: + print("Starting testing...") + + test_results = (verbose or logging) + return trainer.test(self.task, val_dataloader, verbose=test_results) + + def download(self, path=None, verbose=True, + url=OPENDR_SERVER_URL + "/perception/gesture_recognition/nanodet/"): + + """ + Downloads model + :param path: folder to which files will be downloaded, if None self.temp_path will be used + :type path: str + :param verbose: if True, additional information is printed on STDOUT + :type verbose: bool + :param url: URL to file location on FTP server + :type url: str + """ + + if path is None: + path = self.temp_path + if not os.path.exists(path): + os.makedirs(path) + + model = self.cfg.check_point_name + + path = os.path.join(path, "nanodet_{}".format(model)) + if not os.path.exists(path): + os.makedirs(path) + + if verbose: + print("Downloading pretrained checkpoint...") + + file_url = os.path.join(url, "nanodet_{}".format(model), + "nanodet_{}.ckpt".format(model)) + if not os.path.exists(os.path.join(path, "nanodet_{}.ckpt".format(model))): + urlretrieve(file_url, os.path.join(path, "nanodet_{}.ckpt".format(model))) + else: + print("Checkpoint already exists.") + + if verbose: + print("Downloading pretrain weights if provided...") + + file_url = os.path.join(url, "nanodet_{}".format(model), + "nanodet_{}.pth".format(model)) + try: + if not os.path.exists(os.path.join(path, "nanodet_{}.pth".format(model))): + urlretrieve(file_url, os.path.join(path, "nanodet_{}.pth".format(model))) + else: + print("Weights file already exists.") + + if verbose: + print("Making metadata...") + metadata = {"model_paths": [], "framework": "pytorch", "format": "pth", "has_data": False, + "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes}, + "optimized": False, "optimizer_info": {}} + + param_filepath = "nanodet_{}.pth".format(model) + 
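+            # Record the downloaded .pth weights file in the metadata JSON saved alongside it.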
metadata["model_paths"].append(param_filepath) + with open(os.path.join(path, "nanodet_{}.json".format(model)), 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=4) + + except: + print("Pretrain weights for this model are not provided!!! \n" + "Only the checkpoint will be download") + + if verbose: + print("Making metadata...") + metadata = {"model_paths": [], "framework": "pytorch", "format": "pth", "has_data": False, + "inference_params": {"input_size": self.cfg.data.val.input_size, "classes": self.classes}, + "optimized": False, "optimizer_info": {}} + + param_filepath = "nanodet_{}.ckpt".format(model) + metadata["model_paths"].append(param_filepath) + with open(os.path.join(path, "nanodet_{}.json".format(model)), 'w', encoding='utf-8') as f: + json.dump(metadata, f, ensure_ascii=False, indent=4) + + def _load_hparam(self, model: str): + """ Load hyperparameters for nanodet models and training configuration + + :parameter model: The name of the model of which we want to load the config file + :type model: str + :return: config with hyperparameters + :rtype: dict + """ + assert ( + model in _MODEL_NAMES + ), f"Invalid model selected. Choose one of {_MODEL_NAMES}." + full_path = list() + path = Path(__file__).parent / "algorithm" / "config" + wanted_file = "nanodet_{}.yml".format(model) + for root, dir, files in os.walk(path): + if wanted_file in files: + full_path.append(os.path.join(root, wanted_file)) + assert (len(full_path) == 1), f"You must have only one nanodet_{model}.yaml file in your config folder" + load_config(cfg, full_path[0]) + return cfg diff --git a/tests/sources/tools/perception/gesture_recognition/__init__.py b/tests/sources/tools/perception/gesture_recognition/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/sources/tools/perception/gesture_recognition/test_gesture_recognition.py b/tests/sources/tools/perception/gesture_recognition/test_gesture_recognition.py new file mode 100644 index 0000000000..b1ec6fa2b3 --- /dev/null +++ b/tests/sources/tools/perception/gesture_recognition/test_gesture_recognition.py @@ -0,0 +1,161 @@ +# Copyright 2020-2023 OpenDR European Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cv2 +import unittest +import gc +import shutil +import os +import numpy as np +from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner +from opendr.engine.datasets import ExternalDataset +import json +import time +device = os.getenv('TEST_DEVICE') if os.getenv('TEST_DEVICE') else 'cpu' + +_DEFAULT_MODEL = "plus_m_1.5x_416" + + +def rmfile(path): + try: + os.remove(path) + except OSError as e: + print("Error: %s - %s." % (e.filename, e.strerror)) + + +def rmdir(_dir): + try: + shutil.rmtree(_dir) + except OSError as e: + print("Error: %s - %s." 
% (e.filename, e.strerror)) + + +def make_dummy_dataset(tmp_dir): + os.makedirs(tmp_dir) + idi = 0 + for split in ['train', 'test', 'val']: + os.makedirs(os.path.join(tmp_dir, split)) + annotations = [] + categories = [] + images = [] + classes = ['call', 'dislike', 'fist', 'four', 'like', 'mute', 'ok', 'one', 'palm', + 'peace', 'rock', 'stop', 'stop_inverted', 'three', 'two_up', + 'two_up_inverted', 'three2', 'peace_inverted', 'no_gesture'] + for i, name in enumerate(classes): + categories.append({'supercategory': 'none', 'name': name, 'id': i}) + dummy_image = np.zeros((416, 416, 3)) + cv2.imwrite(os.path.join(tmp_dir, split, 'image_{}_{}_{}.jpg'.format(name, split, 0)), dummy_image) + dummy_image2 = np.zeros((416, 416, 3)) + cv2.imwrite(os.path.join(tmp_dir, split, 'image_{}_{}_{}.jpg'.format(name, split, 1)), dummy_image2) + images.append({'file_name': 'image_{}_{}_{}.jpg'.format(name, split, 0), 'height': 416, 'width': 416, 'id': idi}) + images.append({'file_name': 'image_{}_{}_{}.jpg'.format(name, split, 1), 'height': 416, 'width': 416, 'id': idi+1}) + annotations.append({'id': idi, 'bbox': [233.8257552, 238.43682560000002, + 118.39368, 145.3367104], + 'segmentation': [[233.8257552, 238.43682560000002, + 352.2194352, 238.43682560000002, + 352.2194352, 383.77353600000004, + 233.8257552, 383.77353600000004]], + 'image_id': idi, 'category_id': i, + 'iscrowd': 0, 'area': 17206.94798335027}) + annotations.append({'id': idi+1, 'bbox': [233.8257552, 238.43682560000002, + 118.39368, 145.3367104], + 'segmentation': [[233.8257552, 238.43682560000002, + 352.2194352, 238.43682560000002, + 352.2194352, 383.77353600000004, + 233.8257552, 383.77353600000004]], + 'image_id': idi+1, 'category_id': i, + 'iscrowd': 0, 'area': 17206.94798335027}) + idi += 2 + temp = {'images': images, 'annotations': annotations, 'categories': categories} + with open(os.path.join(tmp_dir, '{}.json'.format(split)), 'w') as f: + json.dump(temp, f) + + +class TestGestureRecognitionLearner(unittest.TestCase): + + @classmethod + def setUpClass(cls): + print("\n\n**********************************\nTEST GestureRecognition Learner\n" + "**********************************") + + cls.temp_dir = os.path.join(".", 'temp_gesture_'+str(time.time())) + cls.model = GestureRecognitionLearner(model_to_use=_DEFAULT_MODEL, device=device, + temp_path=cls.temp_dir, batch_size=1, + iters=1, checkpoint_after_iter=2, lr=1e-4) + make_dummy_dataset(cls.temp_dir) + + @classmethod + def tearDownClass(cls): + print('Removing temporary directory for Gesture recognition...') + # Clean up downloaded files + rmdir(cls.temp_dir) + + del cls.model + gc.collect() + print('Finished cleaning for Gesture recognition...') + + def test_fit(self): + print('Starting training test for Gesture recognition...') + dataset = ExternalDataset(self.temp_dir, 'coco') + val_dataset = ExternalDataset(self.temp_dir, 'coco') + m = list(self.model._model.parameters())[0].clone().detach().clone().to(device) + self.model.fit(dataset=dataset, val_dataset=val_dataset, verbose=False) + n = list(self.model._model.parameters())[0].clone().detach().clone().to(device) + self.assertFalse(np.array_equal(m, n), + msg="Model parameters did not change after running fit.") + del dataset, m, n + gc.collect() + + rmfile(os.path.join(self.temp_dir, "checkpoints", "model_iter_0.ckpt")) + rmdir(os.path.join(self.temp_dir, "checkpoints")) + + print('Finished training test for Gesture recognition...') + + def test_eval(self): + print('Starting evaluation test for Gesture recognition...') + + 
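+        # Evaluate on the dummy COCO-format dataset created by make_dummy_dataset in setUpClass.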
+        test_dataset = ExternalDataset(self.temp_dir, 'coco')
+        results_dict = self.model.eval(dataset=test_dataset, verbose=False)
+        self.assertNotEqual(len(results_dict), 0,
+                            msg="Eval results dictionary list is empty.")
+
+        del test_dataset, results_dict
+        gc.collect()
+
+        rmfile(os.path.join(self.temp_dir, "eval_results.txt"))
+        print('Finished evaluation test for Gesture recognition...')
+
+    def test_save_load(self):
+        print('Starting save/load test for Gesture recognition...')
+        self.model.save(path=os.path.join(self.temp_dir, "test_model"), verbose=False)
+        starting_param_1 = list(self.model._model.parameters())[0].detach().clone().to(device)
+        self.model.model = None
+        learner2 = GestureRecognitionLearner(model_to_use=_DEFAULT_MODEL,
+                                             device=device, temp_path=self.temp_dir,
+                                             batch_size=1, iters=1,
+                                             checkpoint_after_iter=1, lr=1e-4)
+        learner2.load(path=os.path.join(self.temp_dir, "test_model"), verbose=False)
+        new_param = list(learner2._model.parameters())[0].detach().clone().to(device)
+        self.assertTrue(starting_param_1.allclose(new_param))
+
+        del starting_param_1, new_param
+        # Cleanup
+        rmfile(os.path.join(self.temp_dir, "test_model", "nanodet_{}.json".format(_DEFAULT_MODEL)))
+        rmfile(os.path.join(self.temp_dir, "test_model", "nanodet_{}.pth".format(_DEFAULT_MODEL)))
+        rmdir(os.path.join(self.temp_dir, "test_model"))
+        print('Finished save/load test for Gesture recognition...')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_license.py b/tests/test_license.py
index 98c260d46f..19c0632b46 100644
--- a/tests/test_license.py
+++ b/tests/test_license.py
@@ -110,6 +110,7 @@ def setUp(self):
             'src/opendr/perception/panoptic_segmentation/efficient_lps/algorithm/EfficientLPS',
             'src/opendr/perception/facial_expression_recognition/landmark_based_facial_expression_recognition/algorithm',
             'src/opendr/perception/facial_expression_recognition/image_based_facial_emotion_estimation/algorithm',
+            'src/opendr/perception/gesture_recognition/algorithm',
             'projects/python/perception/facial_expression_recognition/image_based_facial_emotion_estimation',
             'projects/opendr_ws_2/src/opendr_perception/test',
             'projects/opendr_ws_2/src/opendr_ros2_bridge/test',
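
For reference, a minimal end-to-end usage sketch of the learner introduced in this patch. Paths, batch size and epoch counts are illustrative placeholders, not values taken from the PR, and the HaGRID download triggered by `download=True` is very large.

```python
from opendr.perception.gesture_recognition.gesture_recognition_learner import GestureRecognitionLearner

if __name__ == '__main__':
    # Illustrative hyperparameters; tune them for a real training run.
    learner = GestureRecognitionLearner(model_to_use="plus_m_1.5x_416", device="cuda",
                                        temp_path="./gesture_tmp", batch_size=8,
                                        iters=50, checkpoint_after_iter=10, lr=1e-3)

    # Download HaGRID and convert it to COCO format; set download=False
    # if the data is already present under save_path.
    dataset, val_dataset, test_dataset = learner.preprocess_data(preprocess=True, download=True,
                                                                 verbose=True, save_path='./data')

    # Train, then evaluate on the held-out test split.
    learner.fit(dataset=dataset, val_dataset=val_dataset, logging_path="train_logs", verbose=True)
    results = learner.eval(dataset=test_dataset, verbose=True)
    print(results)

    # Alternatively, fetch the pre-trained checkpoint from the OpenDR server.
    learner.download(path="./pretrained", verbose=True)
```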