This repository has been archived by the owner on Oct 31, 2023. It is now read-only.

Add onnx runtime inference on pose model #189

Open · wants to merge 3 commits into base: main
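Summary (from the diff): this PR adds an ONNX Runtime inference path for the 2D pose model. A new `bodymocap/body_bbox_detector_ort.py` runs the exported lightweight-human-pose-estimation network through an `ORTWrapper`; the existing PyTorch detector in `bodymocap/body_bbox_detector.py` drops its hard-coded `.cuda()` calls in favor of device-agnostic placement via alfred's shared `device`; and `.gitignore` is rewritten (it now ignores `*.onnx`, among others).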
Changes from all commits
Binary file added .DS_Store
Binary file not shown.
41 changes: 5 additions & 36 deletions .gitignore
@@ -1,37 +1,6 @@
 *.swp
-*.jpg
-*.png
-smpl/
-tmp/
-pretrained_models/
-.vscode/
-test_result/
-log/
-log.txt
-checkpoints/
-evaluate_results/
-*.pyc
-*.npy
-test_log.txt
-*.pkl
-pretrained/
-render_result/
-lightweight-human-pose-estimation.pytorch/
-data/*
-webvideos/
-samples/
-*.pth
-*.obj
-data/
-detectors/
-data
-samples_all/
-*.zip
-extra_data/
-sample_data/
-*.so
-tests.py
-mocap_output/
-demo.sh
-demo_multi.sh
-mocap_utils/frame_to_video.py
+mocap_utils/frame_to_gif.py
+mocap_utils/select_epick_kitchen.py
+sample_data/
+extra_data/
+*.onnx
Binary file added __pycache__/render.cpython-39.pyc
Binary file not shown.
Binary file added __pycache__/render_ctypes.cpython-39.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added bodymocap/__pycache__/constants.cpython-39.pyc
Binary file not shown.
6 changes: 3 additions & 3 deletions bodymocap/body_bbox_detector.py
@@ -20,6 +20,7 @@
 from detectors.body_pose_estimator.val import normalize, pad_width
 from detectors.body_pose_estimator.modules.pose import Pose, track_poses
 from detectors.body_pose_estimator.modules.keypoints import extract_keypoints, group_keypoints
+from alfred.dl.torch.common import device


 class BodyPoseEstimator(object):
@@ -38,7 +39,7 @@ def __load_body_estimator(self):
         checkpoint = torch.load(pose2d_checkpoint, map_location='cpu')
         load_state(net, checkpoint)
         net = net.eval()
-        net = net.cuda()
+        # net = net.cuda()
         self.model = net


@@ -54,8 +55,7 @@ def __infer_fast(self, img, input_height_size, stride, upsample_ratio,
         padded_img, pad = pad_width(scaled_img, stride, pad_value, min_dims)

         tensor_img = torch.from_numpy(padded_img).permute(2, 0, 1).unsqueeze(0).float()
-        if not cpu:
-            tensor_img = tensor_img.cuda()
+        tensor_img = tensor_img.to(device)

         stages_output = self.model(tensor_img)
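These hunks replace the hard-coded CUDA calls with alfred's shared `device`, so the detector can also run on CPU-only machines. A minimal sketch of the pattern, assuming `alfred.dl.torch.common.device` resolves as a CUDA-if-available fallback (alfred's internals are not shown in this diff):

import numpy as np
import torch

# assumption: alfred's shared device behaves like this CUDA-if-available fallback
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

padded_img = np.zeros((256, 288, 3), dtype=np.float32)  # stand-in for the pad_width output
tensor_img = torch.from_numpy(padded_img).permute(2, 0, 1).unsqueeze(0).float()
tensor_img = tensor_img.to(device)  # replaces `if not cpu: tensor_img = tensor_img.cuda()`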
114 changes: 114 additions & 0 deletions bodymocap/body_bbox_detector_ort.py
@@ -0,0 +1,114 @@
import cv2
import numpy as np

from .utils.utils import (ORTWrapper, normalize, extract_keypoints,
                          group_keypoints)


class LightweightedPoseDetector:
    """Multi-person 2D pose detector running the exported
    lightweight-human-pose-estimation model via onnxruntime."""

def __init__(self, onnx_f='data/human-pose-estimation.onnx') -> None:
self.onnx_model = ORTWrapper(onnx_f)

self.stride = 8
self.upsample_ratio = 4
self.num_keypoints = 18
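        # assuming normalize() computes (img - mean) * scale, these constants
        # map pixels to roughly [-0.5, 0.5)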
self.img_mean = np.array([128, 128, 128]).astype(np.float32)
self.img_scale = np.float32(1/256)

def infer(self, img):
height, width, _ = img.shape

net_in_height = 256
net_in_width = 288

        scale = min(net_in_height / height, net_in_width / width)

scaled_img = cv2.resize(
img, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
s_h, s_w, _ = scaled_img.shape
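        # letterbox: paste the aspect-preserving resize into a fixed 256x288
        # canvas padded with the mean pixel value (128)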
        in_img = np.full((net_in_height, net_in_width, 3), 128, dtype=np.uint8)
top = (net_in_height - s_h) // 2
left = (net_in_width - s_w) // 2
in_img[top: top + s_h, left: left + s_w] = scaled_img

        in_img = normalize(in_img, self.img_mean, self.img_scale)
        # NCHW batch of one for onnxruntime
        inp_img = np.expand_dims(in_img.transpose((2, 0, 1)), axis=0)
        stages_output = self.onnx_model.infer(inp_img)
        # output names as exported for the final refinement stage
        heatmaps = stages_output['stage_1_output_1_heatmaps']
        pafs = stages_output['stage_1_output_0_pafs']

        # drop the batch dim; channel-last (H, W, C) layout is assumed by the
        # per-keypoint indexing in run_one_img
        heatmaps = heatmaps.squeeze(0)
        pafs = pafs.squeeze(0)
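        # pad is [top, left]; run_one_img uses it to undo the letterbox offset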
return heatmaps, pafs, scale, [top, left]

def run_one_img(self, img):
heatmaps, pafs, scale, pad = self.infer(img)

all_keypoints_by_type = []
total_keypoints_num = 0
        for kpt_idx in range(self.num_keypoints):  # the 19th heatmap is background; skip it
total_keypoints_num += extract_keypoints(
heatmaps[:, :, kpt_idx], all_keypoints_by_type, total_keypoints_num)
pose_entries, all_keypoints = group_keypoints(
all_keypoints_by_type, pafs)

        # map keypoints from heatmap space back to original image coordinates:
        # undo stride/upsampling, remove the letterbox padding, invert the resize
for kpt_id in range(all_keypoints.shape[0]):
all_keypoints[kpt_id, 0] = (
all_keypoints[kpt_id, 0] * self.stride / self.upsample_ratio - pad[1]) / scale
all_keypoints[kpt_id, 1] = (
all_keypoints[kpt_id, 1] * self.stride / self.upsample_ratio - pad[0]) / scale

current_poses = []
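        # assemble one (num_keypoints, 2) int array per detected person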
for n in range(len(pose_entries)):
if len(pose_entries[n]) == 0:
continue
pose_keypoints = np.ones(
(self.num_keypoints, 2), dtype=np.int32) * -1
for kpt_id in range(self.num_keypoints):
if pose_entries[n][kpt_id] != -1.0: # keypoint was found
pose_keypoints[kpt_id, 0] = int(
all_keypoints[int(pose_entries[n][kpt_id]), 0])
pose_keypoints[kpt_id, 1] = int(
all_keypoints[int(pose_entries[n][kpt_id]), 1])
current_poses.append(pose_keypoints)

        if len(current_poses) > 0:
            return np.stack(current_poses)
        # empty array keeps the return type consistent when nothing is detected
        return np.zeros((0, self.num_keypoints, 2), dtype=np.int32)

def get_enlarged_boxes_from_poses(self, poses, img_h, img_w):
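        # tight xywh box around the detected joints, then a 5% margin per side,
        # clamped to the image bounds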
current_bbox = []
for pose in poses:
found_keypoints = np.zeros(
(np.count_nonzero(pose[:, 0] != -1), 2), dtype=np.int32)
found_kpt_id = 0
for kpt_id in range(self.num_keypoints):
if pose[kpt_id, 0] == -1:
continue
found_keypoints[found_kpt_id] = pose[kpt_id]
found_kpt_id += 1
bb = cv2.boundingRect(found_keypoints)
current_bbox.append(bb)
for i, bbox in enumerate(current_bbox):
x, y, w, h = bbox
margin = 0.05
x_margin = int(w * margin)
y_margin = int(h * margin)
x0 = max(x-x_margin, 0)
y0 = max(y-y_margin, 0)
x1 = min(x+w+x_margin, img_w)
y1 = min(y+h+y_margin, img_h)
current_bbox[i] = np.array((x0, y0, x1-x0, y1-y0)).astype(np.int32)
return current_bbox
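A minimal driver for the new detector, as a sketch (not part of the PR): it assumes the exported model sits at the default `data/human-pose-estimation.onnx` path from `__init__`, and `sample.jpg` / `pose_boxes.jpg` are illustrative file names; only methods added in this diff are called.

import cv2
from bodymocap.body_bbox_detector_ort import LightweightedPoseDetector

detector = LightweightedPoseDetector()  # default: data/human-pose-estimation.onnx
img = cv2.imread('sample.jpg')  # hypothetical test frame, BGR as loaded by OpenCV
poses = detector.run_one_img(img)  # (N, 18, 2) joint array, -1 marks missing joints
if len(poses) > 0:
    h, w = img.shape[:2]
    boxes = detector.get_enlarged_boxes_from_poses(poses, h, w)
    for x, y, bw, bh in boxes:  # xywh boxes, already enlarged and clamped
        cv2.rectangle(img, (x, y), (x + bw, y + bh), (0, 255, 0), 2)
cv2.imwrite('pose_boxes.jpg', img)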