This repository has been archived by the owner on Oct 31, 2023. It is now read-only.

Add CPU option to run frankmocap #103

Open · wants to merge 29 commits into base: main
Changes from 28 commits
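The changes below share one recurring pattern: models are constructed on an explicit `torch.device` instead of via hard-coded `.cuda()` calls, and GPU-trained checkpoints are loaded with `map_location='cpu'`. A minimal, self-contained sketch of that pattern (the model and paths are placeholders, not files from this repo):

```python
import torch
import torch.nn as nn

# Select the CPU explicitly rather than calling .cuda() on models/tensors.
device = torch.device('cpu')

model = nn.Linear(10, 2).to(device)  # any model; stands in for the pose networks below

# Checkpoints saved from a GPU run must be remapped when loaded on a CPU-only machine.
# state = torch.load('checkpoint.pth', map_location='cpu')  # placeholder path
# model.load_state_dict(state)
```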
1 change: 1 addition & 0 deletions .gitignore
@@ -34,3 +34,4 @@ demo.sh
demo_multi.sh
mocap_utils/frame_to_video.py
mocap_utils/frame_to_gif.py
mocap_utils/select_epick_kitchen.py
4 changes: 4 additions & 0 deletions README.md
@@ -65,6 +65,10 @@ FrankMocap pursues an easy-to-use single view 3D motion capture system developed
- Note:
- Above commands use openGL by default. If it does not work, you may try alternative renderers (pytorch3d or openDR).
- See the readme of each module for details


## Joint Order
- See [joint_order](docs/joint_order.md)


## Body Motion Capture Module
130 changes: 130 additions & 0 deletions bodymocap/body_bbox_detector_cpu.py
@@ -0,0 +1,130 @@
# Copyright (c) Facebook, Inc. and its affiliates.

import os
import os.path as osp
import sys
import numpy as np
import cv2

import torch
import torchvision.transforms as transforms
# from PIL import Image

# Code from https://github.com/Daniil-Osokin/lightweight-human-pose-estimation.pytorch/blob/master/demo.py

# 2D body pose estimator
pose2d_estimator_path = './detectors/body_pose_estimator'
sys.path.append(pose2d_estimator_path)
from detectors.body_pose_estimator.pose2d_models.with_mobilenet import PoseEstimationWithMobileNet
from detectors.body_pose_estimator.modules.load_state import load_state
from detectors.body_pose_estimator.val import normalize, pad_width
from detectors.body_pose_estimator.modules.pose import Pose, track_poses
from detectors.body_pose_estimator.modules.keypoints import extract_keypoints, group_keypoints


class BodyPoseEstimator_cpu(object):
"""
    Body pose estimator for third-view input, running on CPU.
    It wraps a lightweight 2D body pose estimator (https://github.com/jhugestar/lightweight-human-pose-estimation.pytorch.git)
"""
def __init__(self):
print("Loading Body Pose Estimator")
self.__load_body_estimator()


def __load_body_estimator(self):
device = torch.device('cpu')
net = PoseEstimationWithMobileNet().to(device)
pose2d_checkpoint = "./extra_data/body_module/body_pose_estimator/checkpoint_iter_370000.pth"
checkpoint = torch.load(pose2d_checkpoint, map_location='cpu')
load_state(net, checkpoint)
net = net.eval()
# net = net.cuda()
self.model = net


    # Code from https://github.com/Daniil-Osokin/lightweight-human-pose-estimation.pytorch/blob/master/demo.py
def __infer_fast(self, img, input_height_size, stride, upsample_ratio,
cpu=False, pad_value=(0, 0, 0), img_mean=(128, 128, 128), img_scale=1/256):
height, width, _ = img.shape
scale = input_height_size / height

scaled_img = cv2.resize(img, (0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)
scaled_img = normalize(scaled_img, img_mean, img_scale)
min_dims = [input_height_size, max(scaled_img.shape[1], input_height_size)]
padded_img, pad = pad_width(scaled_img, stride, pad_value, min_dims)

tensor_img = torch.from_numpy(padded_img).permute(2, 0, 1).unsqueeze(0).float()
        # padded_img is a CPU numpy array, so tensor_img is already on the CPU
        # (the upstream GPU version called .cuda() here when cpu was False).

stages_output = self.model(tensor_img)

stage2_heatmaps = stages_output[-2]
heatmaps = np.transpose(stage2_heatmaps.squeeze().cpu().data.numpy(), (1, 2, 0))
heatmaps = cv2.resize(heatmaps, (0, 0), fx=upsample_ratio, fy=upsample_ratio, interpolation=cv2.INTER_CUBIC)

stage2_pafs = stages_output[-1]
pafs = np.transpose(stage2_pafs.squeeze().cpu().data.numpy(), (1, 2, 0))
pafs = cv2.resize(pafs, (0, 0), fx=upsample_ratio, fy=upsample_ratio, interpolation=cv2.INTER_CUBIC)

return heatmaps, pafs, scale, pad

def detect_body_pose(self, img):
"""
Output:
current_bbox: BBOX_XYWH
"""
stride = 8
upsample_ratio = 4
orig_img = img.copy()

# forward
heatmaps, pafs, scale, pad = self.__infer_fast(img,
input_height_size=256, stride=stride, upsample_ratio=upsample_ratio)

total_keypoints_num = 0
all_keypoints_by_type = []
num_keypoints = Pose.num_kpts
        for kpt_idx in range(num_keypoints):  # the 19th heatmap channel is background
total_keypoints_num += extract_keypoints(heatmaps[:, :, kpt_idx], all_keypoints_by_type, total_keypoints_num)

pose_entries, all_keypoints = group_keypoints(all_keypoints_by_type, pafs, demo=True)
for kpt_id in range(all_keypoints.shape[0]):
all_keypoints[kpt_id, 0] = (all_keypoints[kpt_id, 0] * stride / upsample_ratio - pad[1]) / scale
all_keypoints[kpt_id, 1] = (all_keypoints[kpt_id, 1] * stride / upsample_ratio - pad[0]) / scale

'''
# print(len(pose_entries))
if len(pose_entries)>1:
pose_entries = pose_entries[:1]
print("We only support one person currently")
# assert len(pose_entries) == 1, "We only support one person currently"
'''

current_poses, current_bbox = list(), list()
for n in range(len(pose_entries)):
if len(pose_entries[n]) == 0:
continue
pose_keypoints = np.ones((num_keypoints, 2), dtype=np.int32) * -1
for kpt_id in range(num_keypoints):
if pose_entries[n][kpt_id] != -1.0: # keypoint was found
pose_keypoints[kpt_id, 0] = int(all_keypoints[int(pose_entries[n][kpt_id]), 0])
pose_keypoints[kpt_id, 1] = int(all_keypoints[int(pose_entries[n][kpt_id]), 1])
pose = Pose(pose_keypoints, pose_entries[n][18])
current_poses.append(pose.keypoints)
current_bbox.append(np.array(pose.bbox))

# enlarge the bbox
for i, bbox in enumerate(current_bbox):
x, y, w, h = bbox
margin = 0.05
x_margin = int(w * margin)
y_margin = int(h * margin)
x0 = max(x-x_margin, 0)
y0 = max(y-y_margin, 0)
x1 = min(x+w+x_margin, orig_img.shape[1])
y1 = min(y+h+y_margin, orig_img.shape[0])
current_bbox[i] = np.array((x0, y0, x1-x0, y1-y0)).astype(np.int32)

return current_poses, current_bbox
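For orientation, a minimal usage sketch of the detector above — assuming the import path matches this file's location and the MobileNet checkpoint referenced in `__load_body_estimator` is in place; the image path is a placeholder:

```python
import cv2
from bodymocap.body_bbox_detector_cpu import BodyPoseEstimator_cpu

detector = BodyPoseEstimator_cpu()              # loads checkpoint_iter_370000.pth on CPU
img = cv2.imread('sample.jpg')                  # placeholder input image (BGR, as cv2 loads it)
poses, bboxes = detector.detect_body_pose(img)  # per-person keypoints and XYWH boxes

for x, y, w, h in bboxes:
    cv2.rectangle(img, (int(x), int(y)), (int(x + w), int(y + h)), (0, 255, 0), 2)
```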
2 changes: 1 addition & 1 deletion bodymocap/body_mocap_api.py
@@ -112,7 +112,7 @@ def regress(self, img_original, body_bbox_list):
pred_output['pred_joints_img'] = pred_joints_vis_img # SMPL joints in image space

pred_aa_tensor = gu.rotation_matrix_to_angle_axis(pred_rotmat.detach().cpu()[0])
pred_output['pred_body_pose'] = pred_aa_tensor.cpu().numpy().reshape(1, 72)
pred_output['pred_body_pose'] = pred_aa_tensor.cpu().numpy().reshape(1, 72) # (1, 72)

pred_output['pred_rotmat'] = pred_rotmat.detach().cpu().numpy() # (1, 24, 3, 3)
pred_output['pred_betas'] = pred_betas.detach().cpu().numpy() # (1, 10)
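The shape comment added here records the axis-angle layout: SMPL has 24 joints, each a 3-vector, hence 24 × 3 = 72 values. A quick sanity check using the same helper as the diff (identity rotations stand in for real predictions):

```python
import torch
import mocap_utils.geometry_utils as gu

pred_rotmat = torch.eye(3).repeat(24, 1, 1)              # (24, 3, 3) identity rotations
pred_aa = gu.rotation_matrix_to_angle_axis(pred_rotmat)  # (24, 3) axis-angle vectors
print(pred_aa.reshape(1, 72).shape)                      # torch.Size([1, 72])
```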
183 changes: 183 additions & 0 deletions bodymocap/body_mocap_api_cpu.py
@@ -0,0 +1,183 @@
# Copyright (c) Facebook, Inc. and its affiliates.

import cv2
import sys
import torch
import numpy as np
import pickle
from torchvision.transforms import Normalize

from bodymocap.models import hmr, SMPL, SMPLX
from bodymocap import constants
from bodymocap.utils.imutils import crop, crop_bboxInfo, process_image_bbox, process_image_keypoints, bbox_from_keypoints
from mocap_utils.coordconv import convert_smpl_to_bbox, convert_bbox_to_oriIm
import mocap_utils.geometry_utils as gu


class BodyMocap_cpu(object):
def __init__(self, regressor_checkpoint, smpl_dir, device=torch.device('cpu'), use_smplx=False):

        self.device = device  # default is CPU; note torch's GPU device string is 'cuda', not 'gpu'

# Load parametric model (SMPLX or SMPL)
if use_smplx:
smplModelPath = smpl_dir + '/SMPLX_NEUTRAL.pkl'
self.smpl = SMPLX(smpl_dir,
batch_size=1,
num_betas = 10,
use_pca = False,
create_transl=False).to(self.device)
self.use_smplx = True
else:
smplModelPath = smpl_dir + '/basicModel_neutral_lbs_10_207_0_v1.0.0.pkl'
self.smpl = SMPL(smplModelPath, batch_size=1, create_transl=False).to(self.device)
self.use_smplx = False

#Load pre-trained neural network
SMPL_MEAN_PARAMS = './extra_data/body_module/data_from_spin/smpl_mean_params.npz'
self.model_regressor = hmr(SMPL_MEAN_PARAMS).to(self.device)
        checkpoint = torch.load(regressor_checkpoint, map_location=torch.device('cpu'))
self.model_regressor.load_state_dict(checkpoint['model'], strict=False)
self.model_regressor.eval()


def regress(self, img_original, body_bbox_list):
"""
args:
img_original: original raw image (BGR order by using cv2.imread)
body_bbox: bounding box around the target: (minX, minY, width, height)
outputs:
pred_vertices_img:
pred_joints_vis_img:
pred_rotmat
pred_betas
pred_camera
            bbox: [bbr[0], bbr[1], bbr[0]+bbr[2], bbr[1]+bbr[3]]
bboxTopLeft: bbox top left (redundant)
boxScale_o2n: bbox scaling factor (redundant)
"""
pred_output_list = list()

for body_bbox in body_bbox_list:
img, norm_img, boxScale_o2n, bboxTopLeft, bbox = process_image_bbox(
img_original, body_bbox, input_res=constants.IMG_RES)
bboxTopLeft = np.array(bboxTopLeft)

# bboxTopLeft = bbox['bboxXYWH'][:2]
if img is None:
pred_output_list.append(None)
continue

with torch.no_grad():
# model forward
pred_rotmat, pred_betas, pred_camera = self.model_regressor(norm_img.to(self.device))

#Convert rot_mat to aa since hands are always in aa
# pred_aa = rotmat3x3_to_angle_axis(pred_rotmat)
pred_aa = gu.rotation_matrix_to_angle_axis(pred_rotmat).cpu()
pred_aa = pred_aa.reshape(pred_aa.shape[0], 72)
smpl_output = self.smpl(
betas=pred_betas,
body_pose=pred_aa[:,3:],
global_orient=pred_aa[:,:3],
pose2rot=True)
pred_vertices = smpl_output.vertices
pred_joints_3d = smpl_output.joints

pred_vertices = pred_vertices[0].cpu().numpy()

pred_camera = pred_camera.cpu().numpy().ravel()
camScale = pred_camera[0] # *1.15
camTrans = pred_camera[1:]

pred_output = dict()
# Convert mesh to original image space (X,Y are aligned to image)
# 1. SMPL -> 2D bbox
# 2. 2D bbox -> original 2D image
pred_vertices_bbox = convert_smpl_to_bbox(pred_vertices, camScale, camTrans)
pred_vertices_img = convert_bbox_to_oriIm(
pred_vertices_bbox, boxScale_o2n, bboxTopLeft, img_original.shape[1], img_original.shape[0])

# Convert joint to original image space (X,Y are aligned to image)
            pred_joints_3d = pred_joints_3d[0].cpu().numpy() # (49, 3) after dropping the batch dim
            pred_joints_vis = pred_joints_3d[:, :3] # (49, 3)
pred_joints_vis_bbox = convert_smpl_to_bbox(pred_joints_vis, camScale, camTrans)
pred_joints_vis_img = convert_bbox_to_oriIm(
pred_joints_vis_bbox, boxScale_o2n, bboxTopLeft, img_original.shape[1], img_original.shape[0])

# Output
pred_output['img_cropped'] = img[:, :, ::-1]
pred_output['pred_vertices_smpl'] = smpl_output.vertices[0].cpu().numpy() # SMPL vertex in original smpl space
pred_output['pred_vertices_img'] = pred_vertices_img # SMPL vertex in image space
pred_output['pred_joints_img'] = pred_joints_vis_img # SMPL joints in image space

pred_aa_tensor = gu.rotation_matrix_to_angle_axis(pred_rotmat.detach().cpu()[0])
pred_output['pred_body_pose'] = pred_aa_tensor.cpu().numpy().reshape(1, 72)

pred_output['pred_rotmat'] = pred_rotmat.detach().cpu().numpy() # (1, 24, 3, 3)
pred_output['pred_betas'] = pred_betas.detach().cpu().numpy() # (1, 10)

pred_output['pred_camera'] = pred_camera
pred_output['bbox_top_left'] = bboxTopLeft
pred_output['bbox_scale_ratio'] = boxScale_o2n
pred_output['faces'] = self.smpl.faces

if self.use_smplx:
                img_center = np.array((img_original.shape[1], img_original.shape[0])) * 0.5
# right hand
pred_joints = smpl_output.right_hand_joints[0].cpu().numpy()
pred_joints_bbox = convert_smpl_to_bbox(pred_joints, camScale, camTrans)
pred_joints_img = convert_bbox_to_oriIm(
pred_joints_bbox, boxScale_o2n, bboxTopLeft, img_original.shape[1], img_original.shape[0])
pred_output['right_hand_joints_img_coord'] = pred_joints_img
# left hand
pred_joints = smpl_output.left_hand_joints[0].cpu().numpy()
pred_joints_bbox = convert_smpl_to_bbox(pred_joints, camScale, camTrans)
pred_joints_img = convert_bbox_to_oriIm(
pred_joints_bbox, boxScale_o2n, bboxTopLeft, img_original.shape[1], img_original.shape[0])
pred_output['left_hand_joints_img_coord'] = pred_joints_img

pred_output_list.append(pred_output)

return pred_output_list


def get_hand_bboxes(self, pred_body_list, img_shape):
"""
args:
            pred_body_list: output of body regression
img_shape: img_height, img_width
outputs:
hand_bbox_list:
"""
hand_bbox_list = list()
for pred_body in pred_body_list:
hand_bbox = dict(
left_hand = None,
right_hand = None
)
if pred_body is None:
hand_bbox_list.append(hand_bbox)
else:
for hand_type in hand_bbox:
key = f'{hand_type}_joints_img_coord'
pred_joints_vis_img = pred_body[key]

if pred_joints_vis_img is not None:
# get initial bbox
x0, x1 = np.min(pred_joints_vis_img[:, 0]), np.max(pred_joints_vis_img[:, 0])
y0, y1 = np.min(pred_joints_vis_img[:, 1]), np.max(pred_joints_vis_img[:, 1])
width, height = x1-x0, y1-y0
# extend the obtained bbox
margin = int(max(height, width) * 0.2)
img_height, img_width = img_shape
x0 = max(x0 - margin, 0)
y0 = max(y0 - margin, 0)
x1 = min(x1 + margin, img_width)
y1 = min(y1 + margin, img_height)
                        # resulting bbox in (x, y, w, h) format
                        hand_bbox[hand_type] = np.array([x0, y0, x1-x0, y1-y0])

hand_bbox_list.append(hand_bbox)

return hand_bbox_list
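A sketch of how the class above might be driven end to end — the checkpoint, SMPL directory, and image paths are placeholders; note that `get_hand_bboxes` reads the `*_joints_img_coord` keys, which `regress` only fills when `use_smplx=True`:

```python
import cv2
import torch
from bodymocap.body_mocap_api_cpu import BodyMocap_cpu

body_mocap = BodyMocap_cpu(
    regressor_checkpoint='body_checkpoint.pt',  # placeholder checkpoint path
    smpl_dir='./extra_data/smpl',               # directory holding SMPLX_NEUTRAL.pkl
    device=torch.device('cpu'),
    use_smplx=True)

img = cv2.imread('sample.jpg')                  # placeholder image (BGR)
body_bbox_list = [[50, 50, 200, 400]]           # one XYWH box, e.g. from the detector above
pred_output_list = body_mocap.regress(img, body_bbox_list)
hand_bbox_list = body_mocap.get_hand_bboxes(pred_output_list, img.shape[:2])
```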
9 changes: 4 additions & 5 deletions bodymocap/models/smpl.py
@@ -77,15 +77,14 @@ def forward(self, *args, **kwargs):
extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices)
# extra_joints = vertices2joints(self.J_regressor_extra, smpl_output.vertices[:,:6890]) *0 #TODO: implement this correctly


#SMPL-X Joint order: https://docs.google.com/spreadsheets/d/1_1dLdaX-sbMkCKr_JzJW_RZCpwBwd7rcKkWT_VgAQ_0/edit#gid=0
smplx_to_smpl = list(range(0,22)) + [28,43] + list(range(55,76)) #28 left middle finger , 43: right middle finger 1
smpl_joints = smpl_output.joints[:,smplx_to_smpl,:] #Convert SMPL-X to SMPL 127 ->45
joints = torch.cat([smpl_joints, extra_joints], dim=1) #[N, 127, 3]->[N, 45, 3] + [N, 9, 3] #SMPL-X has more joints. should convert 45
smplx_to_smpl = list(range(0,22)) + [28,43] + list(range(55,76)) # 28 left middle finger , 43: right middle finger 1
smpl_joints = smpl_output.joints[:,smplx_to_smpl,:] # Convert SMPL-X to SMPL 127 ->45
joints = torch.cat([smpl_joints, extra_joints], dim=1) # [N, 127, 3]->[N, 45, 3] + [N, 9, 3] # SMPL-X has more joints. should convert 45
joints = joints[:, self.joint_map, :]

# Hand joints
smplx_hand_to_panoptic = [0, 13,14,15,16, 1,2,3,17, 4,5,6,18, 10,11,12,19, 7,8,9,20] #Wrist Thumb to Pinky
smplx_hand_to_panoptic = [0, 13,14,15,16, 1,2,3,17, 4,5,6,18, 10,11,12,19, 7,8,9,20] #Wrist Thumb to Pinky

smplx_lhand = [20] + list(range(25,40)) + list(range(66, 71)) #20 for left wrist. 20 finger joints
lhand_joints = smpl_output.joints[:,smplx_lhand, :] #(N,21,3)
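The index lists in this hunk compress SMPL-X's joint set down to SMPL's: 22 body joints, two middle-finger joints standing in for the hands, and 21 additional joints, for 45 in total; the hand lists each select 21 joints (wrist plus 20 finger joints). A quick check of those sizes:

```python
smplx_to_smpl = list(range(0, 22)) + [28, 43] + list(range(55, 76))
print(len(smplx_to_smpl))  # 45 = 22 body + 2 middle-finger + 21 extra joints

smplx_lhand = [20] + list(range(25, 40)) + list(range(66, 71))
print(len(smplx_lhand))    # 21 = left wrist + 20 finger joints
```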
3 changes: 2 additions & 1 deletion demo/demo_bodymocap.py
@@ -19,6 +19,7 @@
import mocap_utils.general_utils as gnu
from mocap_utils.timer import Timer

import renderer.image_utils as imu
from renderer.viewer2D import ImShow

def run_body_mocap(args, body_bbox_detector, body_mocap, visualizer):
@@ -120,7 +121,7 @@ def run_body_mocap(args, body_bbox_detector, body_mocap, visualizer):
img_original_bgr,
pred_mesh_list = pred_mesh_list,
body_bbox_list = body_bbox_list)

# show result in the screen
if not args.no_display:
res_img = res_img.astype(np.uint8)
Expand Down