@@ -31,69 +31,126 @@ def str2bool(v):
31
31
32
32
# Command-line interface of the demo. `backends`, `targets`, their help-message
# templates and `str2bool` are defined earlier in this file.
parser = argparse.ArgumentParser(description='Hand Pose Estimation from MediaPipe')
parser.add_argument('--input', '-i', type=str,
                    help='Path to the input image. Omit for using default camera.')
parser.add_argument('--model', '-m', type=str,
                    default='./handpose_estimation_mediapipe_2023feb.onnx',
                    help='Path to the model.')
parser.add_argument('--backend', '-b', type=int, default=backends[0],
                    help=help_msg_backends.format(*backends))
parser.add_argument('--target', '-t', type=int, default=targets[0],
                    help=help_msg_targets.format(*targets))
parser.add_argument('--conf_threshold', type=float, default=0.9,
                    help='Filter out hands of confidence < conf_threshold.')
# Parsed with str2bool, like --vis below: with type=str every supplied value,
# including "false", is a non-empty string and would switch saving on.
parser.add_argument('--save', '-s', type=str2bool, default=False,
                    help='Set true to save results. This flag is invalid when using camera.')
parser.add_argument('--vis', '-v', type=str2bool, default=True,
                    help='Set true to open a window for result visualization. This flag is invalid when using camera.')
args = parser.parse_args()
41
41
42
42
43
43
# Pairs of landmark indices joined by a line segment: landmark 0 fans out to
# five chains of four landmarks each (1-4, 5-8, 9-12, 13-16, 17-20).
_HAND_CONNECTIONS = (
    (0, 1), (1, 2), (2, 3), (3, 4),
    (0, 5), (5, 6), (6, 7), (7, 8),
    (0, 9), (9, 10), (10, 11), (11, 12),
    (0, 13), (13, 14), (14, 15), (15, 16),
    (0, 17), (17, 18), (18, 19), (19, 20),
)

# Projections of the world landmarks drawn into the 400x400 canvas, one per
# 200x200 quadrant:
# (label, label origin, source columns, per-axis sign, quadrant centre in pixels)
_WORLD_VIEWS = (
    ('Main View', (0, 12), (0, 1), (1, 1), (100, 100)),
    ('Top View', (200, 12), (0, 2), (1, -1), (300, 100)),
    ('Left View', (0, 212), (2, 1), (-1, 1), (100, 300)),
    ('Right View', (200, 212), (2, 1), (1, 1), (300, 300)),
)


def _draw_skeleton(canvas, landmarks, is_draw_point=True, thickness=2):
    """Draw the 20 skeleton segments (and optionally the joints) onto canvas in place.

    Args:
        canvas: image to draw on; it is modified in place.
        landmarks: 21 integer (x, y) pixel positions, indexable by landmark id.
        is_draw_point: when true, also draw a filled red dot on every landmark.
        thickness: line thickness, also used as the radius of the joint dots.
    """
    # Built-in ints so the drawing calls do not rely on numpy-scalar coercion.
    points = [(int(x), int(y)) for x, y in landmarks]

    for start, end in _HAND_CONNECTIONS:
        cv.line(canvas, points[start], points[end], (255, 255, 255), thickness)

    if is_draw_point:
        for point in points:
            cv.circle(canvas, point, thickness, (0, 0, 255), -1)


def visualize(image, hands, print_result=False):
    """Render detected hands on a copy of the image and on a four-view 3D canvas.

    Each row of ``hands`` is read as: columns 0-3 bounding box, columns 4-66
    the 21 screen landmarks as (x, y, z), columns 67-129 the 21 world
    landmarks as (x, y, z), second-to-last the handedness score (<= 0.5 is
    reported as 'Left', otherwise 'Right') and last the confidence.

    Args:
        image: input image; it is copied, not modified.
        hands: iterable of per-hand rows laid out as described above.
        print_result: when true, print every hand's values to stdout.

    Returns:
        A tuple ``(display_screen, display_3d)``: the annotated copy of
        ``image`` and a 400x400 canvas showing main/top/left/right views of
        the first hand only.
    """
    display_screen = image.copy()

    # Black canvas split into four quadrants by a white cross, each labelled.
    display_3d = np.zeros((400, 400, 3), np.uint8)
    cv.line(display_3d, (200, 0), (200, 400), (255, 255, 255), 2)
    cv.line(display_3d, (0, 200), (400, 200), (255, 255, 255), 2)
    for label, label_origin, _, _, _ in _WORLD_VIEWS:
        cv.putText(display_3d, label, label_origin, cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

    for idx, handpose in enumerate(hands):
        conf = handpose[-1]
        handedness_text = 'Left' if handpose[-2] <= 0.5 else 'Right'
        bbox = handpose[0:4].astype(np.int32)
        landmarks_screen = handpose[4:67].reshape(21, 3).astype(np.int32)
        landmarks_world = handpose[67:130].reshape(21, 3)

        # Print results
        if print_result:
            print('-----------hand {}-----------'.format(idx + 1))
            print('conf: {:.2f}'.format(conf))
            print('handedness: {}'.format(handedness_text))
            print('hand box: {}'.format(bbox))
            print('hand landmarks: ')
            for landmark in landmarks_screen:
                print('\t {}'.format(landmark))
            print('hand world landmarks: ')
            for landmark in landmarks_world:
                print('\t {}'.format(landmark))

        # Draw the bounding box with the handedness label in its top-left corner.
        left, top, right, bottom = (int(v) for v in bbox)
        cv.rectangle(display_screen, (left, top), (right, bottom), (0, 255, 0), 2)
        cv.putText(display_screen, handedness_text, (left, top + 12),
                   cv.FONT_HERSHEY_DUPLEX, 0.5, (0, 0, 255))

        # Draw line between each key points
        _draw_skeleton(display_screen, landmarks_screen[:, 0:2], is_draw_point=False)

        # z value is relative to WRIST; the dot radius grows as z decreases
        # and is clamped to the range [0, 14].
        for x, y, z in landmarks_screen:
            radius = int(min(max(5 - z // 5, 0), 14))
            cv.circle(display_screen, (int(x), int(y)), radius, (0, 0, 255), -1)

        # Only the first hand is drawn in the 3D views.
        if idx == 0:
            for _, _, columns, signs, centre in _WORLD_VIEWS:
                # Index-list selection yields a copy, so landmarks_world itself
                # is never modified. World coordinates are scaled by 1000 to
                # pixels (presumably metres to millimetres; confirm against the
                # model documentation) and shifted to the quadrant centre.
                projected = landmarks_world[:, list(columns)] * np.array(signs) * 1000 + np.array(centre)
                _draw_skeleton(display_3d, projected.astype(np.int32), thickness=5)

    return display_screen, display_3d
90
147
91
148
92
149
if __name__ == '__main__' :
93
150
# palm detector
94
151
palm_detector = MPPalmDet (modelPath = '../palm_detection_mediapipe/palm_detection_mediapipe_2023feb.onnx' ,
95
152
nmsThreshold = 0.3 ,
96
- scoreThreshold = 0.8 ,
153
+ scoreThreshold = 0.6 ,
97
154
backendId = args .backend ,
98
155
targetId = args .target )
99
156
# handpose detector
@@ -108,7 +165,7 @@ def visualize(image, hands, print_result=False):
108
165
109
166
# Palm detector inference
110
167
palms = palm_detector .infer (image )
111
- hands = np .empty (shape = (0 , 47 ))
168
+ hands = np .empty (shape = (0 , 132 ))
112
169
113
170
# Estimate the pose of each hand
114
171
for palm in palms :
@@ -117,10 +174,12 @@ def visualize(image, hands, print_result=False):
117
174
if handpose is not None :
118
175
hands = np .vstack ((hands , handpose ))
119
176
# Draw results on the input image
120
- image = visualize (image , hands , True )
177
+ image , view_3d = visualize (image , hands , True )
121
178
122
179
if len (palms ) == 0 :
123
180
print ('No palm detected!' )
181
+ else :
182
+ print ('Palm detected!' )
124
183
125
184
# Save results
126
185
if args .save :
@@ -131,6 +190,7 @@ def visualize(image, hands, print_result=False):
131
190
if args .vis :
132
191
cv .namedWindow (args .input , cv .WINDOW_AUTOSIZE )
133
192
cv .imshow (args .input , image )
193
+ cv .imshow ('3D HandPose Demo' , view_3d )
134
194
cv .waitKey (0 )
135
195
else : # Omit input to call default camera
136
196
deviceId = 0
@@ -145,7 +205,7 @@ def visualize(image, hands, print_result=False):
145
205
146
206
# Palm detector inference
147
207
palms = palm_detector .infer (frame )
148
- hands = np .empty (shape = (0 , 47 ))
208
+ hands = np .empty (shape = (0 , 132 ))
149
209
150
210
tm .start ()
151
211
# Estimate the pose of each hand
@@ -156,12 +216,14 @@ def visualize(image, hands, print_result=False):
156
216
hands = np .vstack ((hands , handpose ))
157
217
tm .stop ()
158
218
# Draw results on the input image
159
- frame = visualize (frame , hands )
219
+ frame , view_3d = visualize (frame , hands )
160
220
161
221
if len (palms ) == 0 :
162
222
print ('No palm detected!' )
163
223
else :
224
+ print ('Palm detected!' )
164
225
cv .putText (frame , 'FPS: {:.2f}' .format (tm .getFPS ()), (0 , 15 ), cv .FONT_HERSHEY_SIMPLEX , 0.5 , (0 , 0 , 255 ))
165
226
166
227
cv .imshow ('MediaPipe Handpose Detection Demo' , frame )
228
+ cv .imshow ('3D HandPose Demo' , view_3d )
167
229
tm .reset ()
0 commit comments