 from torch.nn.functional import one_hot
 from torchvision.prototype import features
 from torchvision.prototype.transforms.functional._meta import convert_bounding_box_format
+from torchvision.transforms.functional import _get_perspective_coeffs
 from torchvision.transforms.functional_tensor import _max_value as get_max_value
 
+
 make_tensor = functools.partial(torch.testing.make_tensor, device="cpu")
 
 
@@ -380,6 +382,37 @@ def pad_segmentation_mask():
         yield SampleInput(mask, padding=padding, padding_mode=padding_mode)
 
 
+@register_kernel_info_from_sample_inputs_fn
+def perspective_bounding_box():
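+    # The eight coefficients (a, b, c, d, e, f, g, h) parametrize the projective map
+    # (x, y) -> ((a*x + b*y + c) / (g*x + h*y + 1), (d*x + e*y + f) / (g*x + h*y + 1));
+    # the two hard-coded lists below are arbitrary but well-conditioned sample transforms.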
+    for bounding_box, perspective_coeffs in itertools.product(
+        make_bounding_boxes(),
+        [
+            [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+            [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+        ],
+    ):
+        yield SampleInput(
+            bounding_box,
+            format=bounding_box.format,
+            perspective_coeffs=perspective_coeffs,
+        )
+
+
+@register_kernel_info_from_sample_inputs_fn
+def perspective_segmentation_mask():
+    for mask, perspective_coeffs in itertools.product(
+        make_segmentation_masks(extra_dims=((), (4,))),
+        [
+            [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018],
+            [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063],
+        ],
+    ):
+        yield SampleInput(
+            mask,
+            perspective_coeffs=perspective_coeffs,
+        )
+
+
 @register_kernel_info_from_sample_inputs_fn
 def center_crop_bounding_box():
     for bounding_box, output_size in itertools.product(make_bounding_boxes(), [(24, 12), [16, 18], [46, 48], [12]]):
@@ -993,7 +1026,7 @@ def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device):
     ],
 )
 def test_correctness_resized_crop_bounding_box(device, format, top, left, height, width, size):
-    def _compute_expected(bbox, top_, left_, height_, width_, size_):
+    def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_):
         # bbox should be xyxy
         bbox[0] = (bbox[0] - left_) * size_[1] / width_
         bbox[1] = (bbox[1] - top_) * size_[0] / height_
@@ -1009,7 +1042,7 @@ def _compute_expected(bbox, top_, left_, height_, width_, size_):
     ]
     expected_bboxes = []
     for in_box in in_boxes:
-        expected_bboxes.append(_compute_expected(list(in_box), top, left, height, width, size))
+        expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size))
     expected_bboxes = torch.tensor(expected_bboxes, device=device)
 
     in_boxes = features.BoundingBox(
@@ -1035,7 +1068,7 @@ def _compute_expected(bbox, top_, left_, height_, width_, size_):
     ],
 )
 def test_correctness_resized_crop_segmentation_mask(device, top, left, height, width, size):
-    def _compute_expected(mask, top_, left_, height_, width_, size_):
+    def _compute_expected_mask(mask, top_, left_, height_, width_, size_):
         output = mask.clone()
         output = output[:, top_ : top_ + height_, left_ : left_ + width_]
         output = torch.nn.functional.interpolate(output[None, :].float(), size=size_, mode="nearest")
@@ -1046,7 +1079,7 @@ def _compute_expected(mask, top_, left_, height_, width_, size_):
     in_mask[0, 10:20, 10:20] = 1
     in_mask[0, 5:15, 12:23] = 2
 
-    expected_mask = _compute_expected(in_mask, top, left, height, width, size)
+    expected_mask = _compute_expected_mask(in_mask, top, left, height, width, size)
     output_mask = F.resized_crop_segmentation_mask(in_mask, top, left, height, width, size)
     torch.testing.assert_close(output_mask, expected_mask)
 
@@ -1095,6 +1128,161 @@ def parse_padding():
     torch.testing.assert_close(out_mask, expected_mask)
 
 
+@pytest.mark.parametrize("device", cpu_and_gpu())
+@pytest.mark.parametrize(
+    "startpoints, endpoints",
+    [
+        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+    ],
+)
+def test_correctness_perspective_bounding_box(device, startpoints, endpoints):
+    def _compute_expected_bbox(bbox, pcoeffs_):
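+        # m1 holds the numerator rows and m2 the (shared) denominator rows of the
+        # projective map, so for a homogeneous point p = (x, y, 1) the transformed
+        # point is (p @ m1.T) / (p @ m2.T), computed elementwise.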
+        m1 = np.array(
+            [
+                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
+                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+            ]
+        )
+
+        bbox_xyxy = convert_bounding_box_format(
+            bbox, old_format=bbox.format, new_format=features.BoundingBoxFormat.XYXY
+        )
+        points = np.array(
+            [
+                [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0],
+                [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0],
+                [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0],
+                [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0],
+            ]
+        )
+        numer = np.matmul(points, m1.T)
+        denom = np.matmul(points, m2.T)
+        transformed_points = numer / denom
+        out_bbox = [
+            np.min(transformed_points[:, 0]),
+            np.min(transformed_points[:, 1]),
+            np.max(transformed_points[:, 0]),
+            np.max(transformed_points[:, 1]),
+        ]
+        out_bbox = features.BoundingBox(
+            out_bbox,
+            format=features.BoundingBoxFormat.XYXY,
+            image_size=bbox.image_size,
+            dtype=torch.float32,
+            device=bbox.device,
+        )
+        return convert_bounding_box_format(
+            out_bbox, old_format=features.BoundingBoxFormat.XYXY, new_format=bbox.format, copy=False
+        )
+
+    image_size = (32, 38)
+
+    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
+    inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints)
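+    # The kernel under test takes coefficients in the image-sampling convention
+    # (output -> input grid), so the reference maps box corners forward with the
+    # inverse coefficients computed above.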
+
+    for bboxes in make_bounding_boxes(
+        image_sizes=[
+            image_size,
+        ],
+        extra_dims=((4,),),
+    ):
+        bboxes = bboxes.to(device)
+        bboxes_format = bboxes.format
+        bboxes_image_size = bboxes.image_size
+
+        output_bboxes = F.perspective_bounding_box(
+            bboxes,
+            bboxes_format,
+            perspective_coeffs=pcoeffs,
+        )
+
+        if bboxes.ndim < 2:
+            bboxes = [bboxes]
+
+        expected_bboxes = []
+        for bbox in bboxes:
+            bbox = features.BoundingBox(bbox, format=bboxes_format, image_size=bboxes_image_size)
+            expected_bboxes.append(_compute_expected_bbox(bbox, inv_pcoeffs))
+        if len(expected_bboxes) > 1:
+            expected_bboxes = torch.stack(expected_bboxes)
+        else:
+            expected_bboxes = expected_bboxes[0]
+        torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=1e-5, atol=1e-5)
+
+
+@pytest.mark.parametrize("device", cpu_and_gpu())
+@pytest.mark.parametrize(
+    "startpoints, endpoints",
+    [
+        [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]],
+        [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]],
+    ],
+)
+def test_correctness_perspective_segmentation_mask(device, startpoints, endpoints):
+    def _compute_expected_mask(mask, pcoeffs_):
+        assert mask.ndim == 3 and mask.shape[0] == 1
+        m1 = np.array(
+            [
+                [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]],
+                [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]],
+            ]
+        )
+        m2 = np.array(
+            [
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+                [pcoeffs_[6], pcoeffs_[7], 1.0],
+            ]
+        )
+
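+        # Brute-force reference: for every output pixel center, apply the
+        # output -> input coefficients to locate the source pixel and copy its
+        # label (nearest-neighbor), skipping points that fall outside the mask.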
+        expected_mask = torch.zeros_like(mask.cpu())
+        for out_y in range(expected_mask.shape[1]):
+            for out_x in range(expected_mask.shape[2]):
+                output_pt = np.array([out_x + 0.5, out_y + 0.5, 1.0])
+
+                numer = np.matmul(output_pt, m1.T)
+                denom = np.matmul(output_pt, m2.T)
+                input_pt = np.floor(numer / denom).astype(np.int32)
+
+                in_x, in_y = input_pt[:2]
+                if 0 <= in_x < mask.shape[2] and 0 <= in_y < mask.shape[1]:
+                    expected_mask[0, out_y, out_x] = mask[0, in_y, in_x]
+        return expected_mask.to(mask.device)
+
+    pcoeffs = _get_perspective_coeffs(startpoints, endpoints)
+
+    for mask in make_segmentation_masks(extra_dims=((), (4,))):
+        mask = mask.to(device)
+
+        output_mask = F.perspective_segmentation_mask(
+            mask,
+            perspective_coeffs=pcoeffs,
+        )
+
+        if mask.ndim < 4:
+            masks = [mask]
+        else:
+            masks = [m for m in mask]
+
+        expected_masks = []
+        for mask in masks:
+            expected_mask = _compute_expected_mask(mask, pcoeffs)
+            expected_masks.append(expected_mask)
+        if len(expected_masks) > 1:
+            expected_masks = torch.stack(expected_masks)
+        else:
+            expected_masks = expected_masks[0]
+        torch.testing.assert_close(output_mask, expected_masks)
+
+
 @pytest.mark.parametrize("device", cpu_and_gpu())
 @pytest.mark.parametrize(
     "output_size",
@@ -1148,5 +1336,4 @@ def _compute_expected_bbox(bbox, output_size_):
         expected_bboxes = torch.stack(expected_bboxes)
     else:
         expected_bboxes = expected_bboxes[0]
-    expected_bboxes = expected_bboxes.to(device=device)
     torch.testing.assert_close(output_boxes, expected_bboxes)