diff --git a/fiftyone/utils/sam2.py b/fiftyone/utils/sam2.py
new file mode 100644
index 0000000000..330ca18170
--- /dev/null
+++ b/fiftyone/utils/sam2.py
@@ -0,0 +1,746 @@
+"""
+`Segment Anything 2 <https://ai.meta.com/sam2/>`_ wrapper for the FiftyOne
+Model Zoo.
+
+| Copyright 2017-2024, Voxel51, Inc.
+| `voxel51.com <https://voxel51.com/>`_
+|
+"""
+
+import numpy as np
+import cv2
+
+import eta.core.utils as etau
+import logging
+
+import fiftyone.core.labels as fol
+import fiftyone.core.utils as fou
+import fiftyone.utils.torch as fout
+import fiftyone.zoo.models as fozm
+import fiftyone.core.models as fom
+
+fou.ensure_torch()
+import torch
+
+sam2 = fou.lazy_import("sam2")
+smvid = fou.lazy_import("sam2.sam2_video_predictor")
+smutil = fou.lazy_import("sam2.utils.misc")
+smip = fou.lazy_import("sam2.sam2_image_predictor")
+samg = fou.lazy_import("sam2.automatic_mask_generator")
+
+logger = logging.getLogger(__name__)
+
+class SegmentAnything2ImageModelConfig(
+    fout.TorchImageModelConfig, fozm.HasZooModel
+):
+    """Configuration for running a :class:`SegmentAnything2ImageModel`.
+
+    See :class:`fiftyone.utils.torch.TorchImageModelConfig` for additional
+    arguments.
+
+    Args:
+        auto_kwargs (None): a dictionary of keyword arguments to pass to
+            ``sam2.automatic_mask_generator.SAM2AutomaticMaskGenerator(model, **auto_kwargs)``
+        points_mask_index (None): an optional mask index to use for each
+            keypoint output
+    """
+
+    def __init__(self, d):
+        d = self.init(d)
+        super().__init__(d)
+
+        self.auto_kwargs = self.parse_dict(d, "auto_kwargs", default=None)
+        self.points_mask_index = self.parse_int(
+            d, "points_mask_index", default=None
+        )
+        if self.points_mask_index is not None and not (
+            0 <= self.points_mask_index <= 2
+        ):
+            raise ValueError("points_mask_index must be 0, 1, or 2")
+
+class SegmentAnything2VideoModelConfig(
+    fout.TorchImageModelConfig, fozm.HasZooModel
+):
+    """Configuration for running a :class:`SegmentAnything2VideoModel`.
+
+    See :class:`fiftyone.utils.torch.TorchImageModelConfig` for additional
+    arguments.
+    """
+
+    def __init__(self, d):
+        d = self.init(d)
+        super().__init__(d)
+
+class SegmentAnything2ImageModel(fout.TorchSamplesMixin, fout.TorchImageModel):
+    """Wrapper for running `Segment Anything 2 <https://ai.meta.com/sam2/>`_
+    inference on images.
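+
+    The optional ``auto_kwargs`` and ``points_mask_index`` config parameters
+    can be passed as keyword arguments when loading the zoo model. A minimal
+    sketch (the values below are purely illustrative)::
+
+        import fiftyone.zoo as foz
+
+        model = foz.load_zoo_model(
+            "segment-anything-2-hiera-tiny-image-torch",
+            # which of SAM2's three candidate masks to use for keypoint prompts
+            points_mask_index=2,
+            # extra kwargs forwarded to SAM2AutomaticMaskGenerator
+            auto_kwargs={"points_per_side": 16},
+        )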
+ + Box prompt example:: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset( + "quickstart", max_samples=25, shuffle=True, seed=51 + ) + + model = foz.load_zoo_model("segment-anything-2-hiera-tiny-image-torch") + + # Prompt with boxes + dataset.apply_model( + model, + label_field="segmentations", + prompt_field="ground_truth", + ) + + session = fo.launch_app(dataset) + + Keypoint prompt example:: + + import fiftyone as fo + import fiftyone.zoo as foz + from fiftyone import ViewField as F + + dataset = foz.load_zoo_dataset("quickstart") + dataset = dataset.filter_labels("ground_truth", F("label") == "person") + + + # Generate some keypoints + model = foz.load_zoo_model("keypoint-rcnn-resnet50-fpn-coco-torch") + dataset.default_skeleton = model.skeleton + dataset.apply_model(model, label_field="gt") + + model = foz.load_zoo_model("segment-anything-2-hiera-tiny-image-torch") + + # Prompt with keypoints + dataset.apply_model( + model, + label_field="segmentations", + prompt_field="gt_keypoints", + ) + + session = fo.launch_app(dataset) + + Automatic segmentation example:: + + import fiftyone as fo + import fiftyone.zoo as foz + + dataset = foz.load_zoo_dataset( + "quickstart", max_samples=5, shuffle=True, seed=51 + ) + + model = foz.load_zoo_model("segment-anything-2-hiera-tiny-image-torch") + + # Automatic segmentation + dataset.apply_model(model, label_field="auto") + + session = fo.launch_app(dataset) + + Args: + config: a :class:`SegmentAnything2ImageModelConfig` + """ + + def __init__(self, config): + fout.TorchSamplesMixin.__init__(self) + fout.TorchImageModel.__init__(self, config) + + self._curr_prompt_type = None + self._curr_prompts = None + self._curr_classes = None + + def _download_model(self, config): + config.download_model_if_necessary() + + def _load_model(self, config): + entrypoint = etau.get_function(config.entrypoint_fcn) + model = entrypoint( + config.entrypoint_args["model_cfg"], ckpt_path=config.model_path + ) + + model = model.to(self._device) + if self.using_half_precision: + model = model.half() + + model.eval() + + return model + + def predict_all(self, imgs, samples=None): + field_name = self._get_field() + if field_name is not None and samples is not None: + prompt_type, prompts, classes = self._parse_samples( + samples, field_name + ) + else: + prompt_type, prompts, classes = None, None, None + + self._curr_prompt_type = prompt_type + self._curr_prompts = prompts + self._curr_classes = classes + + return self._predict_all(imgs) + + def _get_field(self): + if "prompt_field" in self.needs_fields: + prompt_field = self.needs_fields["prompt_field"] + else: + prompt_field = next(iter(self.needs_fields.values()), None) + + if prompt_field is not None and prompt_field.startswith("frames."): + prompt_field = prompt_field[len("frames.") :] + + return prompt_field + + def _parse_samples(self, samples, field_name): + prompt_type = self._get_prompt_type(samples, field_name) + prompts = self._get_prompts(samples, field_name) + classes = self._get_classes(samples, field_name) + return prompt_type, prompts, classes + + def _get_prompt_type(self, samples, field_name): + for sample in samples: + value = sample.get_field(field_name) + if value is None: + continue + + if isinstance(value, fol.Detections): + return "boxes" + + if isinstance(value, fol.Keypoints): + return "points" + + raise ValueError( + "Unsupported prompt type %s. 
The supported field types are %s" + % (type(value), (fol.Detections, fol.Keypoints)) + ) + + return None + + def _get_prompts(self, samples, field_name): + prompts = [] + for sample in samples: + value = sample.get_field(field_name) + if value is not None: + prompts.append(value) + else: + raise ValueError( + "Sample %s is missing a prompt in field '%s'" + % (sample.id, field_name) + ) + + return prompts + + def _get_classes(self, samples, field_name): + classes = set() + for sample in samples: + value = sample.get_field(field_name) + if isinstance(value, fol.Detections): + classes.update(det.label for det in value.detections) + + if isinstance(value, fol.Keypoints): + classes.update(kp.label for kp in value.keypoints) + + return sorted(classes) + + def _forward_pass(self, imgs): + if self._curr_prompt_type == "boxes": + return self._forward_pass_boxes(imgs) + + if self._curr_prompt_type == "points": + return self._forward_pass_points(imgs) + + return self._forward_pass_auto(imgs) + + def _forward_pass_boxes(self, imgs): + sam2_predictor = smip.SAM2ImagePredictor(self._model) + self._output_processor = fout.InstanceSegmenterOutputProcessor( + self._curr_classes + ) + outputs = [] + for img, detections in zip(imgs, self._curr_prompts): + ## If no detections, return empty tensors instead of running SAM + if detections is None or len(detections.detections) == 0: + h, w = img.shape[1], img.shape[2] + outputs.append( + { + "boxes": torch.tensor([[]]), + "labels": torch.empty([0, 4]), + "masks": torch.empty([0, 1, h, w]), + } + ) + continue + inp = _to_sam_input(img) + sam2_predictor.set_image(inp) + h, w = img.size(1), img.size(2) + + boxes = [d.bounding_box for d in detections.detections] + sam_boxes = np.array([_to_sam_box(box, w, h) for box in boxes]) + input_boxes = torch.tensor(sam_boxes, device=sam2_predictor.device) + + labels = torch.tensor( + [ + self._curr_classes.index(d.label) + for d in detections.detections + ], + device=sam2_predictor.device, + ) + + masks, _, _ = sam2_predictor.predict( + point_coords=None, + point_labels=None, + box=sam_boxes[None, :], + multimask_output=False, + ) + outputs.append( + {"boxes": input_boxes, "labels": labels, "masks": torch.tensor(masks, device=sam2_predictor.device)} + ) + + return outputs + + def _forward_pass_points(self, imgs): + sam2_predictor = smip.SAM2ImagePredictor(self._model) + self._output_processor = fout.InstanceSegmenterOutputProcessor( + self._curr_classes + ) + + outputs = [] + for img, keypoints in zip(imgs, self._curr_prompts): + inp = _to_sam_input(img) + sam2_predictor.set_image(inp) + h, w = img.size(1), img.size(2) + + boxes, labels, scores, masks = [], [], [], [] + + ## If no keypoints, return empty tensors instead of running SAM + if keypoints is None or len(keypoints.keypoints) == 0: + outputs.append( + { + "boxes": torch.tensor([[]]), + "labels": torch.empty([0, 4]), + "masks": torch.empty([0, 1, h, w]), + } + ) + continue + + for kp in keypoints.keypoints: + sam_points, sam_labels = _to_sam_points(kp.points, w, h, kp) + + multi_mask, mask_scores, _ = sam2_predictor.predict( + point_coords=sam_points, + point_labels=sam_labels, + multimask_output=True, + ) + + mask_index = self.config.points_mask_index + if mask_index is None: + mask_index = np.argmax(mask_scores) + + mask = multi_mask[mask_index].astype(int) + if mask.any(): + boxes.append(_mask_to_box(mask)) + labels.append(self._curr_classes.index(kp.label)) + scores.append(min(1.0, np.max(mask_scores))) + masks.append(mask) + + outputs.append( + { + "boxes": 
torch.tensor(boxes, device=sam2_predictor.device),
+                    "labels": torch.tensor(
+                        labels, device=sam2_predictor.device
+                    ),
+                    "scores": torch.tensor(
+                        scores, device=sam2_predictor.device
+                    ),
+                    "masks": torch.tensor(
+                        np.array(masks), device=sam2_predictor.device
+                    ).unsqueeze(1),
+                }
+            )
+
+        return outputs
+
+    def _forward_pass_auto(self, imgs):
+        kwargs = self.config.auto_kwargs or {}
+        mask_generator = samg.SAM2AutomaticMaskGenerator(self._model, **kwargs)
+        self._output_processor = None
+
+        outputs = []
+        for img in imgs:
+            inp = _to_sam_input(img)
+            detections = []
+            for data in mask_generator.generate(inp):
+                detection = fol.Detection.from_mask(
+                    mask=data["segmentation"],
+                    score=data["predicted_iou"],
+                    stability=data["stability_score"],
+                )
+                detections.append(detection)
+            detections = fol.Detections(detections=detections)
+            outputs.append(detections)
+
+        return outputs
+
+class SegmentAnything2VideoModel(fom.SamplesMixin, fom.Model):
+    """Wrapper for running `Segment Anything 2 <https://ai.meta.com/sam2/>`_
+    inference on videos.
+
+    Prompt example::
+
+        import fiftyone as fo
+        import fiftyone.zoo as foz
+
+        dataset = foz.load_zoo_dataset("quickstart-video", max_samples=2)
+
+        # Only retain detections on the first frame of each video
+        for sample in dataset:
+            for frame_idx in sample.frames:
+                frame = sample.frames[frame_idx]
+                if frame_idx >= 2:
+                    frame.detections = None
+            sample.save()
+
+        model = foz.load_zoo_model("segment-anything-2-hiera-tiny-video-torch")
+
+        # Prompt with boxes
+        dataset.apply_model(
+            model,
+            label_field="segmentations",
+            prompt_field="frames.detections",  # You can also pass in a keypoint field here
+        )
+
+        session = fo.launch_app(dataset)
+
+    Args:
+        config: a :class:`SegmentAnything2VideoModelConfig`
+    """
+
+    def __init__(self, config):
+        self._fields = {}
+        self.config = config
+
+        self._download_model(config)
+        try:
+            self.ctx = _load_video_frames_monkey_patch()
+        except Exception as e:
+            logger.warning(
+                "Failed to monkey patch sam2.utils.misc.load_video_frames: %s",
+                e,
+            )
+        self.model = self._load_model(config)
+
+        self._curr_prompt_type = None
+        self._curr_prompts = None
+        self._curr_classes = None
+        self._curr_frame_width = None
+        self._curr_frame_height = None
+
+
+    @property
+    def media_type(self):
+        return "video"
+
+    def _download_model(self, config):
+        config.download_model_if_necessary()
+
+    def _load_model(self, config):
+        entrypoint = etau.get_function(config.entrypoint_fcn)
+        with self.ctx:
+            model = entrypoint(
+                config.entrypoint_args["model_cfg"], ckpt_path=config.model_path
+            )
+        return model
+
+    def predict(self, video_reader, sample):
+        field_name = self._get_field()
+        self._curr_frame_width, self._curr_frame_height = (
+            video_reader.frame_size
+        )
+        self._curr_prompts = self._get_prompts(sample, field_name)
+        self._curr_prompt_type = self._get_prompt_type(sample, field_name)
+
+        return self._forward_pass(video_reader, sample)
+
+    def _get_field(self):
+        if "prompt_field" in self.needs_fields:
+            prompt_field = self.needs_fields["prompt_field"]
+        else:
+            prompt_field = next(iter(self.needs_fields.values()), None)
+
+        if prompt_field is None:
+            raise AttributeError(
+                "Missing required argument 'prompt_field' for segment anything 2 video model"
+            )
+
+        if not prompt_field.startswith("frames."):
+            raise ValueError(
+                "'prompt_field' should be a frame field for segment anything 2 video model"
+            )
+
+        prompt_field = prompt_field[len("frames.") :]
+
+        return prompt_field
+
+    def _get_prompt_type(self, sample, 
field_name): + for _, frame in sample.frames.items(): + value = frame.get_field(field_name) + if value is None: + continue + + if isinstance(value, fol.Detections): + return "boxes" + + if isinstance(value, fol.Keypoints): + return "points" + + raise ValueError( + f"Unsupported prompt type {type(value)}. The supported field types are {(fol.Detections, fol.Keypoints)}" + ) + + raise ValueError( + f"Frame field {field_name} is empty for all frames, please provide at least one value" + ) + + def _get_prompts(self, sample, field_name): + prompts = [] + for _, frame in sample.frames.items(): + value = frame.get_field(field_name) + if value is not None: + prompts.append(value) + else: + prompts.append([]) + + return prompts + + def _forward_pass(self, video_reader, sample): + if self._curr_prompt_type == "boxes": + return self._forward_pass_boxes(video_reader, sample) + elif self._curr_prompt_type == "points": + return self._forward_pass_points(video_reader, sample) + + def _forward_pass_boxes(self, video_reader, sample): + video_path = (sample, video_reader) + inference_state = self.model.init_state(video_path) + + classes_obj_id_map = {} + kp_idx_obj_id_map = {} + current_obj_idx = 0 + for frame_idx, frame_detections in enumerate(self._curr_prompts): + if ( + len(frame_detections) == 0 + or len(frame_detections.detections) == 0 + ): + continue + for detection in frame_detections.detections: + if detection.index is not None: + if detection.index in kp_idx_obj_id_map: + ann_obj_id = kp_idx_obj_id_map[detection.index] + else: + ann_obj_id = current_obj_idx + kp_idx_obj_id_map[detection.index] = ann_obj_id + current_obj_idx += 1 + else: + ann_obj_id = current_obj_idx + current_obj_idx += 1 + classes_obj_id_map[ann_obj_id] = detection.label + box = _to_sam_box( + detection.bounding_box, + self._curr_frame_width, + self._curr_frame_height + ) + _, _, _ = self.model.add_new_points_or_box( + inference_state=inference_state, + frame_idx=frame_idx, + obj_id=ann_obj_id, + box=box + ) + + sample_detections = {} + for ( + out_frame_idx, + out_obj_ids, + out_mask_logits, + ) in self.model.propagate_in_video(inference_state): + detections = [] + for i, out_obj_id in enumerate(out_obj_ids): + mask = np.squeeze((out_mask_logits[i] > 0.0).cpu().numpy(), axis=0) + box = _mask_to_box(mask) + if box is None: + continue + label = classes_obj_id_map[out_obj_id] + x1, y1, x2, y2 = box + bounding_box = [ + x1 / self._curr_frame_width, + y1 / self._curr_frame_height, + (x2 - x1) / self._curr_frame_width, + (y2 - y1) / self._curr_frame_height, + ] + mask = mask[ + int(round(y1)) : int(round(y2)), + int(round(x1)) : int(round(x2)), + ] + detections.append( + fol.Detection( + label=label, + bounding_box=bounding_box, + mask=mask, + index=out_obj_id + ) + ) + sample_detections[int(out_frame_idx) + 1] = fol.Detections(detections=detections) + return sample_detections + + def _forward_pass_points(self, video_reader, sample): + video_path = (sample, video_reader) + inference_state = self.model.init_state(video_path) + + classes_obj_id_map = {} + kp_idx_obj_id_map = {} + current_obj_idx = 0 + for frame_idx, frame_keypoints in enumerate(self._curr_prompts): + if ( + len(frame_keypoints) == 0 + or len(frame_keypoints.keypoints) == 0 + ): + continue + for keypoint in frame_keypoints.keypoints: + if keypoint.index is not None: + if keypoint.index in kp_idx_obj_id_map: + ann_obj_id = kp_idx_obj_id_map[keypoint.index] + else: + ann_obj_id = current_obj_idx + kp_idx_obj_id_map[keypoint.index] = ann_obj_id + current_obj_idx += 1 + 
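+                # Keypoints without an explicit ``index`` fall through to the
+                # branch below and receive a fresh SAM2 object ID; indexed
+                # keypoints reuse a single ID across frames so SAM2 tracks
+                # them as one object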
else: + ann_obj_id = current_obj_idx + current_obj_idx += 1 + classes_obj_id_map[ann_obj_id] = keypoint.label + points, labels = _to_sam_points( + keypoint.points, + self._curr_frame_width, + self._curr_frame_height, + keypoint + ) + _, _, _ = self.model.add_new_points_or_box( + inference_state=inference_state, + frame_idx=frame_idx, + obj_id=ann_obj_id, + points=points, + labels=labels, + ) + + sample_detections = {} + for ( + out_frame_idx, + out_obj_ids, + out_mask_logits, + ) in self.model.propagate_in_video(inference_state): + detections = [] + for i, out_obj_id in enumerate(out_obj_ids): + mask = np.squeeze((out_mask_logits[i] > 0.0).cpu().numpy(), axis=0) + box = _mask_to_box(mask) + if box is None: + continue + label = classes_obj_id_map[out_obj_id] + x1, y1, x2, y2 = box + bounding_box = [ + x1 / self._curr_frame_width, + y1 / self._curr_frame_height, + (x2 - x1) / self._curr_frame_width, + (y2 - y1) / self._curr_frame_height, + ] + mask = mask[ + int(round(y1)) : int(round(y2)), + int(round(x1)) : int(round(x2)), + ] + detections.append( + fol.Detection( + label=label, + bounding_box=bounding_box, + mask=mask, + index=out_obj_id + ) + ) + sample_detections[int(out_frame_idx) + 1] = fol.Detections(detections=detections) + return sample_detections + +def _to_sam_input(tensor): + return (255 * tensor.cpu().numpy()).astype("uint8").transpose(1, 2, 0) + +def _to_sam_points(points, w, h, keypoint): + points = np.array(points) + valid_rows = ~np.isnan(points).any(axis=1) + scaled_points = np.array(points[valid_rows]) * np.array([w, h]) + labels = ( + np.array(keypoint.sam2_labels)[valid_rows] + if "sam2_labels" in keypoint + else np.ones(len(scaled_points)) + ) + return scaled_points.astype(np.float32), labels.astype(np.uint32) + + +def _to_sam_box(box, w, h): + new_box = np.copy(np.array(box)) + new_box[0] *= w + new_box[2] *= w + new_box[1] *= h + new_box[3] *= h + new_box[2] += new_box[0] + new_box[3] += new_box[1] + return np.round(new_box).astype(int) + + +def _mask_to_box(mask): + pos_indices = np.where(mask) + if all(arr.size == 0 for arr in pos_indices): + return None + x1 = np.min(pos_indices[1]) + x2 = np.max(pos_indices[1]) + y1 = np.min(pos_indices[0]) + y2 = np.max(pos_indices[0]) + return [x1, y1, x2, y2] + +def load_fiftyone_video_frames( + video_path, + image_size, + offload_video_to_cpu, + img_mean=(0.485, 0.456, 0.406), + img_std=(0.229, 0.224, 0.225), + async_loading_frames=False, + ): + sample, video_reader = video_path + img_mean = torch.tensor(img_mean, dtype=torch.float32)[:, None, None] + img_std = torch.tensor(img_std, dtype=torch.float32)[:, None, None] + + num_frames = len(sample.frames) + try: + images = torch.zeros( + num_frames, 3, image_size, image_size, dtype=torch.float32 + ) + except Exception as e: + raise(e) + for frame_number in range(num_frames): + current_frame = video_reader.read() + resized_frame = ( + cv2.resize(current_frame, (image_size, image_size)) / 255.0 + ) + img = torch.from_numpy(resized_frame).permute(2, 0, 1) + images[frame_number] = img + + video_width, video_height = ( + current_frame.shape[1], + current_frame.shape[0], + ) + if not offload_video_to_cpu: + images = images.cuda() + img_mean = img_mean.cuda() + img_std = img_std.cuda() + + images -= img_mean + images /= img_std + return images, video_height, video_width + +def _load_video_frames_monkey_patch(): + entrypoint_module = smutil.load_video_frames + + return fou.MonkeyPatchFunction( + entrypoint_module, + load_fiftyone_video_frames, + ) diff --git 
a/fiftyone/zoo/models/manifest-torch.json b/fiftyone/zoo/models/manifest-torch.json index 2e90521a36..9b9bfc20d1 100644 --- a/fiftyone/zoo/models/manifest-torch.json +++ b/fiftyone/zoo/models/manifest-torch.json @@ -185,6 +185,269 @@ "tags": ["segment-anything", "sa-1b", "torch", "zero-shot"], "date_added": "2023-05-12 11:40:51" }, + { + "base_name": "segment-anything-2-hiera-tiny-image-torch", + "base_filename": "sam2_hiera_tiny_image.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2ImageModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2", + "entrypoint_args": {"model_cfg" : "sam2_hiera_t.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-small-image-torch", + "base_filename": "sam2_hiera_small_image.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2ImageModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2", + "entrypoint_args": {"model_cfg" : "sam2_hiera_s.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-base-plus-image-torch", + "base_filename": "sam2_hiera_base_plus_image.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2ImageModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2", + "entrypoint_args": {"model_cfg" : "sam2_hiera_b+.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-large-image-torch", + "base_filename": "sam2_hiera_large_image.pt", + "version": null, + "description": 
"Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2ImageModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2", + "entrypoint_args": {"model_cfg" : "sam2_hiera_l.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-tiny-video-torch", + "base_filename": "sam2_hiera_tiny_video.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2VideoModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2_video_predictor", + "entrypoint_args": {"model_cfg" : "sam2_hiera_t.yaml"} + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-small-video-torch", + "base_filename": "sam2_hiera_small_video.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2VideoModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2_video_predictor", + "entrypoint_args": {"model_cfg" : "sam2_hiera_s.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-base-plus-video-torch", + "base_filename": "sam2_hiera_base_plus_video.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2VideoModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2_video_predictor", + "entrypoint_args": 
{"model_cfg" : "sam2_hiera_b+.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, + { + "base_name": "segment-anything-2-hiera-large-video-torch", + "base_filename": "sam2_hiera_large_video.pt", + "version": null, + "description": "Segment Anything Model 2 (SAM2) from `SAM2: Segment Anything in Images and Videos `_", + "source": "https://ai.meta.com/sam2/", + "size_bytes": 155906050, + "manager": { + "type": "fiftyone.core.models.ModelManager", + "config": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt" + } + }, + "default_deployment_config_dict": { + "type": "fiftyone.utils.sam2.SegmentAnything2VideoModel", + "config": { + "entrypoint_fcn": "sam2.build_sam.build_sam2_video_predictor", + "entrypoint_args": {"model_cfg" : "sam2_hiera_l.yaml"}, + "output_processor_cls": "fiftyone.utils.torch.SemanticSegmenterOutputProcessor" + } + }, + "requirements": { + "packages": ["torch"], + "cpu": { + "support": true + }, + "gpu": { + "support": true + } + }, + "tags": ["segment-anything-2", "segmentation", "torch", "zero-shot"], + "date_added": "2024-08-05 14:38:20" + }, { "base_name": "deeplabv3-resnet50-coco-torch", "base_filename": "deeplabv3_resnet50_coco-cd0a2569.pth",