Add LAVIS (blip) example (#36)
bddppq authored Sep 14, 2023
1 parent f96f485 commit 9d13f97
Showing 4 changed files with 311 additions and 0 deletions.
127 changes: 127 additions & 0 deletions advanced/lavis/README.md
# LAVIS

[LAVIS](https://github.com/salesforce/LAVIS) is a Python deep learning library for LAnguage-and-VISion intelligence research and applications. It supports 10+ tasks such as retrieval, captioning, visual question answering (VQA), and multimodal classification. In this example we show how to use LAVIS for image captioning, VQA, and feature extraction on Lepton.
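
The photons below use BLIP/BLIP-2 checkpoints, but LAVIS ships many more models. If you want to see what else is available, you can print the model zoo (this mirrors the hint in the example code below):

```python
from lavis.models import model_zoo

# Lists the available model architectures and types,
# e.g. blip_caption, blip_vqa, blip2_feature_extractor, ...
print(model_zoo)
```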

## Install the Lepton SDK
```shell
pip install leptonai
```

## Launch inference service locally

### Image Captioning

```shell
lep photon run -n caption -m caption.py
```

### Visual Question Answering (VQA)

```shell
lep photon run -n vqa -m vqa.py
```

### Feature Extraction

```shell
lep photon run -n extract-features -m extract-features.py
```
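
Once one of these services is running locally, you can also hit it directly over HTTP. The sketch below assumes the default local port (8080) and that the `run` handler is exposed at `/run`; adjust the port and payload to the service you launched:

```shell
curl -X POST http://localhost:8080/run \
  -H "Content-Type: application/json" \
  -d '{"image": "http://images.cocodataset.org/val2017/000000039769.jpg"}'
```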

## Launch inference service in the cloud

As with the other examples, you can easily run these services on the Lepton Cloud Platform, e.g.:

```shell
lep photon create -n extract-features -m extract-features.py
lep photon push -n extract-features
lep photon run \
-n extract-features \
--resource-shape gpu.a10
```

You can visit [dashboard.lepton.ai](https://dashboard.lepton.ai/) to try out the model.

Note: by default, the server is protected by a token, so you won't be able to access the Gradio UI without it. This is by design, to provide adequate security. If you want to make the UI public, you can either add the `--public` argument to `lep photon run`, or update the deployment with:

```shell
lep deployment update -n extract-features --public
```

## Client

Once the inference service is up (either locally or in the cloud), you can use the Python client to access it programmatically:

```python
from leptonai.client import Client, local, current

# Use this if you are running locally
client = Client(local())
# Or, if you are logged in to your workspace via `lep login` already
# and have launched it:
# client = Client(current(), "extract-features") # or "caption" for Image Captioning, or "vqa" for VQA
```
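
If you are calling a protected deployment from outside your logged-in environment, you can also construct the client with an explicit workspace id and token (a sketch; the workspace id and token below are placeholders):

```python
from leptonai.client import Client

# "my-workspace-id" and "MY_TOKEN" are placeholders for illustration.
client = Client("my-workspace-id", "extract-features", token="MY_TOKEN")
```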

### Image Captioning
```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
caption = client.run(image=image)

print(caption)
```

```
a couple of cats laying on top of a pink couch
```
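
You can also caption a local image by wrapping it in `FileParam` instead of passing a URL (a sketch, assuming `cat.jpg` exists on your machine):

```python
from leptonai.photon import FileParam

# Upload a local file instead of passing a URL; "cat.jpg" is a placeholder path.
with open("cat.jpg", "rb") as f:
    caption = client.run(image=FileParam(f))

print(caption)
```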

### Visual Question Answering (VQA)

```python
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
question = "How many cats?"
answer = client.run(image=image, question=question)

print(answer)
```

```
2
```

### Feature Extraction

```python
# image embedding
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
features = client.run(image=image)

print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 32 x 768
```

```python
# text embedding
text = "a large fountain spewing water into the air"
features = client.run(text=text)

print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 12 x 768
```

```python
# multimodal embedding
image = "http://images.cocodataset.org/val2017/000000039769.jpg"
text = "two cats"
features = client.run(image=image, text=text)

print(f"embedding dimensions: {len(features)} x {len(features[0])}")
```

```
embedding dimensions: 32 x 768
```
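
The returned features are plain nested lists (tokens x hidden size), so you can post-process them with NumPy. Below is a rough, illustrative sketch that mean-pools the image and text token embeddings and compares them with cosine similarity; note that LAVIS computes its official image-text similarity from projected embeddings inside the model, so treat this only as a quick sanity check:

```python
import numpy as np

image = "http://images.cocodataset.org/val2017/000000039769.jpg"
text = "two cats"

# Mean-pool the per-token embeddings returned by the service into single vectors.
img_vec = np.array(client.run(image=image)).mean(axis=0)  # shape: (768,)
txt_vec = np.array(client.run(text=text)).mean(axis=0)  # shape: (768,)

# Cosine similarity between the pooled image and text embeddings.
cos = img_vec @ txt_vec / (np.linalg.norm(img_vec) * np.linalg.norm(txt_vec))
print(f"cosine similarity: {cos:.3f}")
```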
51 changes: 51 additions & 0 deletions advanced/lavis/caption.py
from io import BytesIO
from typing import Union

from leptonai.photon import Photon, FileParam, get_file_content


class CaptionPhoton(Photon):
    requirement_dependency = [
        "salesforce-lavis",
        "Pillow",
        "opencv-python!=4.8.0.76",
        "opencv-contrib-python!=4.8.0.76",
    ]

    def _get_img(self, param):
        from PIL import Image

        content = get_file_content(param)
        return Image.open(BytesIO(content)).convert("RGB")

    def init(self):
        import torch
        from lavis.models import load_model_and_preprocess

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Here we choose the BLIP model. For other available models, please refer to:
        #
        # from lavis.models import model_zoo
        # print(model_zoo)
        #
        self.model_and_preprocess = load_model_and_preprocess(
            name="blip_caption",
            model_type="large_coco",
            is_eval=True,
            device=self.device,
        )

    @Photon.handler(
        example={"image": "http://images.cocodataset.org/val2017/000000039769.jpg"}
    )
    def run(self, image: Union[FileParam, str]) -> str:
        model, vis_processors, _ = self.model_and_preprocess

        image = self._get_img(image)
        image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
        captions = model.generate({"image": image})
        return captions[0]
79 changes: 79 additions & 0 deletions advanced/lavis/extract-features.py
from io import BytesIO
from typing import Union, Optional, List

from leptonai.photon import Photon, FileParam, get_file_content, HTTPException


class ExtractFeaturesPhoton(Photon):
    requirement_dependency = [
        "salesforce-lavis",
        "Pillow",
        "opencv-python!=4.8.0.76",
        "opencv-contrib-python!=4.8.0.76",
    ]

    def _get_img(self, param):
        from PIL import Image

        content = get_file_content(param)
        return Image.open(BytesIO(content)).convert("RGB")

    def init(self):
        import torch
        from lavis.models import load_model_and_preprocess

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Here we choose the BLIP-2 model. For other available models, please refer to:
        #
        # from lavis.models import model_zoo
        # print(model_zoo)
        #
        self.model_and_preprocess = load_model_and_preprocess(
            name="blip2_feature_extractor",
            model_type="pretrain",
            is_eval=True,
            device=self.device,
        )

    @Photon.handler(
        examples=[
            {"image": "http://images.cocodataset.org/val2017/000000039769.jpg"},
            {"text": "a large fountain spewing water into the air"},
            {
                "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                "text": "two cats",
            },
        ]
    )
    def run(
        self, image: Optional[Union[FileParam, str]] = None, text: Optional[str] = None
    ) -> List[List[float]]:
        model, vis_processors, txt_processors = self.model_and_preprocess

        if image is None and text is None:
            raise HTTPException(
                status_code=400, detail="Either image or text should be provided."
            )

        if image is not None:
            image = self._get_img(image)
            image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
        if text is not None:
            text = txt_processors["eval"](text)

        if image is not None and text is None:
            # image embedding
            features = model.extract_features({"image": image}, mode="image")
            return features.image_embeds[0].tolist()
        elif image is None and text is not None:
            # text embedding
            features = model.extract_features({"text_input": [text]}, mode="text")
            return features.text_embeds[0].tolist()
        else:
            # multimodal embedding
            features = model.extract_features({"image": image, "text_input": [text]})
            return features.multimodal_embeds[0].tolist()
54 changes: 54 additions & 0 deletions advanced/lavis/vqa.py
from io import BytesIO
from typing import Union

from leptonai.photon import Photon, FileParam, get_file_content


class VQAPhoton(Photon):
    requirement_dependency = [
        "salesforce-lavis",
        "Pillow",
        "opencv-python!=4.8.0.76",
        "opencv-contrib-python!=4.8.0.76",
    ]

    def _get_img(self, param):
        from PIL import Image

        content = get_file_content(param)
        return Image.open(BytesIO(content)).convert("RGB")

    def init(self):
        import torch
        from lavis.models import load_model_and_preprocess

        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        # Here we choose the BLIP model. For other available models, please refer to:
        #
        # from lavis.models import model_zoo
        # print(model_zoo)
        #
        self.model_and_preprocess = load_model_and_preprocess(
            name="blip_vqa", model_type="vqav2", is_eval=True, device=self.device
        )

    @Photon.handler(
        example={
            "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
            "question": "How many cats?",
        }
    )
    def run(self, image: Union[FileParam, str], question: str) -> str:
        model, vis_processors, txt_processors = self.model_and_preprocess
        image = self._get_img(image)
        image = vis_processors["eval"](image).unsqueeze(0).to(self.device)
        question = txt_processors["eval"](question)
        answers = model.predict_answers(
            samples={"image": image, "text_input": question},
            inference_method="generate",
        )
        return answers[0]
