Merge branch 'main-dev' of https://github.com/unum-cloud/uform into main-dev
unum-cloud · Mar 27, 2024 · 3b5c5f2 · 3b5c5f2
2 parents a310192 + d468db5
commit 3b5c5f2
Show file tree

Hide file tree

Showing 15 changed files with 298 additions and 110 deletions.
diff --git a/.gitignore b/.gitignore
@@ -6,4 +6,4 @@ src/test.py
 build/
 package-lock.json
 *.egg-info
-*.oonx
+*.onnx
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,33 @@
+ci:
+  autofix_commit_msg: "chore(pre-commit): autofix run"
+  autoupdate_commit_msg: "chore(pre-commit): autoupdate hooks"
+
+default_install_hook_types:
+  - pre-commit
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: check-toml
+      - id: check-yaml
+      - id: debug-statements
+      - id: end-of-file-fixer
+      - id: name-tests-test
+      - id: trailing-whitespace
+  - repo: https://github.com/pappasam/toml-sort
+    rev: v0.23.1
+    hooks:
+      - id: toml-sort-fix
+  - repo: https://github.com/asottile/add-trailing-comma
+    rev: v3.1.0
+    hooks:
+      - id: add-trailing-comma
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.11
+    hooks:
+      # Run the linter
+      - id: ruff
+        args: [--fix]
+      # Run the formatter
+      - id: ruff-format
diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -18,4 +18,4 @@
         "editor.defaultFormatter": "ms-python.black-formatter"
     },
     "python.formatting.provider": "none"
-}
+}
diff --git a/.vscode/tasks.json b/.vscode/tasks.json
@@ -9,4 +9,4 @@
             "command": "python -m pip install build twine && python -m build && twine check dist/* && twine upload dist/*"
         }
     ]
-}
+}
diff --git a/LICENSE b/LICENSE
@@ -198,4 +198,4 @@
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
-   limitations under the License.
+   limitations under the License.
diff --git a/README.md b/README.md
@@ -7,13 +7,13 @@ For Content Understanding and Generation<br/>
 
 <p align="center">
 <a href="https://discord.gg/jsMURnSFM2"><img height="25" src="https://github.com/unum-cloud/.github/raw/main/assets/discord.svg" alt="Discord"></a>
-&nbsp; &nbsp; &nbsp; 
+&nbsp; &nbsp; &nbsp;
 <a href="https://www.linkedin.com/company/unum-cloud/"><img height="25" src="https://github.com/unum-cloud/.github/raw/main/assets/linkedin.svg" alt="LinkedIn"></a>
-&nbsp; &nbsp; &nbsp; 
+&nbsp; &nbsp; &nbsp;
 <a href="https://twitter.com/unum_cloud"><img height="25" src="https://github.com/unum-cloud/.github/raw/main/assets/twitter.svg" alt="Twitter"></a>
-&nbsp; &nbsp; &nbsp; 
+&nbsp; &nbsp; &nbsp;
 <a href="https://unum.cloud/post"><img height="25" src="https://github.com/unum-cloud/.github/raw/main/assets/blog.svg" alt="Blog"></a>
-&nbsp; &nbsp; &nbsp; 
+&nbsp; &nbsp; &nbsp;
 <a href="https://github.com/unum-cloud/uform"><img height="25" src="https://github.com/unum-cloud/.github/raw/main/assets/github.svg" alt="GitHub"></a>
 </p>
 
@@ -53,13 +53,13 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr
 
 ### Generative Models
 
-| Model                        | Parameters |               Purpose |         Architecture |
-| :--------------------------- | ---------: | --------------------: | -------------------: |
-| [`uform-gen`][model-g]       |       1.5B | Image Captioning, VQA | llama-1.3B, ViT-B/16 |
-| [`uform-gen-chat`][model-gc] |       1.5B |       Multimodal Chat | llama-1.3B, ViT-B/16 |
+| Model                              | Parameters |            Purpose          |     Architecture      |
+| :--------------------------------- | ---------: | --------------------------: | --------------------: |
+| [`uform-gen2-qwen-500m`][model-g2] |    1.2B    | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14|
+| [`uform-gen`][model-g1]             |    1.5B    | Image Captioning, VQA       | llama-1.3B, ViT-B/16  |
 
-[model-g]: https://huggingface.co/unum-cloud/uform-gen/
-[model-gc]: https://huggingface.co/unum-cloud/uform-gen-chat/
+[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/
+[model-g1]: https://huggingface.co/unum-cloud/uform-gen/
 
 
 ## Quick Start
@@ -92,7 +92,7 @@ similarity = F.cosine_similarity(image_embedding, text_embedding)
 ```
 
 To search for similar items, the embeddings can be compared using cosine similarity.
-The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match. 
+The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match.
 Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list.
 The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match.
 
@@ -105,8 +105,43 @@ joint_embedding = model.encode_multimodal(
 score = model.get_matching_scores(joint_embedding)
 ```
 
+### Chat, Image Captioning and Question Answering
+
+The generative model can be used to caption images and answer questions about them. It is also suitable for multimodal chat.
+
+
+```python
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained("unum-cloud/uform-gen2-qwen-500m", trust_remote_code=True)
+processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-qwen-500m", trust_remote_code=True)
+
+prompt = "Question or Instruction"
+image = Image.open("image.jpg")
+
+inputs = processor(text=[prompt], images=[image], return_tensors="pt")
+
+with torch.inference_mode():
+     output = model.generate(
+        **inputs,
+        do_sample=False,
+        use_cache=True,
+        max_new_tokens=256,
+        eos_token_id=151645,
+        pad_token_id=processor.tokenizer.pad_token_id
+    )
+prompt_len = inputs["input_ids"].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+```
+
+You can check examples of different prompts in our [demo space](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo).
+
+
 ### Image Captioning and Question Answering
 
+__These are the instructions for the first version of the UForm-Gen model. We highly recommend using the new model, whose instructions you can find above.__
+
+
 The generative model can be used to caption images, summarize their content, or answer questions about them.
 The exact behavior is controlled by prompts.
 
@@ -231,6 +266,12 @@ Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the f
 
 ### Generative Models
 
+| Model                               | LLM Size |  SQA  |  MME   | MMBench  | Average¹ |
+| :---------------------------------- | -------: | -----:| ------:| --------:| --------:|
+| UForm-Gen2-Qwen-500m                |   0.5B   | 45.5  | 880.1  |  42.0    |   29.31  |
+| MobileVLM v2                        |   1.4B   | 52.1  | 1302.8 |  57.7    |   36.81  |
+| LLaVA-Phi                           |   2.7B   | 68.4  | 1335.1 |  59.8    |   42.95  |
+
 For captioning evaluation we measure CLIPScore and RefCLIPScore³.
 
 | Model                               | Size | Caption Length | CLIPScore | RefCLIPScore |
@@ -262,7 +303,7 @@ Results for VQAv2 evaluation.
 
 ## Speed
 
-On RTX 3090, the following performance is expected on text encoding.
+On Nvidia RTX 3090, the following performance is expected on text encoding.
 
 | Model                                     | Multilingual |                  Speed |    Speedup |
 | :---------------------------------------- | -----------: | ---------------------: | ---------: |
@@ -271,14 +312,27 @@ On RTX 3090, the following performance is expected on text encoding.
 | `sentence-transformers/all-MiniLM-L12-v2` |      __Yes__ | 3'604 sequences/second |     x 2.24 |
 | `unum-cloud/uform-vl-multilingual-v2`     |      __Yes__ | 6'809 sequences/second | __x 4.22__ |
 
-On RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
 
 | Model                               | Size |               Speed |   Speedup |
 | :---------------------------------- | ---: | ------------------: | --------: |
 | `llava-hf/llava-1.5-7b-hf`          |   7B |  ~ 40 tokens/second |           |
 | `Salesforce/instructblip-vicuna-7b` |   7B |  ~ 40 tokens/second |           |
 | `unum-cloud/uform-gen`              | 1.5B | ~ 140 tokens/second | __x 3.5__ |
 
+Given the small size of the model, it also works well on mobile devices.
+On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
+
+| Device                 |               Speed | Device TDP |        Efficiency |
+| :--------------------- | ------------------: | ---------: | ----------------: |
+| Nvidia RTX 3090        | ~ 140 tokens/second |     < 350W | 0.40 tokens/joule |
+| Apple M2 Pro unplugged |  ~ 19 tokens/second |      < 20W | 0.95 tokens/joule |
+| Apple M2 Max unplugged |  ~ 38 tokens/second |      < 36W | 1.06 tokens/joule |
+| Apple M2 Max plugged   |  ~ 56 tokens/second |      < 89W | 0.63 tokens/joule |
+
+> [!WARNING]
+> The above numbers are for reference only and are not guaranteed to be accurate.
+
 ## License
 
 All models come under the same license as the code - Apache 2.0.
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-0.3.2
+0.3.2
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
@@ -23,4 +23,4 @@ table>tbody>tr>td:first-child {
 
 #overview>p>a>img {
     height: 25px !important;
-}
+}
diff --git a/docs/index.rst b/docs/index.rst
@@ -3,7 +3,7 @@ Overview
 ==========
 .. mdinclude:: ../README.md
 
-.. toctree:: 
+.. toctree::
    :hidden:
 
    self

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,53 +1,72 @@
 [build-system]
-requires = ["setuptools>=42"]
 build-backend = "setuptools.build_meta"
-
-[project.scripts]
-uform-chat = "uform.chat:main"
+requires = ["setuptools>=42"]
 
 [project]
-name = "uform"
-version = "1.0.3"
 authors = [
-  { name="Mikhail Kim", email="mike.kim@unum.cloud" },
-  { name="Vladimir Orshulevich", email="vladimir.orshulevich@unum.cloud" },
-  { name="Ash Vardanian", email="ash.vardanian@unum.cloud" },
+    {email = "ash.vardanian@unum.cloud", name = "Ash Vardanian"},
+    {email = "mike.kim@unum.cloud", name = "Mikhail Kim"},
+    {email = "vladimir.orshulevich@unum.cloud", name = "Vladimir Orshulevich"},
 ]
-maintainers = [
-  { name="Unum Cloud", email="info@unum.cloud" },
-]
-dependencies = [
-  "torch>=1.13.1",
-  "tokenizers>=0.13.3",
-  "huggingface_hub>=0.16.4",
-  "transformers>=4.36.2",
-  "torchvision"
-]
-description = "Multi-Modal Transformers library for Semantic Search and other Vision-Language tasks"
-readme = "README.md"
-requires-python = ">=3.7"
 classifiers = [
-    "Programming Language :: Python :: 3",
     "Development Status :: 5 - Production/Stable",
     "License :: OSI Approved :: Apache Software License",
-    "Operating System :: OS Independent",
-
-    "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Topic :: Scientific/Engineering :: Image Processing",
-    "Topic :: Scientific/Engineering :: Image Recognition",
-
+    "Natural Language :: Chinese (Simplified)",
     "Natural Language :: English",
     "Natural Language :: French",
-    "Natural Language :: Korean",
     "Natural Language :: German",
     "Natural Language :: Italian",
-    "Natural Language :: Polish",
-    "Natural Language :: Spanish",
     "Natural Language :: Japanese",
+    "Natural Language :: Korean",
+    "Natural Language :: Polish",
     "Natural Language :: Russian",
+    "Natural Language :: Spanish",
     "Natural Language :: Turkish",
-    "Natural Language :: Chinese (Simplified)",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
+    "Topic :: Scientific/Engineering :: Image Processing",
+    "Topic :: Scientific/Engineering :: Image Recognition",
+]
+dependencies = [
+    "huggingface_hub>=0.16.4",
+    "tokenizers>=0.13.3",
+    "torch>=1.13.1",
+    "torchvision",
+    "transformers>=4.36.2",
+]
+description = "Multi-Modal Transformers library for Semantic Search and other Vision-Language tasks"
+maintainers = [
+    {email = "info@unum.cloud", name = "Unum Cloud"},
 ]
+name = "uform"
+readme = "README.md"
+requires-python = ">=3.7"
+version = "1.1.1"
+
+[project.scripts]
+uform-chat = "uform.chat:main"
 
 [project.urls]
-"Homepage" = "https://github.com/unum-cloud/uform"
+"Homepage" = "https://github.com/unum-cloud/uform"
+
+[tool.ruff]
+ignore = ["C408", "C901", "E501", "E741"]
+ignore-init-module-imports = true
+select = ["C", "E", "F", "I", "UP", "W"]
+
+[tool.ruff.isort]
+lines-after-imports = 2
+
+[tool.ruff.lint.isort]
+known-first-party = ["uform"]
+
+[tool.ruff.per-file-ignores]
+"__init__.py" = ["E401"]
+
+[tool.tomlsort]
+all = true
+in_place = true
+spaces_before_inline_comment = 2
+spaces_indent_inline_array = 4
+trailing_comma_inline_array = true
diff --git a/scripts/bench.py b/scripts/bench.py
@@ -1,14 +1,19 @@
-import requests
+from functools import partial
 from time import perf_counter
 from typing import List
 
-from PIL import Image
+import requests
 import torch
+from PIL import Image
+from transformers import (
+    AutoProcessor,
+    InstructBlipForConditionalGeneration,
+    InstructBlipProcessor,
+    LlavaForConditionalGeneration,
+)
 
 from uform import get_model
 from uform.gen_model import VLMForCausalLM, VLMProcessor
-from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
-from transformers import AutoProcessor, LlavaForConditionalGeneration
 
 dtype = torch.bfloat16
 low_cpu_mem_usage = False
@@ -18,7 +23,7 @@
 def caption(model, processor, prompt: str, image: Image.Image) -> str:
     inputs = processor(prompt, image, return_tensors="pt")
     for possible_key in ["images", "pixel_values"]:
-        if not possible_key in inputs:
+        if possible_key not in inputs:
             continue
         inputs[possible_key] = inputs[possible_key].to(dtype)  # Downcast floats
     inputs = {k: v.to(device) for k, v in inputs.items()}  # Move to the right device
@@ -56,15 +61,12 @@ def bench_captions(
     total_duration = 0
     total_length = 0
     model = torch.compile(model)
+
+    def caption_image(image, model=model, processor=processor, prompt=prompt):
+        return caption(model=model, processor=processor, prompt=prompt, image=image)
+
     for image in images:
-        seconds, text = duration(
-            lambda: caption(
-                model=model,
-                processor=processor,
-                prompt=prompt,
-                image=image,
-            )
-        )
+        seconds, text = duration(partial(caption_image, image=image))
         total_duration += seconds
         total_length += len(text)
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,4 +6,4 @@ src/test.py @@
     build/
     package-lock.json
     *.egg-info
-    *.oonx
+    *.onnx