diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 4170c99..512b641 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -113,10 +113,14 @@ jobs:
uses: actions/checkout@v4
with:
ref: "main"
+ - name: Install dependencies
+ run: |
+ sudo apt update &&
+ sudo apt install -y doxygen graphviz dia git &&
+ pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 &&
+ npm install -g jsdoc
- name: Setup GitHub Pages
uses: actions/configure-pages@v2
- - name: Install dependencies
- run: sudo apt update && sudo apt install -y doxygen graphviz dia git && pip install sphinx==7.1.2 breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery toml
- name: Install UForm from PyPi
run: pip install uform
- name: Build documentation
diff --git a/.gitignore b/.gitignore
index af7d4af..1bbdc30 100755
--- a/.gitignore
+++ b/.gitignore
@@ -4,7 +4,21 @@ test
build/
package-lock.json
*.egg-info
-*.onnx
__pycache__
.build
-.swiftpm
\ No newline at end of file
+.swiftpm
+.hf_token
+
+dictionary*
+vocab*
+
+# Tensors & ML Model
+*.onnx
+*.pt
+*.safetensors
+*.mlpackage
+
+# NodeJS
+node_modules
+node_build
+yarn-error.log
diff --git a/.vscode/launch.json b/.vscode/launch.json
index 59eb78c..92a1844 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -5,11 +5,29 @@
"version": "0.2.0",
"configurations": [
{
- "name": "Python Debugger: Current File with Arguments",
+ "name": "Python Debugger",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
+ },
+ {
+ "name": "PyTest Debugger",
+ "type": "debugpy",
+ "request": "launch",
+ "program": "pytest",
+ "console": "integratedTerminal",
+ "args": [
+ "${file}",
+ "-s",
+ "-x",
+ ],
+ },
+ {
+ "name": "NodeJS Debugger",
+ "type": "node-terminal",
+ "request": "launch",
+ "command": "npm run test",
}
]
}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
index a6cceb8..3275f93 100755
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,8 +1,10 @@
{
"cSpell.words": [
"arange",
+ "astype",
"CFURL",
"coreml",
+ "crossattn",
"cumsum",
"dtype",
"embs",
@@ -19,26 +21,37 @@
"ndarray",
"numpy",
"ONNX",
+ "onnxconverter",
"onnxruntime",
+ "opset",
"packbits",
"preprocess",
"pretrained",
"probs",
"pypi",
+ "pytest",
+ "randn",
"rerank",
"reranker",
"reranking",
+ "sandbeach",
"sess",
"SIMD",
"softmax",
+ "Tensorrt",
+ "torchvision",
"transfromers",
"uform",
"unimodal",
"unsqueeze",
- "Vardanian"
+ "Vardanian",
+ "whitespaces"
],
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
- "python.formatting.provider": "none"
+ "python.formatting.provider": "none",
+ "window.autoDetectColorScheme": true,
+ "workbench.colorTheme": "Default Dark+",
+ "workbench.preferredDarkColorTheme": "Default Dark+"
}
\ No newline at end of file
diff --git a/BENCHMARKS.md b/BENCHMARKS.md
new file mode 100644
index 0000000..07ff0bb
--- /dev/null
+++ b/BENCHMARKS.md
@@ -0,0 +1,182 @@
+# UForm Model Benchmarks
+
+## Accuracy
+
+### Embedding Models
+
+Few retrieval benchmarks exist for multimodal embeddings.
+The most famous ones for English are "MS-COCO" and "Flickr30k".
+Evaluating the `uform-vl-english` model, one can expect the following numbers for search quality.
+
+| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
+| :-------- | ---------: | ---------: | ----------: |
+| Flickr | 0.727 | 0.915 | 0.949 |
+| MS-COCO ¹ | 0.510 | 0.761 | 0.838 |
+
+For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
+Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
+
+| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
+| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
+| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
+| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
+| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
+| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
+| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
+| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
+
+
+All languages:
+
+| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
+| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
+| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
+| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M |
+| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
+| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
+| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
+| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M |
+| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M |
+| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
+| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M |
+| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M |
+| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M |
+| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M |
+| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M |
+| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M |
+| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M |
+| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M |
+| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
+| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M |
+| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M |
+| Ukrainian 🇺🇦         | 26.0         | __30.6__     | 49.9         | __56.7__     | 60.9          | __68.1__     | 41 M     |
+| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M |
+| | | | | | | | |
+| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - |
+| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - |
+| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - |
+| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - |
+
+### Generative Models
+
+| Model | LLM Size | SQA | MME | MMBench | Average¹ |
+| :------------------- | -------: | ---: | -----: | ------: | -------: |
+| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 |
+| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 |
+| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 |
+
+For captioning evaluation we measure CLIPScore and RefCLIPScore³.
+
+| Model | Size | Caption Length | CLIPScore | RefCLIPScore |
+| :---------------------------------- | ---: | -------------: | --------: | -----------: |
+| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 |
+| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 |
+| | | | | |
+| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 |
+| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 |
+| | | | | |
+| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 |
+| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 |
+| | | | | |
+| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 |
+| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 |
+
+Results for VQAv2 evaluation.
+
+| Model | Size | Accuracy |
+| :------------------------- | ---: | -------: |
+| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 |
+| `unum-cloud/uform-gen` | 1.5B | 66.5 |
+
+
+
+> ¹ Train split was in training data.
+> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
+> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
+
+## Speed
+
+### Embedding Models
+
+UForm comes pre-packaged with speed benchmarks for the models.
+
+```bash
+$ python python/scripts/bench_encoders.py --help
+usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE]
+
+options:
+ -h, --help show this help message and exit
+ --filter-out FILTER_OUT
+ Filter out models, backends, or devices with a Regular Expression.
+ --batch-size BATCH_SIZE
+ Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+```
+
+Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU, one can expect the following throughput numbers.
+
+| Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s |
+| :--------------------------------------------- | :----- | :------ | --------------------: | ---------------: | -------------------: | --------------: |
+| unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 |
+| unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 |
+| unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 |
+| unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 |
+| unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 |
+| unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 |
+| unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 |
+| unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 |
+| unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 |
+| unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 |
+| unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 |
+| unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 |
+
+If you are interested in performance numbers on consumer-grade hardware compared to third-party models, here are some rough estimates.
+On Nvidia RTX 3090:
+
+| Model | Multilingual | Speed | Speedup |
+| :----------------------------------------------- | -----------: | ---------------------: | ---------: |
+| `bert-base-uncased` | No | 1'612 sequences/second | |
+| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 |
+| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
+| `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ |
+
+Given the small size of the model, it also works well on mobile devices.
+On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
+
+| Device | Speed | Device TDP | Efficiency |
+| :--------------------- | ------------------: | ---------: | ----------------: |
+| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule |
+| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
+| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
+| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |
+
+### Generative Models
+
+```bash
+$ python python/scripts/bench_decoders.py --help
+usage: bench_decoders.py [-h] [--batch-size BATCH_SIZE] [--max-length MAX_LENGTH]
+
+options:
+ -h, --help show this help message and exit
+ --batch-size BATCH_SIZE
+ Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.
+ --max-length MAX_LENGTH
+ Maximum length of the generated text in tokens.
+```
+
+On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model | Size | Decoding Speed | Decoding Parallel Streams |
+| :---------------------------------- | ----: | -------------: | ---------------------------: |
+| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) |
+| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) |
+| `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) |
+| `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) |
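+
+These numbers can be roughly reproduced by timing a single greedy `generate` call and dividing the count of newly produced tokens by the elapsed time.
+The sketch below reuses the Transformers API shown in the README and is only an illustration of the methodology: the prompt and image path are placeholders, device placement, `float16` casting, and the token-id settings are omitted for brevity, and a real benchmark should average over many runs and parallel streams.
+
+```python
+import time
+
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+
+inputs = processor(text=['Describe the image.'], images=[Image.open('image.jpg')], return_tensors='pt')
+
+with torch.inference_mode():
+    start = time.perf_counter()
+    output = model.generate(**inputs, do_sample=False, use_cache=True, max_new_tokens=256)
+    elapsed = time.perf_counter() - start
+
+# Count only the newly generated tokens, excluding the prompt.
+new_tokens = output.shape[1] - inputs['input_ids'].shape[1]
+print(f'~ {new_tokens / elapsed:.1f} tokens/s')
+```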
+
+On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
+
+| Model | Size | Decoding Speed | Speedup |
+| :---------------------------------- | ----: | -------------: | --------: |
+| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | |
+| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | |
+| `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ |
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 181d9e2..65e0b26 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -7,12 +7,11 @@ We welcome contributions to UForm!
Before submitting any changes, please make sure that the tests pass.
```sh
-pip install -e . # For core dependencies
-
+pip install -e ".[dev]" # For development dependencies
pip install -e ".[torch]" # For PyTorch
pip install -e ".[onnx]" # For ONNX on CPU
pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms
-pip install -e ".[torch,onnx]" # For PyTorch and ONNX Python tests
+pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all
pytest python/scripts/ -s -x -Wd -v
pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch
@@ -20,6 +19,13 @@ pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loa
## Swift
+To build and test the Swift package, use the following commands:
+
+```bash
+swift build
+swift test
+```
+
Swift formatting is enforced with `swift-format` default utility from Apple.
To install and run it on all the files in the project, use the following command:
@@ -30,3 +36,31 @@ swift-format . -i -r
The style is controlled by the `.swift-format` JSON file in the root of the repository.
As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings.
+
+## JavaScript
+
+For rapid development you can avoid the TypeScript precompilation step:
+
+```sh
+npm install -g ts-node
+ts-node javascript/embeddings.mts
+```
+
+Before submitting any changes, please make sure that the tests pass.
+
+```sh
+npm install
+npm run test
+```
+
+## Benchmarking
+
+If you want to double-check how fast the models run on your hardware, you can clone the repository and repeat the benchmarks locally.
+The following benchmark will exclude the PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU.
+
+```sh
+git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository
+cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies
+python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large"
+```
+
diff --git a/Package.resolved b/Package.resolved
index fe63c94..6e3b1f7 100644
--- a/Package.resolved
+++ b/Package.resolved
@@ -14,7 +14,7 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/ashvardanian/swift-transformers",
"state" : {
- "revision" : "9ef46a51eca46978b62773f8887926dfe72b0ab4"
+ "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
}
}
],
diff --git a/Package.swift b/Package.swift
index 6ac8372..c2f7fe7 100644
--- a/Package.swift
+++ b/Package.swift
@@ -19,7 +19,7 @@ let package = Package(
dependencies: [
.package(
url: "https://github.com/ashvardanian/swift-transformers",
- revision: "9ef46a51eca46978b62773f8887926dfe72b0ab4"
+ revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c"
)
],
targets: [
@@ -29,13 +29,13 @@ let package = Package(
.product(name: "Transformers", package: "swift-transformers")
],
path: "swift",
- exclude: ["EmbeddingsTests.swift"]
+ exclude: ["EncodersTests.swift"]
),
.testTarget(
name: "UFormTests",
dependencies: ["UForm"],
path: "swift",
- sources: ["EmbeddingsTests.swift"]
+ sources: ["EncodersTests.swift"]
),
]
)
diff --git a/README.md b/README.md
index 031c484..8484b0f 100755
--- a/README.md
+++ b/README.md
@@ -20,18 +20,24 @@ For Content Understanding and Generation
Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips
+Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX
+ONNX • CoreML • PyTorch
+
+Python • JavaScript • Swift
---
-![](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true)
+![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true)
Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient.
UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages.
-UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are also capable of image captioning and Visual Question Answering (VQA).
+UForm [small generative models](#decoder), on the other hand, not only support conversational and chat use-cases, but are also great for fast image captioning and Visual Question Answering (VQA).
With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone.
## Features
@@ -40,108 +46,167 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr
- __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors.
- __Portable__: Models come with native ONNX support, making them easy to deploy on any platform.
- __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall.
-- __Multilingual__: Trained on a balanced dataset, the recall is great across over [20 languages](#evaluation).
+- __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages.
[usearch]: https://github.com/unum-cloud/usearch
[matryoshka]: https://arxiv.org/abs/2205.13147
## Models
-### Embedding Models
+For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md).
-| Model | Parameters | Languages | Architecture |
-| :--------------------------------------- | ---------: | --------: | -------------------------------------------: |
-| [`uform-vl-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers |
-| [`uform-vl-english`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers |
-| [`uform-vl-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers |
-| [`uform-vl-multilingual-v2`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers |
-| [`uform-vl-multilingual`][model-m] | 206M | 12 | 8 text layers, ViT-B/16, 4 multimodal layers |
+### Embedding Models
-[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/
-[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/
-[model-e-s]: https://huggingface.co/unum-cloud/uform-vl-english-small/
-[model-m]: https://huggingface.co/unum-cloud/uform-vl-multilingual/
-[model-m-v2]: https://huggingface.co/unum-cloud/uform-vl-multilingual-v2/
+
### Generative Models
-| Model | Parameters | Purpose | Architecture |
-| :--------------------------------- | ---------: | --------------------------: | ---------------------: |
-| [`uform-gen2-dpo`][model-g2] 🆕 | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
-| [`uform-gen2-qwen-500m`][model-g2] | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
-| [`uform-gen`][model-g1] | 1.5B | Image Captioning, VQA | llama-1.3B, ViT-B/16 |
+
+| Model                  | Parameters | Purpose                     | Architecture           |
+| :--------------------- | ---------: | --------------------------: | ---------------------: |
+| `uform-gen2-dpo` 🆕     | 1.2 B      | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
+| `uform-gen2-qwen-500m` | 1.2 B      | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
+| `uform-gen` ⚠️          | 1.5 B      | Image Captioning, VQA       | llama-1.3B, ViT-B/16   |
+
+## Quick Start Examples
+
+### Embedding Models
-[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/
-[model-g1]: https://huggingface.co/unum-cloud/uform-gen/
+First, `pip install uform`.
+Then, load the model:
-## Producing Embeddings
+```py
+from uform import get_model, Modality
-Add UForm to your dependencies list, or just install it locally:
+processors, models = get_model('unum-cloud/uform3-image-text-english-small')
-```bash
-pip install uform
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
```
-Then, you can use the following code to get embeddings for text and images.
-You can do that either with the PyTorch reference model or the lighter cross-platform ONNX weights.
+Embed images:
-```python
-import uform
+```py
+import requests
+from io import BytesIO
from PIL import Image
-# If you want to use the PyTorch model
-model, processor = uform.get_model('unum-cloud/uform-vl-english-large') # Just English
-model, processor = uform.get_model('unum-cloud/uform-vl-multilingual-v2') # 21 Languages
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+image_data = processor_image(image)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
-# If you want to use the light-weight portable ONNX model
-# Available combinations: cpu & fp32, gpu & fp32, gpu & fp16
-# Check out Unum's Hugging Face space for more details: https://huggingface.co/unum-cloud
-model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-small', 'cpu', 'fp32')
-model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-large', 'gpu', 'fp16')
+Embed queries:
-text = 'a small red panda in a zoo'
-image = Image.open('red_panda.jpg')
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
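+
+To compare the embeddings, you can use cosine similarity.
+Below is a minimal sketch assuming the ONNX backend, where the embeddings come back as NumPy arrays; with the PyTorch backend, call `.detach().cpu().numpy()` on the tensors first.
+
+```py
+import numpy as np
+
+# Normalize both embeddings and take the dot product;
+# values close to 1 indicate a likely match.
+image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1)
+text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1)
+similarity = (image_embedding * text_embedding).sum(axis=1)
+```
+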
-image_data = processor.preprocess_image(image)
-text_data = processor.preprocess_text(text)
+For more details check out:
-image_features, image_embedding = model.encode_image(image_data, return_features=True)
-text_features, text_embedding = model.encode_text(text_data, return_features=True)
-```
+- Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models)
+- JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models)
+- Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models)
+
+### Generative Models
-To search for similar items, the embeddings can be compared using cosine similarity.
-The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match.
-PyTorch provides a built-in function for calculating cosine similarity, while for ONNX, you can use NumPy.
+The generative models are natively compatible with the Hugging Face Transformers library and can be used without installing the UForm package:
```python
-import torch.nn.functional as F
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
-similarity = F.cosine_similarity(image_embedding, text_embedding)
-```
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
-ONNX has no such function, but you can calculate the cosine similarity using [SimSIMD](https://github.com/ashvardanian/simsimd) or manually, with NumPy:
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
-```python
-import numpy as np
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
-image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1)
-text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1)
-similarity = (image_embedding * text_embedding).sum(axis=1)
+with torch.inference_mode():
+ output = model.generate(
+ **inputs,
+ do_sample=False,
+ use_cache=True,
+ max_new_tokens=256,
+ eos_token_id=151645,
+ pad_token_id=processor.tokenizer.pad_token_id
+ )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
```
-### Reranking
+For more details check out:
-Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list.
-The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match.
+- Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models)
+- JavaScript docs on generative models 🔜
+- Swift docs on generative models 🔜
-```python
-score, joint_embedding = model.encode_multimodal(
- image_features=image_features,
- text_features=text_features,
- attention_mask=text_data['attention_mask'],
- return_scores=True,
-)
-```
+## Technical Details
### Down-casting, Quantization, Matryoshka, and Slicing
@@ -153,7 +218,7 @@ Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is
```python
import numpy as np
-f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
f16_embedding: np.ndarray = f32_embedding.astype(np.float16)
i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8)
b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8))
@@ -164,7 +229,7 @@ Alternative approach to quantization is to use the Matryoshka embeddings, where
```python
import numpy as np
-large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy()
+large_embedding: np.ndarray = model.encode_text(text_data, return_features=False)
small_embedding: np.ndarray = large_embedding[:, :256]
tiny_embedding: np.ndarray = large_embedding[:, :64]
```
@@ -219,253 +284,16 @@ You can pick one of many supported [ONNX execution providers][onnx-providers], w
[onnx-providers]: https://onnxruntime.ai/docs/execution-providers/
----
-
-The configuration process may include a few additional steps, depending on the environment.
-When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
-
-```sh
-wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-sudo apt-get update
-sudo apt-get -y install cuda-toolkit-12
-pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
-export CUDA_PATH="/usr/local/cuda-12/bin"
-export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
-export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
-pytest python/scripts/ -s -x -Wd -v -k onnx
-```
-
-[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
-
-## Chat, Image Captioning and Question Answering
-
-UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
-Those models can be used to caption images or power multimodal chat experiences.
-
-```python
-from transformers import AutoModel, AutoProcessor
-
-model = AutoModel.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True)
-processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True)
+### Multimodal Chat in CLI
-prompt = 'Question or Instruction'
-image = Image.open('image.jpg')
-
-inputs = processor(text=[prompt], images=[image], return_tensors='pt')
-
-with torch.inference_mode():
- output = model.generate(
- **inputs,
- do_sample=False,
- use_cache=True,
- max_new_tokens=256,
- eos_token_id=151645,
- pad_token_id=processor.tokenizer.pad_token_id
- )
-prompt_len = inputs['input_ids'].shape[1]
-decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-```
-
-You can check examples of different prompts in our [demo space](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
-
-
-### Image Captioning and Question Answering
-
-__It is the instruction for the first version of UForm-Gen model. We highly recommend you use the new model, instructions for which you can find above.__
-
-
-The generative model can be used to caption images, summarize their content, or answer questions about them.
-The exact behavior is controlled by prompts.
-
-```python
-from uform.gen_model import VLMForCausalLM, VLMProcessor
-
-model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen')
-processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen')
-
-# [cap] Narrate the contents of the image with precision.
-# [cap] Summarize the visual content of the image.
-# [vqa] What is the main subject of the image?
-prompt = '[cap] Summarize the visual content of the image.'
-image = Image.open('zebra.jpg')
-
-inputs = processor(texts=[prompt], images=[image], return_tensors='pt')
-with torch.inference_mode():
- output = model.generate(
- **inputs,
- do_sample=False,
- use_cache=True,
- max_new_tokens=128,
- eos_token_id=32001,
- pad_token_id=processor.tokenizer.pad_token_id
- )
-
-prompt_len = inputs['input_ids'].shape[1]
-decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
-```
-
-### Multimodal Chat
-
-The generative models can be used for chat-like experiences, where the user can provide both text and images as input.
-To use that feature, you can start with the following CLI command:
+The generative models can be used for chat-like experiences in the command line.
+For that, you can use the `uform-chat` CLI tool, which is available in the UForm package.
```bash
-uform-chat --model unum-cloud/uform-gen-chat --image=zebra.jpg
-uform-chat --model unum-cloud/uform-gen-chat \
- --image="https://bit.ly/3tIVg9M" \
- --device="cuda:0" \
- --fp16
+$ pip install uform
+$ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg
+$ uform-chat --model unum-cloud/uform-gen2-dpo \
+> --image="https://bit.ly/3tIVg9M" \
+> --device="cuda:0" \
+> --fp16
```
-
-### Multi-GPU
-
-To achieve higher throughput, you can launch UForm on multiple GPUs.
-For that pick the encoder of the model you want to run in parallel (`text_encoder` or `image_encoder`), and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
-
-```python
-import uform
-
-model, processor = uform.get_model('unum-cloud/uform-vl-english')
-model_image = nn.DataParallel(model.image_encoder)
-
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-model_image.to(device)
-
-_, res = model_image(images, 0)
-```
-
-## Evaluation
-
-### Embedding Models
-
-Few retrieval benchmarks exist for multimodal embeddings.
-The most famous ones for English are "MS-COCO" and "Flickr30k".
-Evaluating `uform-vl-english` model, one can expect the following numbers for search quality.
-
-| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 |
-| :------- | ---------: | ---------: | ----------: |
-| Flickr | 0.727 | 0.915 | 0.949 |
-| MS-COCO¹ | 0.510 | 0.761 | 0.838 |
-
-
-For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository².
-Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model.
-
-| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
-| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: |
-| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
-| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
-| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
-| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
-| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
-| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
-
-
-
-All languages.
-
-
-| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers |
-| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: |
-| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M |
-| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M |
-| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M |
-| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M |
-| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M |
-| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M |
-| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M |
-| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M |
-| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M |
-| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M |
-| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M |
-| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M |
-| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M |
-| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M |
-| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M |
-| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M |
-| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M |
-| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M |
-| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M |
-| Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M |
-| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M |
-| | | | | | | | |
-| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - |
-| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - |
-| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - |
-| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - |
-
-
-
-### Generative Models
-
-| Model | LLM Size | SQA | MME | MMBench | Average¹ |
-| :------------------- | -------: | ---: | -----: | ------: | -------: |
-| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 |
-| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 |
-| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 |
-
-For captioning evaluation we measure CLIPScore and RefCLIPScore³.
-
-| Model | Size | Caption Length | CLIPScore | RefCLIPScore |
-| :---------------------------------- | ---: | -------------: | --------: | -----------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 |
-| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 |
-| |
-| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 |
-| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 |
-| |
-| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 |
-| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 |
-| |
-| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 |
-| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 |
-
-Results for VQAv2 evaluation.
-
-| Model | Size | Accuracy |
-| :------------------------- | ---: | -------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 |
-| `unum-cloud/uform-gen` | 1.5B | 66.5 |
-
-
-
-> ¹ Train split was in training data.
-> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
-> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model.
-
-## Speed
-
-On Nvidia RTX 3090, the following performance is expected on text encoding.
-
-| Model | Multilingual | Speed | Speedup |
-| :---------------------------------------- | -----------: | ---------------------: | ---------: |
-| `bert-base-uncased` | No | 1'612 sequences/second | |
-| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 |
-| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 |
-| `unum-cloud/uform-vl-multilingual-v2` | __Yes__ | 6'809 sequences/second | __x 4.22__ |
-
-On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding.
-
-| Model | Size | Speed | Speedup |
-| :---------------------------------- | ---: | ------------------: | --------: |
-| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | |
-| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | |
-| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ |
-
-Given the small size of the model it also work well on mobile devices.
-On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards.
-
-| Device | Speed | Device TDP | Efficiency |
-| :--------------------- | ------------------: | ---------: | ----------------: |
-| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule |
-| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule |
-| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule |
-| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule |
-
-> [!WARNING]
-> The above numbers are for reference only and are not guaranteed to be accurate.
-
-## License
-
-All models come under the same license as the code - Apache 2.0.
diff --git a/docs/_static/custom.js b/docs/_static/custom.js
index b909a1d..3dd0974 100644
--- a/docs/_static/custom.js
+++ b/docs/_static/custom.js
@@ -3,5 +3,5 @@ $(document).ready(function () {
`
- $(".sidebar-brand-text").html("Unum · UForm $(VERSION) " + github_logo)
+ $(".sidebar-brand-text").html("Unum · UForm 2.1.1 " + github_logo)
})
diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst
new file mode 100644
index 0000000..7683788
--- /dev/null
+++ b/docs/benchmarks.rst
@@ -0,0 +1,5 @@
+====================
+Benchmarks
+====================
+
+.. mdinclude:: ../BENCHMARKS.md
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
index acc061e..f9061f5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -5,12 +5,11 @@
# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-import toml
project = "Unum · UForm"
copyright = "2023, Unum"
author = "Unum"
-release = toml.load("../pyproject.toml")["project"]["version"]
+release = open("../VERSION", "r").read().strip()
with open("_static/custom.js", "r+") as js:
content = js.read()
js.seek(0)
@@ -24,6 +23,7 @@
"breathe",
"m2r2",
"sphinx.ext.autodoc",
+ "sphinx_js",
"sphinx.ext.autosummary",
"sphinx.ext.intersphinx",
"sphinx.ext.napoleon",
@@ -44,6 +44,9 @@
html_static_path = ["_static"]
html_css_files = ["custom.css"]
html_js_files = ["custom.js"]
+html_baseurl = "/docs/uform/"
breathe_projects = {"UForm": "../build/xml"}
breathe_default_project = "UForm"
+
+js_source_path = "../javascript/"
diff --git a/docs/contributing.rst b/docs/contributing.rst
new file mode 100644
index 0000000..48893cf
--- /dev/null
+++ b/docs/contributing.rst
@@ -0,0 +1,5 @@
+====================
+Contributing
+====================
+
+.. mdinclude:: ../CONTRIBUTING.md
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index 162bbee..d3da0ec 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -1,11 +1,25 @@
-==========
+====================
Overview
-==========
+====================
.. mdinclude:: ../README.md
-.. toctree::
+.. toctree::
:hidden:
+    :caption: SDKs
+
+ python/index
+ javascript/index
+ swift/index
+
+.. toctree::
+ :hidden:
+    :caption: Project
+
+ contributing
+ benchmarks
+
+.. toctree::
+ :hidden:
+    :caption: Index
- self
- reference
genindex
diff --git a/docs/javascript/index.rst b/docs/javascript/index.rst
new file mode 100644
index 0000000..771081c
--- /dev/null
+++ b/docs/javascript/index.rst
@@ -0,0 +1,9 @@
+====================
+JavaScript SDK
+====================
+
+
+.. mdinclude:: ../../javascript/README.md
+
+.. toctree::
+ :hidden:
diff --git a/docs/javascript/reference.rst.txt b/docs/javascript/reference.rst.txt
new file mode 100644
index 0000000..356176a
--- /dev/null
+++ b/docs/javascript/reference.rst.txt
@@ -0,0 +1,18 @@
+API Reference
+====================
+
+====================
+Encoders
+====================
+
+.. js:autoclass:: ../javascript/encoders.TextProcessor
+ :members:
+
+.. js:autoclass:: ../javascript/encoders.ImageProcessor
+ :members:
+
+.. js:autoclass:: ../javascript/encoders.TextEncoder
+ :members:
+
+.. js:autoclass:: ../javascript/encoders.ImageEncoder
+ :members:
diff --git a/docs/python/index.rst b/docs/python/index.rst
new file mode 100644
index 0000000..5f870d1
--- /dev/null
+++ b/docs/python/index.rst
@@ -0,0 +1,11 @@
+====================
+Python SDK
+====================
+
+
+.. mdinclude:: ../../python/README.md
+
+.. toctree::
+ :hidden:
+
+ reference
\ No newline at end of file
diff --git a/docs/python/reference.rst b/docs/python/reference.rst
new file mode 100644
index 0000000..d580583
--- /dev/null
+++ b/docs/python/reference.rst
@@ -0,0 +1,42 @@
+API Reference
+====================
+
+====================
+Root
+====================
+
+.. automodule:: uform
+ :members:
+ :undoc-members:
+
+====================
+Torch Encoders
+====================
+
+.. automodule:: uform.torch_encoders
+ :members:
+ :undoc-members:
+
+====================
+Torch Processors
+====================
+
+.. automodule:: uform.torch_processors
+ :members:
+ :undoc-members:
+
+====================
+ONNX Encoders
+====================
+
+.. automodule:: uform.onnx_encoders
+ :members:
+ :undoc-members:
+
+====================
+NumPy Processors
+====================
+
+.. automodule:: uform.numpy_processors
+ :members:
+ :undoc-members:
diff --git a/docs/reference.rst b/docs/reference.rst
deleted file mode 100644
index 5828f41..0000000
--- a/docs/reference.rst
+++ /dev/null
@@ -1,6 +0,0 @@
-API Reference
-==============
-
-.. automodule:: uform
- :members:
- :undoc-members:
diff --git a/docs/swift/index.rst b/docs/swift/index.rst
new file mode 100644
index 0000000..5f2e213
--- /dev/null
+++ b/docs/swift/index.rst
@@ -0,0 +1,6 @@
+====================
+Swift SDK
+====================
+
+
+.. mdinclude:: ../../swift/README.md
diff --git a/javascript/README.md b/javascript/README.md
new file mode 100644
index 0000000..0ef5c54
--- /dev/null
+++ b/javascript/README.md
@@ -0,0 +1,67 @@
+# UForm for JavaScript
+
+The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications.
+Built around ONNX, the SDK is designed to work with most JavaScript runtimes and almost any hardware.
+
+## Installation
+
+There are several ways to install the UForm JavaScript SDK from NPM.
+
+```bash
+pnpm add uform
+npm add uform
+yarn add uform
+```
+
+## Quick Start
+
+### Embeddings
+
+```js
+import { getModel, Modality } from 'uform';
+import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from 'uform';
+
+const { configPath, modalityPaths, tokenizerPath } = await getModel({
+ modelId: 'unum-cloud/uform3-image-text-english-small',
+ modalities: [Modality.TextEncoder, Modality.ImageEncoder],
+ token: null, // Optional Hugging Face token for private models
+ saveDir: null, // Optional directory to save the model to
+});
+
+const textProcessor = new TextProcessor(configPath, tokenizerPath);
+await textProcessor.init();
+const processedTexts = await textProcessor.process("a small red panda in a zoo");
+
+const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+await textEncoder.init();
+const textOutput = await textEncoder.encode(processedTexts);
+assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
+await textEncoder.dispose();
+
+const imageProcessor = new ImageProcessor(configPath);
+await imageProcessor.init();
+const processedImages = await imageProcessor.process("path/to/image.png");
+
+const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+await imageEncoder.init();
+const imageOutput = await imageEncoder.encode(processedImages);
+assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
+```
+
+The `textOutput` and `imageOutput` would contain `features` and `embeddings` properties, which are the same as the `features` and `embeddings` properties in the Python SDK.
+The embeddings can later be compared using the cosine similarity or other distance metrics.
+
+### Generative Models
+
+Coming soon ...
+
+## Technical Details
+
+### Faster Search
+
+Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall.
+Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search.
+In such cases, consider using [USearch][github-usearch] or [SimSIMD][github-simsimd].
+
+[github-usearch]: https://github.com/unum-cloud/usearch
+[github-simsimd]: https://github.com/ashvardanian/simsimd
diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs
new file mode 100644
index 0000000..3c41636
--- /dev/null
+++ b/javascript/encoders.mjs
@@ -0,0 +1,311 @@
+import { readFileSync } from 'fs';
+import { InferenceSession, Tensor } from 'onnxruntime-node';
+import { PreTrainedTokenizer } from '@xenova/transformers';
+import sharp from 'sharp';
+
+/**
+ * A processor for text data that prepares input for the text encoder model.
+ */
+class TextProcessor {
+
+ /**
+ * Constructs a new TextProcessor instance.
+ *
+ * @param {string} configPath - The path to the configuration file for the text encoder.
+ * @param {string} tokenizerPath - The path to the tokenizer configuration file.
+ */
+ constructor(configPath, tokenizerPath) {
+ this.configPath = configPath;
+ this.tokenizerPath = tokenizerPath;
+
+ this.maxSeqLen = 0;
+ this.padTokenIdx = 0;
+ this.tokenizer = null;
+ }
+
+ /**
+ * Initializes the TextProcessor by loading configurations and setting up the tokenizer.
+ */
+ async init() {
+ var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' }));
+ if (config.text_encoder !== undefined) {
+ config = config.text_encoder;
+ }
+
+ this.maxSeqLen = config.max_position_embeddings;
+ this.padTokenIdx = config.padding_idx;
+
+ const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' }));
+ this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config);
+ this.tokenizer.model_max_length = this.maxSeqLen;
+ this.tokenizer.pad_token_id = this.padTokenIdx;
+ }
+
+ /**
+ * Processes a list of text strings into model-ready format, including padding and attention masks.
+ *
+ * @param {Array} texts - An array of text strings to process.
+ * @return {Object} The processed texts as model input features.
+ */
+ async process(texts) {
+
+ const encoded = await this.tokenizer(texts, {
+ add_special_tokens: true,
+ padding: 'max_length',
+ max_length: this.maxSeqLen,
+ truncation: true,
+ });
+
+ return {
+ 'input_ids': encoded.input_ids,
+ 'attention_mask': encoded.attention_mask,
+ };
+ }
+}
+
+/**
+ * An encoder for text data that uses a pre-trained model to encode text.
+ */
+class TextEncoder {
+
+ /**
+ * Constructs a new TextEncoder instance.
+ *
+ * @param {string} modelPath - The path to the pre-trained ONNX model.
+ */
+ constructor(modelPath) {
+ this.modelPath = modelPath;
+ this.session = null;
+ }
+
+ /**
+ * Initializes the ONNX session with the pre-trained model.
+ */
+ async init() {
+ this.session = await InferenceSession.create(this.modelPath);
+ }
+
+ /**
+ * Releases the ONNX session resources.
+ */
+ async dispose() {
+ if (this.session) {
+ await this.session.release();
+ this.session = null;
+ }
+ }
+
+ /**
+ * Encodes the input data using the pre-trained model.
+ *
+ * @param {Object} inputs - The input data containing input_ids and attention_mask.
+ * @return {Object} The encoded outputs from the model.
+ */
+ async encode(inputs) {
+ if (!this.session) {
+ throw new Error("Session is not initialized.");
+ }
+
+ // Helper function to convert BigInt64Array to Int32Array or validate Int32Array
+ function ensureInt32Array(data) {
+ if (data instanceof Int32Array) {
+ return data; // Use as is if already Int32Array
+ }
+ if (data instanceof BigInt64Array) {
+ // Convert BigInt64Array to Int32Array, ensuring all values are in range
+ return new Int32Array(Array.from(data).map(bigInt => {
+ if (bigInt > 2147483647n || bigInt < -2147483648n) {
+ throw new Error("Value out of range for Int32.");
+ }
+ return Number(bigInt); // Convert BigInt to Number
+ }));
+ }
+ // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array
+ if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) {
+ return new Int32Array(data); // Convert directly
+ }
+ throw new Error("Unsupported data type for tensor conversion.");
+ }
+
+ // Prepare tensor data
+ const inputIDsData = ensureInt32Array(inputs.input_ids.data);
+ const attentionMaskData = ensureInt32Array(inputs.attention_mask.data);
+
+ // Create ONNX Tensors as 'int32'
+ const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims);
+ const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims);
+
+ // Run model inference
+ return this.session.run({
+ input_ids: inputIDs,
+ attention_mask: attentionMask,
+ });
+ }
+
+}
+
+/**
+ * A processor for image data that prepares images for the image encoder model.
+ */
+class ImageProcessor {
+ constructor(configPath) {
+ this.configPath = configPath;
+ }
+
+ /**
+ * Initializes the ImageProcessor by loading configuration settings for image preprocessing.
+ */
+ async init() {
+ var config = JSON.parse(readFileSync(this.configPath, 'utf8'));
+ if (config.image_encoder !== undefined) {
+ config = config.image_encoder;
+ }
+
+ this.imageSize = config.image_size;
+ this.normalizationMeans = config.normalization_means;
+ this.normalizationDeviations = config.normalization_deviations;
+
+ this.imageMean = new Float32Array(this.normalizationMeans);
+ this.imageStd = new Float32Array(this.normalizationDeviations);
+ }
+ /**
+ * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing.
+ *
+ * @param {Buffer|Array} images - A single image or an array of images to process.
+ * @return {Array} The processed image data as an array of Float32Arrays.
+ */
+ async process(images) {
+ const processSingle = async (image) => {
+ let img = sharp(image).toColorspace('srgb');
+ const metadata = await img.metadata();
+ const scale = this.imageSize / Math.min(metadata.width, metadata.height);
+ const scaledWidth = Math.ceil(metadata.width * scale);
+ const scaledHeight = Math.ceil(metadata.height * scale);
+ img = img.resize({
+ width: scaledWidth,
+ height: scaledHeight,
+ fit: sharp.fit.cover,
+ position: sharp.strategy.entropy,
+ options: sharp.interpolators.bicubic
+ }).extract({
+ left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)),
+ top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)),
+ width: this.imageSize,
+ height: this.imageSize
+ }).removeAlpha();
+
+ let buffer = await img.raw().toBuffer();
+ let array = new Float32Array(buffer.length);
+
+ // When we export into the `array`, we reorder the dimensions of the tensor
+ // from HWC to CHW, and normalize the pixel values.
+ let channelSize = this.imageSize * this.imageSize;
+ for (let i = 0; i < this.imageSize * this.imageSize; i++) {
+ let r = buffer[i * 3];
+ let g = buffer[i * 3 + 1];
+ let b = buffer[i * 3 + 2];
+ array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0];
+ array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1];
+ array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2];
+ }
+
+ return array;
+ };
+
+ if (Array.isArray(images)) {
+ return Promise.all(images.map(img => processSingle(img)));
+ } else {
+ return [await processSingle(images)];
+ }
+ }
+}
+
+/**
+ * An encoder for image data that uses a pre-trained model to encode images.
+ */
+class ImageEncoder {
+ constructor(modelPath, processor) {
+ this.modelPath = modelPath;
+ this.imageSize = processor.imageSize;
+ }
+
+ /**
+ * Initializes the ONNX session with the pre-trained model.
+ */
+ async init() {
+ this.session = await InferenceSession.create(this.modelPath);
+ }
+
+ /**
+ * Releases the ONNX session resources.
+ */
+ async dispose() {
+ if (this.session) {
+ await this.session.release();
+ this.session = null;
+ }
+ }
+
+ /**
+ * Encodes the processed image data using the pre-trained model.
+ *
+ * @param {Float32Array|Array} images - The processed image data.
+ * @return {Object} The encoded outputs from the model.
+ */
+ async encode(images) {
+ if (!this.session) {
+ throw new Error("Session is not initialized.");
+ }
+
+ // Helper function to ensure data is a Float32Array.
+ const ensureFloat32Array = (data) => {
+ if (!(data instanceof Float32Array)) {
+ throw new Error("Unsupported data type for tensor conversion.");
+ }
+ return data;
+ };
+
+ // Helper function to concatenate multiple Float32Arrays into a single Float32Array.
+ const concatFloat32Arrays = (arrays) => {
+ const totalLength = arrays.reduce((acc, val) => acc + val.length, 0);
+ const result = new Float32Array(totalLength);
+ let offset = 0;
+ for (let arr of arrays) {
+ result.set(arr, offset);
+ offset += arr.length;
+ }
+ return result;
+ };
+
+ let imagesData;
+ let dims;
+
+ if (Array.isArray(images)) {
+ // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size.
+ const arrays = images.map(ensureFloat32Array);
+ imagesData = concatFloat32Arrays(arrays);
+ const numImages = arrays.length;
+ const numChannels = 3;
+ const height = this.imageSize;
+ const width = this.imageSize;
+ dims = [numImages, numChannels, height, width];
+ } else {
+ // Single image images, which is already a Float32Array.
+ imagesData = ensureFloat32Array(images);
+ const numChannels = 3;
+ const height = this.imageSize;
+ const width = this.imageSize;
+ dims = [1, numChannels, height, width];
+ }
+
+ // Create ONNX Tensor
+ const imagesTensor = new Tensor('float32', imagesData, dims);
+
+ // Run model inference
+ return this.session.run({
+ images: imagesTensor,
+ });
+ }
+}
+
+export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder };
diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js
new file mode 100644
index 0000000..30ea96a
--- /dev/null
+++ b/javascript/encoders_test.js
@@ -0,0 +1,233 @@
+import { existsSync, readFileSync } from 'fs';
+import { fileURLToPath } from 'url';
+import path from 'path';
+import assert from 'assert';
+import fetch from 'node-fetch';
+
+import { getModel, Modality } from "./hub.mjs";
+import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs";
+
+// Check if the HuggingFace Hub API token is set in the environment variable.
+let hf_token = process.env.HUGGINGFACE_HUB_TOKEN;
+if (!hf_token) {
+ const dirname = path.dirname(fileURLToPath(import.meta.url));
+ const tokenPath = path.join(dirname, '../', '.hf_token');
+ if (existsSync(tokenPath)) {
+ hf_token = readFileSync(tokenPath, 'utf8').trim();
+ }
+}
+
+async function tryGettingCheckpoint(modelId, modalities) {
+ const { configPath, modalityPaths, tokenizerPath } = await getModel(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ assert(configPath !== null, "Config path should not be null");
+ assert(modalityPaths !== null, "Modality paths should not be null");
+ assert(tokenizerPath !== null, "Tokenizer path should not be null");
+
+ // Check if the file actually exists
+ assert(existsSync(configPath), `Config file should exist at ${configPath}`);
+ assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`);
+ for (const modalityPath of Object.values(modalityPaths)) {
+ assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`);
+ }
+}
+
+async function testGetCheckpoint() {
+ console.log("- `testGetCheckpoint`: Start");
+
+ try {
+ const modalities = [Modality.TextEncoder, Modality.ImageEncoder];
+
+ for (const modelId of [
+ 'unum-cloud/uform3-image-text-english-small',
+ 'unum-cloud/uform3-image-text-english-base',
+ 'unum-cloud/uform3-image-text-english-large',
+ 'unum-cloud/uform3-image-text-multilingual-base',
+ ]) {
+ await tryGettingCheckpoint(modelId, modalities);
+ }
+
+ console.log("- `testGetCheckpoint`: Success");
+ } catch (error) {
+ console.error("- `testGetCheckpoint`: Failed", error);
+ }
+}
+
+async function tryTextEncoderForwardPass(modelId) {
+ const modalities = [Modality.TextEncoder];
+ const { configPath, modalityPaths, tokenizerPath } = await getModel(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ const textProcessor = new TextProcessor(configPath, tokenizerPath);
+ await textProcessor.init();
+ const processedTexts = await textProcessor.process("a small red panda in a zoo");
+
+ const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+ await textEncoder.init();
+ const textOutput = await textEncoder.encode(processedTexts);
+ assert(textOutput.embeddings.dims.length === 2, "Output should be 2D");
+
+ await textEncoder.dispose();
+}
+
+async function tryImageEncoderForwardPass(modelId) {
+ const modalities = [Modality.ImageEncoder];
+ const { configPath, modalityPaths } = await getModel(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ const imageProcessor = new ImageProcessor(configPath);
+ await imageProcessor.init();
+ const processedImages = await imageProcessor.process("assets/unum.png");
+
+ const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+ await imageEncoder.init();
+ const imageOutput = await imageEncoder.encode(processedImages);
+ assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D");
+
+ await imageEncoder.dispose();
+}
+
+function cosineSimilarity(vecA, vecB) {
+ // We may be receiving a complex tensor type, so let's check if it
+ // has an array member named `data`.
+ if (vecA.data) {
+ vecA = vecA.data;
+ }
+ if (vecB.data) {
+ vecB = vecB.data;
+ }
+
+ let dotProduct = 0.0;
+ let normA = 0.0;
+ let normB = 0.0;
+ for (let i = 0; i < vecA.length; i++) {
+ dotProduct += vecA[i] * 1.0 * vecB[i];
+ normA += vecA[i] * 1.0 * vecA[i];
+ normB += vecB[i] * 1.0 * vecB[i];
+ }
+ if (normA === 0 || normB === 0) {
+ return 0;
+ } else {
+ return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB));
+ }
+}
+
+async function fetchImage(url) {
+ const response = await fetch(url);
+ const arrayBuffer = await response.arrayBuffer();
+ const buffer = Buffer.from(arrayBuffer);
+ return buffer;
+}
+
+async function tryCrossReferencingImageAndText(modelId) {
+
+ const modalities = [Modality.ImageEncoder, Modality.TextEncoder];
+ const { configPath, modalityPaths, tokenizerPath } = await getModel(
+ modelId,
+ modalities,
+ hf_token,
+ '.onnx'
+ );
+
+ const imageProcessor = new ImageProcessor(configPath);
+ await imageProcessor.init();
+ const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor);
+ await imageEncoder.init();
+ const textProcessor = new TextProcessor(configPath, tokenizerPath);
+ await textProcessor.init();
+ const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor);
+ await textEncoder.init();
+
+ const texts = [
+ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
+ "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
+ "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
+ "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
+ ];
+ const imageUrls = [
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
+ ];
+
+ const textEmbeddings = [];
+ const imageEmbeddings = [];
+
+ for (let i = 0; i < texts.length; i++) {
+ const text = texts[i];
+ const imageUrl = imageUrls[i];
+ const imageBuffer = await fetchImage(imageUrl);
+
+ const processedText = await textProcessor.process(text);
+ const processedImage = await imageProcessor.process(imageBuffer);
+
+ const textEmbedding = await textEncoder.encode(processedText);
+ const imageEmbedding = await imageEncoder.encode(processedImage);
+
+ textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data));
+ imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data));
+
+ // Print-based debugging at its best :)
+ // console.log(`Text: ${text}, Image: ${imageUrl}`);
+ // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`);
+ // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`);
+ console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`)
+ }
+
+ for (let i = 0; i < texts.length; i++) {
+ const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]);
+ const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i]));
+ const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie));
+
+ const maxOtherTextSimilarity = Math.max(...otherTextSimilarities);
+ const maxOtherImageSimilarity = Math.max(...otherImageSimilarities);
+
+ assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images.");
+ assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts.");
+ }
+
+ await textEncoder.dispose();
+ await imageEncoder.dispose();
+}
+
+async function testEncoders() {
+ console.log("- `testEncoders`: Start");
+
+ try {
+
+ // Go through the bi-modal models
+ for (const modelId of [
+ 'unum-cloud/uform3-image-text-english-small',
+ // 'unum-cloud/uform3-image-text-english-base',
+ // 'unum-cloud/uform3-image-text-english-large',
+ // 'unum-cloud/uform3-image-text-multilingual-base',
+ ]) {
+ await tryTextEncoderForwardPass(modelId);
+ await tryImageEncoderForwardPass(modelId);
+ await tryCrossReferencingImageAndText(modelId);
+ }
+
+ console.log("- `testEncoders`: Success");
+ } catch (error) {
+ console.error("- `testEncoders`: Failed", error);
+ }
+}
+
+testGetCheckpoint();
+testEncoders();
diff --git a/javascript/hub.mjs b/javascript/hub.mjs
new file mode 100644
index 0000000..a59fb73
--- /dev/null
+++ b/javascript/hub.mjs
@@ -0,0 +1,104 @@
+import { join } from "path"
+import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs";
+
+import { downloadFile, listFiles } from "@huggingface/hub";
+
+const Modality = {
+ TextEncoder: "text_encoder",
+ ImageEncoder: "image_encoder",
+ VideoEncoder: "video_encoder",
+ TextDecoder: "text_decoder",
+};
+
+function isModality(value) {
+ return Object.values(Modality).includes(value);
+}
+
+function normalizeModalities(modalities) {
+ return modalities.map(x => {
+ if (typeof x === "string") {
+ if (isModality(x)) {
+ return x;
+ } else {
+ throw new Error(`Invalid modality: ${x}`);
+ }
+ }
+ return x;
+ });
+}
+
+async function ensureDirectoryExists(dirPath) {
+ if (!existsSync(dirPath)) {
+ mkdirSync(dirPath, { recursive: true });
+ }
+}
+
+async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') {
+ modalities = normalizeModalities(modalities);
+
+ const configNames = ['config.json'];
+ const tokenizerNames = ['tokenizer.json'];
+ const modelFileNames = modalities.map(modality => `${modality}${format}`);
+ const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames];
+
+ const repo = { type: "model", name: modelId };
+ const credentials = token ? { accessToken: token } : undefined;
+
+ let configPath = null;
+ let tokenizerPath = null;
+ const modalityPaths = {};
+ const modelSaveDir = join(saveDir, modelId);
+
+ await ensureDirectoryExists(modelSaveDir);
+
+ const fileIterator = listFiles({ repo, recursive: true, credentials });
+ for await (const file of fileIterator) {
+ const fileName = file.path.split('/').pop();
+ if (fileName && allowedPatterns.includes(fileName)) {
+ const filePath = file.path;
+ const savePath = join(modelSaveDir, fileName);
+
+ if (configNames.includes(fileName)) {
+ configPath = savePath;
+ } else if (tokenizerNames.includes(fileName)) {
+ tokenizerPath = savePath;
+ } else {
+ const modalityName = fileName.split('.')[0];
+ modalityPaths[modalityName] = savePath;
+ }
+
+ const response = await downloadFile({ repo, path: filePath, credentials });
+ if (response) {
+ // HuggingFace might be defining the `env.localModelPath` variable
+ // to store the downloaded files in a local directory.
+ // Let's check if the file is there.
+ // const localPath = join(env.localModelPath, repo, filePath);
+ // if (existsSync(localPath)) {
+ // console.log(`File already exists locally at ${localPath}`);
+ // }
+
+ if (response.body && response.body.pipe) {
+ const fileStream = createWriteStream(savePath);
+ response.body.pipe(fileStream);
+ await new Promise((resolve, reject) => {
+ fileStream.on('finish', resolve);
+ fileStream.on('error', reject);
+ });
+ } else if (response.arrayBuffer) {
+ // Handle non-streamable response for environments like Node.js
+ const buffer = await response.arrayBuffer();
+ writeFileSync(savePath, Buffer.from(buffer));
+ } else {
+ console.error('Unexpected response type');
+ }
+ console.log(`Downloaded ${fileName} successfully to ${savePath}`);
+ } else {
+ console.log('No response received for the file download request.');
+ }
+ }
+ }
+
+ return { configPath, modalityPaths, tokenizerPath };
+}
+
+export { getModel, Modality };
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..948550b
--- /dev/null
+++ b/package.json
@@ -0,0 +1,33 @@
+{
+ "name": "uform",
+ "type": "module",
+ "private": true,
+ "version": "2.0.2",
+ "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation",
+ "dependencies": {
+ "@huggingface/hub": "^0.14.8",
+ "@xenova/transformers": "^2.17.0",
+ "node-fetch": "^3.3.2",
+ "onnxruntime-node": "^1.17.0",
+ "onnxruntime-web": "^1.17.3"
+ },
+ "devDependencies": {
+ "nodemon": "^2.0.15"
+ },
+ "scripts": {
+ "start": "node javascript/encoders.mjs",
+ "test": "node javascript/encoders_test.js"
+ },
+ "main": "javascript/encoders.mjs",
+ "directories": {
+ "doc": "docs"
+ },
+ "keywords": [
+ "AI",
+ "multimodal",
+ "content generation",
+ "huggingface"
+ ],
+ "author": "Ash Vardanian, Unum Cloud",
+ "license": "Apache-2.0"
+}
diff --git a/pyproject.toml b/pyproject.toml
index 10f7a9b..fef02d3 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,7 +31,8 @@ classifiers = [
dependencies = [
"huggingface_hub>=0.16.4",
"tokenizers>=0.13.3",
- "pillow"
+ "pillow",
+ "simsimd",
]
description = "Pocket-Sized Multimodal AI for Content Understanding and Generation"
maintainers = [
@@ -49,6 +50,7 @@ uform-chat = "uform.chat:main"
torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"]
onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"]
onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"]
+dev = ["pytest", "pandas"]
[project.urls]
"Homepage" = "https://github.com/unum-cloud/uform"
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..dd7611d
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,148 @@
+# UForm Python SDK
+
+The UForm SDK offers a simple way to integrate multimodal AI capabilities into your Python applications.
+The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware.
+
+## Installation
+
+There are several ways to install the UForm Python SDK, depending on the backend you want to use.
+PyTorch is by far the heaviest, but the most capable.
+ONNX is a lightweight alternative that can run on any CPU, and on some GPUs.
+
+```bash
+pip install "uform[torch]" # For PyTorch
+pip install "uform[onnx]" # For ONNX on CPU
+pip install "uform[onnx-gpu]" # For ONNX on GPU, available for some platforms
+pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests
+```
+
+## Quick Start
+
+### Embeddings
+
+Load the model:
+
+```py
+from uform import get_model, Modality
+
+model_name = 'unum-cloud/uform3-image-text-english-small'
+modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
+processors, models = get_model(model_name, modalities=modalities)
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+```
+
+Embed images:
+
+```py
+import requests
+from io import BytesIO
+from PIL import Image
+
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+image_data = processor_image(image)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
+
+Embed queries:
+
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
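+
+To compare the two modalities, you can score the embeddings with cosine similarity.
+Below is a minimal NumPy sketch, assuming `image_embedding` and `text_embedding` are (or have been converted to) single-row NumPy arrays; with the PyTorch backend, call `.detach().cpu().numpy()` on them first.
+
+```py
+import numpy as np
+
+def cosine_similarity(a, b) -> float:
+    # Flatten possible 1 x d matrices into plain vectors
+    a = np.asarray(a, dtype=np.float32).ravel()
+    b = np.asarray(b, dtype=np.float32).ravel()
+    # The dot product of L2-normalized vectors is their cosine similarity
+    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+
+print(f"Image-text similarity: {cosine_similarity(image_embedding, text_embedding):.3f}")
+```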
+
+### Generative Models
+
+UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library.
+Those models can be used to caption images or power multimodal chat experiences.
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
+
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
+
+with torch.inference_mode():
+ output = model.generate(
+ **inputs,
+ do_sample=False,
+ use_cache=True,
+ max_new_tokens=256,
+ eos_token_id=151645,
+ pad_token_id=processor.tokenizer.pad_token_id
+ )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+```
+
+You can check examples of different prompts in our demo Gradio spaces on HuggingFace:
+
+- for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
+- for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo)
+
+## Technical Details
+
+### Multi-GPU Parallelism
+
+To achieve higher throughput, you can launch UForm on multiple GPUs.
+For that, pick the encoder of the model you want to run in parallel and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`).
+
+```python
+from uform import get_model, Modality
+import torch.nn as nn
+
+processors, models = get_model('unum-cloud/uform3-image-text-english-small', backend='torch')
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+
+model_text.return_features = False
+model_image.return_features = False
+model_text_parallel = nn.DataParallel(model_text)
+model_image_parallel = nn.DataParallel(model_image)
+```
+
+Since we are now dealing with the PyTorch wrapper, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays.
+
+```python
+from typing import List
+from PIL.Image import Image
+
+def get_image_embedding(images: List[Image]):
+ preprocessed = processor_image(images)
+ embedding = model_image_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+
+def get_text_embedding(texts: List[str]):
+ preprocessed = processor_text(texts)
+ embedding = model_text_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+```
+
+### ONNX and CUDA
+
+The configuration process may include a few additional steps, depending on the environment.
+When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
+
+```sh
+wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+sudo apt-get update
+sudo apt-get -y install cuda-toolkit-12
+pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/
+export CUDA_PATH="/usr/local/cuda-12/bin"
+export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}"
+export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+pytest python/scripts/ -s -x -Wd -v -k onnx
+```
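+
+Once the toolkit and `onnxruntime-gpu` are installed, the ONNX encoders can be placed on the GPU through the same `get_model` entry point.
+The sketch below mirrors the call used in `python/scripts/bench_encoders.py`; the supported devices depend on your ONNX Runtime build, so treat it as a starting point rather than a guarantee.
+
+```py
+from uform import get_model, Modality
+
+processors, models = get_model(
+    'unum-cloud/uform3-image-text-english-small',
+    modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER],
+    backend='onnx',  # run through ONNX Runtime instead of PyTorch
+    device='cuda',   # the benchmark script catches ExecutionProviderError if a provider is missing
+)
+
+model_image = models[Modality.IMAGE_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+```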
+
+[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu
diff --git a/python/scripts/bench.py b/python/scripts/bench_decoders.py
similarity index 60%
rename from python/scripts/bench.py
rename to python/scripts/bench_decoders.py
index 49c7004..0842ba9 100644
--- a/python/scripts/bench.py
+++ b/python/scripts/bench_decoders.py
@@ -1,6 +1,8 @@
from functools import partial
from time import perf_counter
+from dataclasses import dataclass
from typing import List
+import argparse
import requests
import torch
@@ -10,18 +12,38 @@
InstructBlipForConditionalGeneration,
InstructBlipProcessor,
LlavaForConditionalGeneration,
+ AutoModel,
+ AutoProcessor,
)
-from uform import get_model
-from uform.gen_model import VLMForCausalLM, VLMProcessor
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor
dtype = torch.bfloat16
low_cpu_mem_usage = False
device = "cuda:0"
-def caption(model, processor, prompt: str, image: Image.Image) -> str:
- inputs = processor(prompt, image, return_tensors="pt")
+@dataclass
+class BenchmarkResult:
+ model_name: str
+ device_name: str
+ backend_name: str
+ duration_image_preprocessing: float
+ duration_image_embedding: float
+ duration_text_preprocessing: float
+ duration_text_embedding: float
+
+
+def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]:
+ # BLIP models require the prompt to be the first argument
+ prompt = [prompt] * batch_size
+ image = [image] * batch_size
+ try:
+ inputs = processor(prompt, image, return_tensors="pt")
+ except ValueError:
+ inputs = processor(image, prompt, return_tensors="pt")
+
+ # Downcast and move to device
for possible_key in ["images", "pixel_values"]:
if possible_key not in inputs:
continue
@@ -33,19 +55,20 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str:
**inputs,
do_sample=False,
# use_cache=True,
- max_new_tokens=128,
+ max_new_tokens=max_length,
eos_token_id=32001,
pad_token_id=processor.tokenizer.pad_token_id,
)
prompt_len = inputs["input_ids"].shape[1]
- decoded_text = processor.batch_decode(
+ decoded_texts = processor.batch_decode(
output[:, prompt_len:],
skip_special_tokens=True,
- )[0].strip()
- return decoded_text
+ )
+ return decoded_texts
def duration(callable):
+ """Profile the duration of a callable and return the duration and the result."""
start = perf_counter()
result = callable()
stop = perf_counter()
@@ -57,49 +80,35 @@ def bench_captions(
processor,
prompt: str,
images: List[Image.Image],
+ max_length: int = 256,
+ batch_size: int = 10,
) -> List[str]:
total_duration = 0
total_length = 0
model = torch.compile(model)
- def caption_image(image, model=model, processor=processor, prompt=prompt):
- return caption(model=model, processor=processor, prompt=prompt, image=image)
+ def caption_image(image):
+ return caption(
+ model=model,
+ processor=processor,
+ prompt=prompt,
+ image=image,
+ max_length=max_length,
+ batch_size=batch_size,
+ )
for image in images:
- seconds, text = duration(partial(caption_image, image=image))
+ seconds, captions = duration(partial(caption_image, image=image))
total_duration += seconds
- total_length += len(text)
+ total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions)
del model
del processor
print(f"Throughput: {total_length/total_duration:.2f} tokens/s")
-def bench_image_embeddings(model, images):
- total_duration = 0
- total_embeddings = 0
- images *= 10
- while total_duration < 10:
- seconds, embeddings = duration(lambda: model.encode_image(processor.preprocess_image(images)))
- total_duration += seconds
- total_embeddings += len(embeddings)
-
- print(f"Throughput: {total_embeddings/total_duration:.2f} images/s")
-
-
-def bench_text_embeddings(model, texts):
- total_duration = 0
- total_embeddings = 0
- texts *= 10
- while total_duration < 10:
- seconds, embeddings = duration(lambda: model.encode_text(processor.preprocess_text(texts)))
- total_duration += seconds
- total_embeddings += len(embeddings)
-
- print(f"Throughput: {total_embeddings/total_duration:.2f} queries/s")
-
+def main(batch_size: int = 10, max_length: int = 256):
-if __name__ == "__main__":
image_urls = [
"https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
"https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
@@ -116,18 +125,40 @@ def bench_text_embeddings(model, texts):
"a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
]
+ print("UForm-Gen2")
+ bench_captions(
+ model=AutoModel.from_pretrained(
+ "unum-cloud/uform-gen2-dpo",
+ trust_remote_code=True,
+ torch_dtype=dtype,
+ low_cpu_mem_usage=low_cpu_mem_usage,
+ ignore_mismatched_sizes=True,
+ ).to(device),
+ processor=AutoProcessor.from_pretrained(
+ "unum-cloud/uform-gen2-dpo",
+ trust_remote_code=True,
+ ),
+ prompt="Describe the picture in great detail",
+ images=images,
+ batch_size=batch_size,
+ max_length=max_length,
+ )
+
print("UForm-Gen")
bench_captions(
model=VLMForCausalLM.from_pretrained(
"unum-cloud/uform-gen",
torch_dtype=dtype,
low_cpu_mem_usage=low_cpu_mem_usage,
+ ignore_mismatched_sizes=True,
).to(device),
processor=VLMProcessor.from_pretrained(
"unum-cloud/uform-gen",
),
prompt="[cap] Summarize the visual content of the image.",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
print("LLaVA")
@@ -142,6 +173,8 @@ def bench_text_embeddings(model, texts):
),
prompt="USER: \nWhat are these?\nASSISTANT:",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
print("InstructBLIP")
@@ -156,12 +189,26 @@ def bench_text_embeddings(model, texts):
),
prompt="Summarize the visual content of the image.",
images=images,
+ batch_size=batch_size,
+ max_length=max_length,
)
- print("UForm-English")
- bench_image_embeddings(get_model("unum-cloud/uform-vl-english"), images)
- bench_text_embeddings(get_model("unum-cloud/uform-vl-english"), captions)
- print("UForm-Multilingual")
- bench_image_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), images)
- bench_text_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), captions)
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=10,
+ help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+ )
+ parser.add_argument(
+ "--max-length",
+ type=int,
+ default=256,
+ help="Maximum length of the generated text in tokens.",
+ )
+ args = parser.parse_args()
+
+ main(batch_size=args.batch_size, max_length=args.max_length)
diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py
new file mode 100644
index 0000000..b237126
--- /dev/null
+++ b/python/scripts/bench_encoders.py
@@ -0,0 +1,274 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This script provides the throughput of UForm multimodal embedding models.
+
+The output of the script will cover:
+ - Time to preprocess an image, and throughput in images/s.
+ - Time to tokenize the text, and throughput in queries/s.
+ - Time to encode the image, and throughput in images/s.
+ - Time to encode the text, and throughput in queries/s.
+ - Share of time spent on each part of the pipeline.
+
+Those numbers are presented for every model, device (cpu or gpu), backend (torch or onnx),
+and precision (float32 or bfloat16), producing a pretty comprehensive benchmark.
+
+Before running the script, install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`.
+Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled.
+"""
+
+from functools import partial
+from time import perf_counter
+from dataclasses import dataclass
+from typing import List, Tuple, Literal, Callable, Generator
+import re
+import argparse
+
+import requests
+from PIL import Image
+import pandas as pd
+
+from uform import get_model, Modality, ExecutionProviderError
+
+# Define global constants for the hardware availability
+torch_available = False
+try:
+ import torch
+
+ torch_available = True
+except ImportError:
+ pass
+onnx_available = False
+try:
+ import onnx
+
+ onnx_available = True
+except ImportError:
+ pass
+cuda_available = False
+try:
+ if torch_available:
+ cuda_available = torch.cuda.is_available()
+ elif onnx_available:
+ import onnxruntime
+
+ cuda_available = onnxruntime.get_device() == "GPU"
+except ImportError:
+ pass
+
+
+@dataclass
+class BenchmarkResult:
+ model_name: str
+ device_name: Literal["cpu", "cuda"] = "cpu"
+ backend_name: Literal["torch", "onnx"] = "torch"
+ duration_image_preprocessing: float = 0
+ duration_image_embedding: float = 0
+ duration_text_preprocessing: float = 0
+ duration_text_embedding: float = 0
+
+
+def duration(callable, synchronize=False):
+ """Profile the duration of a callable and return the duration and the result."""
+ if synchronize and torch_available and cuda_available:
+ torch.cuda.synchronize() # Wait for CUDA operations to complete
+ start = perf_counter()
+ result = callable()
+ if synchronize and torch_available and cuda_available:
+ torch.cuda.synchronize() # Ensure all CUDA kernels have finished
+ stop = perf_counter()
+ return stop - start, result
+
+
+def get_captioned_images() -> List[Tuple[Image.Image, str]]:
+ """Get a list of pre-downloaded and decoded images and their captions."""
+ image_urls = [
+ "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+ ]
+ images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]
+ captions = [
+ "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field",
+ "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta",
+ "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank",
+ "asian girl sleeping in a bed. top down view",
+ "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it",
+ ]
+ return list(zip(images, captions))
+
+
+def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]:
+ """Yields callable benchmarks for all supported backends of the given model."""
+
+ # Pull the content and artificially grow the batch size
+ images, captions = zip(*get_captioned_images())
+
+ if len(images) < batch_size:
+ import math
+
+ multiplier = int(math.ceil(batch_size / len(images)))
+ images *= multiplier
+ captions *= multiplier
+ images = images[:batch_size]
+ captions = captions[:batch_size]
+
+ def run(model_name: str, device: str, backend_name: str):
+ result = BenchmarkResult(
+ model_name=model_name,
+ backend_name=backend_name,
+ device_name=device,
+ duration_image_preprocessing=0,
+ duration_image_embedding=0,
+ duration_text_preprocessing=0,
+ duration_text_embedding=0,
+ )
+
+ sync = backend_name == "torch"
+ processors, models = get_model(
+ model_name,
+ device=device,
+ modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER],
+ backend=backend_name,
+ )
+
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ # Image preprocessing
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10 and total_iterations < 100:
+ seconds, _ = duration(lambda: processor_image(images))
+ total_duration += seconds
+ total_iterations += len(images)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_image_preprocessing = duration_per_iteration
+
+ # Image embedding
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10 and total_iterations < 100:
+ images_data = processor_image(images)
+ seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync)
+ total_duration += seconds
+ total_iterations += len(images)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_image_embedding = duration_per_iteration
+
+ # Text preprocessing
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10 and total_iterations < 100:
+ seconds, _ = duration(lambda: processor_text(captions))
+ total_duration += seconds
+ total_iterations += len(captions)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_text_preprocessing = duration_per_iteration
+
+ # Text embedding
+ total_duration = 0
+ total_iterations = 0
+ while total_duration < 10 and total_iterations < 100:
+ texts_data = processor_text(captions)
+ seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync)
+ total_duration += seconds
+ total_iterations += len(captions)
+ duration_per_iteration = total_duration / total_iterations
+ result.duration_text_embedding = duration_per_iteration
+
+ return result
+
+ devices = ["cpu"]
+ if cuda_available:
+ devices.append("cuda")
+ backends = []
+ if torch_available:
+ backends.append("torch")
+ if onnx_available:
+ backends.append("onnx")
+
+ for device in devices:
+ for backend_name in backends:
+ for model_name in [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+ ]:
+ yield BenchmarkResult(
+ model_name=model_name,
+ device_name=device,
+ backend_name=backend_name,
+ ), partial(run, model_name, device, backend_name)
+
+
+def main(filter_out: str = None, batch_size: int = 10):
+ results = []
+ filter_pattern = re.compile(filter_out) if filter_out else None
+ for specs, func in yield_benchmarks(batch_size=batch_size):
+ if filter_pattern and (
+ filter_pattern.search(specs.model_name)
+ or filter_pattern.search(specs.backend_name)
+ or filter_pattern.search(specs.device_name)
+ ):
+ continue
+
+ try:
+ print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend")
+ result = func()
+ results.append(result)
+ except ExecutionProviderError as e:
+ print(f"- skipping missing backend")
+ print(e)
+
+ results = sorted(results, key=lambda x: x.model_name)
+ results = [x.__dict__ for x in results]
+
+ df = pd.DataFrame(results)
+ df.columns = [
+ "Model Name",
+ "Device",
+ "Backend",
+ "Images Preprocessed/s",
+ "Images Encoded/s",
+ "Texts Preprocessed/s",
+ "Texts Encoded/s",
+ ]
+
+ def inverse(x):
+ return 1 / x if x != 0 else 0
+
+ # Apply number formatting directly in the DataFrame
+ formatted_df = df.copy()
+ formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format)
+ formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format)
+ formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format)
+ formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format)
+
+ # Convert formatted DataFrame to Markdown
+ print(formatted_df.to_markdown())
+
+
+if __name__ == "__main__":
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--filter-out",
+ type=str,
+ default=None,
+ help="Filter out models, backends, or devices with a Regular Expression.",
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=10,
+ help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+ )
+ args = parser.parse_args()
+
+ main(filter_out=args.filter_out, batch_size=args.batch_size)
diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb
deleted file mode 100644
index ce8cf10..0000000
--- a/python/scripts/export.ipynb
+++ /dev/null
@@ -1,666 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Scripts for Exporting PyTorch Models to ONNX and CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "!pip install --upgrade \"uform[torch]\" coremltools"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n",
- " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n",
- " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n",
- " warn(f\"Failed to load image Python extension: {e}\")\n"
- ]
- },
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "fadffc0299c04e249fd4f7a5b40ba0af",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
- "Fetching 5 files: 0%| | 0/5 [00:00, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "data": {
- "text/plain": [
- "(torch.Size([1, 197, 384]),\n",
- " torch.Size([1, 64, 768]),\n",
- " torch.Size([1, 256]),\n",
- " torch.Size([1, 256]))"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import uform\n",
- "from PIL import Image\n",
- "\n",
- "model, processor = uform.get_model('unum-cloud/uform-vl-english-small')\n",
- "text = 'a small red panda in a zoo'\n",
- "image = Image.open('../../assets/unum.png')\n",
- "\n",
- "image_data = processor.preprocess_image(image)\n",
- "text_data = processor.preprocess_text(text)\n",
- "\n",
- "image_features, image_embedding = model.encode_image(image_data, return_features=True)\n",
- "text_features, text_embedding = model.encode_text(text_data, return_features=True)\n",
- "\n",
- "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(model_type='bert', dim=768, context_dim=384, vocab_size=30522, padding_idx=0, num_layers=4, num_heads=12, embedding_dim=256, multimodal_layers_ids=[2, 3], head_one_neuron=False, pooling='cls', max_position_embeddings=64, dropout_prob=0.1)"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.text_encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(dim=384, patch_size=16, image_size=224, num_layers=12, num_heads=6, embedding_dim=256, pooling='cls', num_reg_tokens=0)"
- ]
- },
- "execution_count": 3,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "model.image_encoder"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "First layer of image_encoder: patch_embed\n",
- "First layer of text_encoder: word_embeddings\n"
- ]
- }
- ],
- "source": [
- "# Assuming `model` is your loaded model with image_encoder and text_encoder attributes\n",
- "for name, module in model.image_encoder.named_children():\n",
- " print(f\"First layer of image_encoder: {name}\")\n",
- " break # We break after the first layer\n",
- "\n",
- "for name, module in model.text_encoder.named_children():\n",
- " print(f\"First layer of text_encoder: {name}\")\n",
- " break # We break after the first layer"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## ONNX"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## CoreML"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "scikit-learn version 1.2.1 is not supported. Minimum required version: 0.17. Maximum required version: 1.1.2. Disabling scikit-learn conversion API.\n",
- "Torch version 2.1.1 has not been tested with coremltools. You may run into unexpected errors. Torch 2.1.0 is the most recent version that has been tested.\n"
- ]
- }
- ],
- "source": [
- "import coremltools as ct\n",
- "import torch"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "image_input = ct.TensorType(name=\"input\", shape=image_data.shape)\n",
- "text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
- "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
- "text_features = ct.TensorType(name=\"features\")\n",
- "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
- "image_features = ct.TensorType(name=\"features\")\n",
- "image_embeddings = ct.TensorType(name=\"embeddings\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "VisualEncoder(\n",
- " original_name=VisualEncoder\n",
- " (patch_embed): Conv2d(original_name=Conv2d)\n",
- " (blocks): Sequential(\n",
- " original_name=Sequential\n",
- " (0): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (1): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (2): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (3): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (4): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (5): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (6): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (7): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (8): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (9): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (10): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " (11): VisualEncoderBlock(\n",
- " original_name=VisualEncoderBlock\n",
- " (norm1): LayerNorm(original_name=LayerNorm)\n",
- " (attn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (ls1): LayerScale(original_name=LayerScale)\n",
- " (norm2): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (ls2): LayerScale(original_name=LayerScale)\n",
- " )\n",
- " )\n",
- " (norm): LayerNorm(original_name=LayerNorm)\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "module = model.image_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=image_data)\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n"
- ]
- }
- ],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "TextEncoder(\n",
- " original_name=TextEncoder\n",
- " (word_embeddings): Embedding(original_name=Embedding)\n",
- " (position_embeddings): Embedding(original_name=Embedding)\n",
- " (layer_norm): LayerNorm(original_name=LayerNorm)\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " (blocks): ModuleList(\n",
- " original_name=ModuleList\n",
- " (0): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (1): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (2): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " (3): TextEncoderBlock(\n",
- " original_name=TextEncoderBlock\n",
- " (norm_attn): LayerNorm(original_name=LayerNorm)\n",
- " (attention): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n",
- " (crossattn): Attention(\n",
- " original_name=Attention\n",
- " (query): Linear(original_name=Linear)\n",
- " (key): Linear(original_name=Linear)\n",
- " (value): Linear(original_name=Linear)\n",
- " (out): Linear(original_name=Linear)\n",
- " )\n",
- " (norm_mlp): LayerNorm(original_name=LayerNorm)\n",
- " (mlp): MLP(\n",
- " original_name=MLP\n",
- " (hidden_layer): Linear(original_name=Linear)\n",
- " (output_layer): Linear(original_name=Linear)\n",
- " )\n",
- " (dropout): Dropout(original_name=Dropout)\n",
- " )\n",
- " )\n",
- " (embedding_projection): Linear(original_name=Linear)\n",
- " (matching_head): Linear(original_name=Linear)\n",
- " (context_projection): Linear(original_name=Linear)\n",
- ")"
- ]
- },
- "execution_count": 9,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "module = model.text_encoder\n",
- "module.eval()\n",
- "module.return_features = True\n",
- "\n",
- "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
- "traced_script_module"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "Tuple detected at graph output. This will be flattened in the converted model.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00, ? ops/s]Core ML embedding (gather) layer does not support any inputs besides the weights and indices. Those given will be ignored.\n",
- "Converting PyTorch Frontend ==> MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n",
- "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n",
- "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n",
- "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n"
- ]
- }
- ],
- "source": [
- "coreml_model = ct.convert(\n",
- " traced_script_module, source=\"pytorch\",\n",
- " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
- " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n",
- "\n",
- "coreml_model.author = 'Unum Cloud'\n",
- "coreml_model.license = 'Apache 2.0'\n",
- "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
- "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "base",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.11"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}
diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb
new file mode 100644
index 0000000..26e463b
--- /dev/null
+++ b/python/scripts/export_decoders.ipynb
@@ -0,0 +1,91 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
+ "\n",
+    "Depending on the backend, we prefer different quantization schemes.\n",
+ "\n",
+ "- For ONNX we use `uint8` quantization.\n",
+ "- For PyTorch we use `bfloat16` quantization.\n",
+ "- For CoreML we use `float32` representation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install --upgrade \"uform[torch]\" coremltools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "model_name = \"unum-cloud/uform-gen2-dpo\"\n",
+ "output_directory = \"../../\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "import uform\n",
+ "from PIL import Image\n",
+ "from transformers import AutoModel, AutoProcessor\n",
+ "\n",
+ "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n",
+ "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n",
+ "\n",
+ "prompt = 'Describe the picture'\n",
+ "image = Image.open('../../assets/unum.png')\n",
+ "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n",
+ "\n",
+ "with torch.inference_mode():\n",
+ " output = model.generate(\n",
+ " **inputs,\n",
+ " do_sample=False,\n",
+ " use_cache=True,\n",
+ " max_new_tokens=256,\n",
+ " eos_token_id=151645,\n",
+ " pad_token_id=processor.tokenizer.pad_token_id\n",
+ " )\n",
+ "prompt_len = inputs['input_ids'].shape[1]\n",
+ "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n",
+ "\n",
+ "print(decoded_text)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb
new file mode 100644
index 0000000..a8b868d
--- /dev/null
+++ b/python/scripts/export_encoders.ipynb
@@ -0,0 +1,681 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n",
+ "\n",
+    "Depending on the backend, we prefer different quantization schemes.\n",
+ "\n",
+ "- For ONNX we use `uint8` quantization.\n",
+ "- For PyTorch we use `bfloat16` quantization.\n",
+ "- For CoreML we use `float32` representation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install --upgrade \"uform[torch]\" coremltools"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "working_directory = \"../..\"\n",
+ "model_name = \"uform3-image-text-english-small\"\n",
+ "model_directory = os.path.join(working_directory, \"models\", model_name)\n",
+ "model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n",
+ "config_path = os.path.join(model_directory, \"config.json\")\n",
+ "tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "\n",
+ "state_dict = torch.load(model_weights_path)\n",
+ "list(state_dict.keys())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from uform.torch_encoders import ImageEncoder, TextEncoder\n",
+ "from uform.torch_processors import ImageProcessor, TextProcessor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n",
+ "text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n",
+ "image_encoder, text_encoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text_processor = TextProcessor(config_path, tokenizer_path)\n",
+ "image_processor = ImageProcessor(config_path)\n",
+ "text_processor, image_processor"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import uform\n",
+ "from PIL import Image\n",
+ "\n",
+ "text = 'a small red panda in a zoo'\n",
+ "image = Image.open('../../assets/unum.png')\n",
+ "\n",
+ "text_data = text_processor(text)\n",
+ "image_data = image_processor(image)\n",
+ "\n",
+ "image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n",
+ "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
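+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check, a small illustrative sketch rather than part of the export itself:\n",
+    "# the caption and the logo image above are unrelated, so expect only a modest score.\n",
+    "import torch.nn.functional as F\n",
+    "\n",
+    "F.cosine_similarity(image_embedding, text_embedding)"
+   ]
+  },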
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## CoreML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import coremltools as ct\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "precision = ct.precision.FLOAT32"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "CoreML Tools can convert traced PyTorch models into CoreML models. The cells below perform that conversion. For it, we provide example inputs, and the tensor shapes are inferred from them.\n",
+ "\n",
+ "```python\n",
+ " image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n",
+ " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n",
+ " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n",
+ "```\n",
+ "\n",
+ "That, however, will only work for batch-size one. To support larger batches, we need to override the input shapes.\n",
+ "\n",
+ "```python\n",
+ " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def generalize_first_dimensions(input_shape, upper_bound=64):\n",
+ " if upper_bound == 1:\n",
+ " return input_shape\n",
+ " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n",
+ " return input_shape\n",
+ "\n",
+ "generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n",
+ "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n",
+ "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n",
+ "text_features = ct.TensorType(name=\"features\")\n",
+ "text_embeddings = ct.TensorType(name=\"embeddings\")\n",
+ "image_features = ct.TensorType(name=\"features\")\n",
+ "image_embeddings = ct.TensorType(name=\"embeddings\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[image_input], outputs=[image_features, image_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "\n",
+ "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n",
+ "traced_script_module"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "coreml_model = ct.convert(\n",
+ " traced_script_module, source=\"pytorch\",\n",
+ " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n",
+ " convert_to='mlprogram', compute_precision=precision)\n",
+ "\n",
+ "coreml_model.author = 'Unum Cloud'\n",
+ "coreml_model.license = 'Apache 2.0'\n",
+ "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n",
+ "coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## PyTorch\n",
+ "\n",
+ "Let's ensure:\n",
+ "\n",
+    "- the `text_encoder` inputs are called `input_ids` and `attention_mask`, and its outputs are `embeddings` and `features`.\n",
+    "- the `image_encoder` input is called `images`, and its outputs are `embeddings` and `features`.\n",
+    "- the models themselves run correctly in `f16` half-precision, so that the exported checkpoints are lighter and easier to download."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from safetensors import safe_open\n",
+ "from safetensors.torch import save_file"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_encoder.eval()\n",
+ "image_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text_encoder.eval()\n",
+ "text_encoder.to(dtype=torch.bfloat16)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n",
+ "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n",
+ "\n",
+ "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## ONNX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!pip install onnx onnxconverter-common"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from torch.onnx import export as onnx_export\n",
+ "import torch"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "We can't export directly to `bfloat16`, as ONNX doesn't support it, and we can't export to `float16` either, as the traced forward pass would fail. So let's export a `float32` ONNX file first and convert it to half-precision afterwards."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = text_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "onnx_export(\n",
+ " module,\n",
+ " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n",
+ " os.path.join(model_directory, \"text_encoder.onnx\"), \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['input_ids', 'attention_mask'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'input_ids' : {0 : 'batch_size'}, \n",
+ " 'attention_mask' : {0 : 'batch_size'}, \n",
+ " 'features' : {0 : 'batch_size'}, \n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now repeat the same for images."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module = image_encoder\n",
+ "module.eval()\n",
+ "module.return_features = True\n",
+ "module.to(dtype=torch.float32)\n",
+ "\n",
+ "torch.onnx.export(\n",
+ " module,\n",
+ " image_data[\"images\"], \n",
+ " os.path.join(model_directory, \"image_encoder.onnx\"), \n",
+ " export_params=True,\n",
+ " opset_version=15,\n",
+ " do_constant_folding=True,\n",
+ " input_names = ['images'], \n",
+ " output_names = ['features', 'embeddings'],\n",
+ " dynamic_axes={\n",
+ " 'images' : {0 : 'batch_size'},\n",
+ " 'features' : {0 : 'batch_size'},\n",
+ " 'embeddings' : {0 : 'batch_size'}})"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Quantizing to `float16`\n",
+ "\n",
+ "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "from onnxconverter_common import float16"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, module_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "module_fp16 = float16.convert_float_to_float16(module)\n",
+ "onnx.save(module_fp16, module_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Quantizing to `uint8`\n",
+ "\n",
+    "We can further quantize the model weights to `uint8` using the ONNX quantization tools.\n",
+    "The `int8` variant is the default, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from onnxruntime.quantization import quantize_dynamic, QuantType"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
+ "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)"
+ ]
+ },
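+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative check: report the on-disk size of the quantized ONNX files.\n",
+    "for file_name in [\"text_encoder.onnx\", \"image_encoder.onnx\"]:\n",
+    "    file_path = os.path.join(model_directory, file_name)\n",
+    "    print(f\"{file_name}: {os.path.getsize(file_path) / 2**20:.1f} MiB\")"
+   ]
+  },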
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "Let's make sure that all the text inputs are integers of the same type: `int32`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnx\n",
+ "import os\n",
+ "from onnx import helper\n",
+ "\n",
+ "# Load the ONNX model\n",
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+ "module = onnx.load(module_path)\n",
+ "\n",
+ "# Get the module's graph\n",
+ "graph = module.graph\n",
+ "\n",
+ "# Iterate through the inputs and update the data type of `input_ids`\n",
+ "for input_tensor in graph.input:\n",
+ " # Check if this is the tensor we want to change\n",
+ " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n",
+ " # Get the tensor type information\n",
+ " tensor_type = input_tensor.type.tensor_type\n",
+ " # Set the element type to INT32 (int32's enum value in onnx is 6)\n",
+ " tensor_type.elem_type = onnx.TensorProto.INT32\n",
+ "\n",
+ "# Optionally, check that the module is still valid\n",
+ "onnx.checker.check_model(module)\n",
+ "\n",
+ "# Save the modified module\n",
+ "onnx.save(module, module_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can use the following function to print and validate the input and output types of the ONNX model files."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def print_model_inputs_and_outputs(onnx_model_path):\n",
+ " model = onnx.load(onnx_model_path)\n",
+ "\n",
+ " # Get the model's graph\n",
+ " graph = model.graph\n",
+ "\n",
+ " # Print input information\n",
+ " print(\"Model Inputs:\")\n",
+ " for input_tensor in graph.input:\n",
+ " tensor_type = input_tensor.type.tensor_type\n",
+ " # Get the element type (data type)\n",
+ " elem_type = tensor_type.elem_type\n",
+ " # Convert numeric type to readable format\n",
+ " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+ " # Get tensor shape\n",
+ " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+ " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n",
+ "\n",
+ " # Print output information similarly if needed\n",
+ " print(\"\\nModel Outputs:\")\n",
+ " for output_tensor in graph.output:\n",
+ " tensor_type = output_tensor.type.tensor_type\n",
+ " elem_type = tensor_type.elem_type\n",
+ " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n",
+ " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n",
+ " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")"
+ ]
+ },
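+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Run the helper above on both exported files (illustrative usage).\n",
+    "print_model_inputs_and_outputs(os.path.join(model_directory, \"text_encoder.onnx\"))\n",
+    "print_model_inputs_and_outputs(os.path.join(model_directory, \"image_encoder.onnx\"))"
+   ]
+  },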
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's check that the runtime can actually load those models."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import onnxruntime as ort\n",
+ "session_options = ort.SessionOptions()\n",
+ "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n",
+ "session = ort.InferenceSession(module_path, sess_options=session_options)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n",
+ "session = ort.InferenceSession(module_path, sess_options=session_options)"
+ ]
+ },
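+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A small inference sketch: feed the preprocessed text from the top of the notebook\n",
+    "# through the quantized text encoder. The graph inputs were cast to `int32` above,\n",
+    "# so the NumPy inputs are cast to match.\n",
+    "import numpy as np\n",
+    "\n",
+    "text_session = ort.InferenceSession(os.path.join(model_directory, \"text_encoder.onnx\"), sess_options=session_options)\n",
+    "text_features, text_embeddings = text_session.run(None, {\n",
+    "    \"input_ids\": text_data[\"input_ids\"].numpy().astype(np.int32),\n",
+    "    \"attention_mask\": text_data[\"attention_mask\"].numpy().astype(np.int32),\n",
+    "})\n",
+    "\n",
+    "text_features.shape, text_embeddings.shape"
+   ]
+  },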
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+    "## Upload to Hugging Face"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/image_encoder.onnx image_encoder.onnx\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/text_encoder.onnx text_encoder.onnx\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/image_encoder.safetensors image_encoder.safetensors\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/text_encoder.safetensors text_encoder.safetensors\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/image_encoder.pt image_encoder.pt\n",
+    "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/text_encoder.pt text_encoder.pt"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "base",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py
similarity index 100%
rename from python/scripts/test_generative.py
rename to python/scripts/test_decoders.py
diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_embeddings.py
deleted file mode 100644
index d71bf0b..0000000
--- a/python/scripts/test_embeddings.py
+++ /dev/null
@@ -1,148 +0,0 @@
-from typing import Tuple
-
-import pytest
-from PIL import Image
-import uform
-
-# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
-try:
- import torch
-
- torch_available = True
-except:
- torch_available = False
-
-# ONNX is not a very light dependency either
-try:
- import onnx
-
- onnx_available = True
-except:
- onnx_available = False
-
-torch_models = [
- "unum-cloud/uform-vl-english",
- "unum-cloud/uform-vl-multilingual-v2",
-]
-
-onnx_models_and_providers = [
- ("unum-cloud/uform-vl-english-small", "cpu", "fp32"),
- ("unum-cloud/uform-vl-english-large", "cpu", "fp32"),
- ("unum-cloud/uform-vl-english-small", "gpu", "fp32"),
- ("unum-cloud/uform-vl-english-large", "gpu", "fp32"),
- ("unum-cloud/uform-vl-english-small", "gpu", "fp16"),
- ("unum-cloud/uform-vl-english-large", "gpu", "fp16"),
-]
-
-
-@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
-@pytest.mark.parametrize("model_name", torch_models)
-def test_torch_one_embedding(model_name: str):
- model, processor = uform.get_model(model_name)
- text = "a small red panda in a zoo"
- image_path = "assets/unum.png"
-
- image = Image.open(image_path)
- image_data = processor.preprocess_image(image)
- text_data = processor.preprocess_text(text)
-
- image_features, image_embedding = model.encode_image(image_data, return_features=True)
- text_features, text_embedding = model.encode_text(text_data, return_features=True)
-
- assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
- assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
-
- # Test reranking
- score, joint_embedding = model.encode_multimodal(
- image_features=image_features,
- text_features=text_features,
- attention_mask=text_data["attention_mask"],
- return_scores=True,
- )
- assert score.shape[0] == 1, "Matching score batch size is not 1"
- assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
-
-
-@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
-@pytest.mark.parametrize("model_name", torch_models)
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_torch_many_embeddings(model_name: str, batch_size: int):
- model, processor = uform.get_model(model_name)
- texts = ["a small red panda in a zoo"] * batch_size
- image_paths = ["assets/unum.png"] * batch_size
-
- images = [Image.open(path) for path in image_paths]
- image_data = processor.preprocess_image(images)
- text_data = processor.preprocess_text(texts)
-
- image_embeddings = model.encode_image(image_data, return_features=False)
- text_embeddings = model.encode_text(text_data, return_features=False)
-
- assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
- assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
-
-
-@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
-@pytest.mark.parametrize("model_specs", onnx_models_and_providers)
-def test_onnx_one_embedding(model_specs: Tuple[str, str, str]):
-
- from uform.onnx_models import ExecutionProviderError
-
- try:
-
- model, processor = uform.get_model_onnx(*model_specs)
- text = "a small red panda in a zoo"
- image_path = "assets/unum.png"
-
- image = Image.open(image_path)
- image_data = processor.preprocess_image(image)
- text_data = processor.preprocess_text(text)
-
- image_features, image_embedding = model.encode_image(image_data, return_features=True)
- text_features, text_embedding = model.encode_text(text_data, return_features=True)
-
- assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
- assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
-
- score, joint_embedding = model.encode_multimodal(
- image_features=image_features,
- text_features=text_features,
- attention_mask=text_data["attention_mask"],
- return_scores=True,
- )
- assert score.shape[0] == 1, "Matching score batch size is not 1"
- assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1"
-
- except ExecutionProviderError as e:
- pytest.skip(f"Execution provider error: {e}")
-
-
-@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
-@pytest.mark.parametrize("model_specs", onnx_models_and_providers)
-@pytest.mark.parametrize("batch_size", [1, 2])
-def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int):
-
- from uform.onnx_models import ExecutionProviderError
-
- try:
-
- model, processor = uform.get_model_onnx(*model_specs)
- texts = ["a small red panda in a zoo"] * batch_size
- image_paths = ["assets/unum.png"] * batch_size
-
- images = [Image.open(path) for path in image_paths]
- image_data = processor.preprocess_image(images)
- text_data = processor.preprocess_text(texts)
-
- image_embeddings = model.encode_image(image_data, return_features=False)
- text_embeddings = model.encode_text(text_data, return_features=False)
-
- assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
- assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
-
- except ExecutionProviderError as e:
- pytest.skip(f"Execution provider error: {e}")
-
-
-if __name__ == "__main__":
- pytest.main(["-s", "-x", __file__])
diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py
new file mode 100644
index 0000000..20caed2
--- /dev/null
+++ b/python/scripts/test_encoders.py
@@ -0,0 +1,292 @@
+from functools import wraps
+from typing import Tuple
+import requests
+from io import BytesIO
+import os
+
+import pytest
+import numpy as np
+from PIL import Image
+
+from uform import Modality, get_model, ExecutionProviderError
+
+# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed
+try:
+ import torch
+
+ torch_available = True
+except:
+ torch_available = False
+
+# ONNX is not a very light dependency either
+try:
+ import onnx
+
+ onnx_available = True
+except:
+ onnx_available = False
+
+torch_models = [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+]
+
+onnx_models = [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+]
+
+# Let's check if the HuggingFace Hub API token is set in the environment variable.
+# If it's not there, check if the `.hf_token` file is present in the current working directory.
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None)
+if token is None:
+ token_path = "./.hf_token"
+ if os.path.exists(token_path):
+ with open(token_path, "r") as file:
+ token = file.read().strip()
+
+
+def skip_on(exception, reason="No good reason :)"):
+ def decorator_func(f):
+ @wraps(f)
+ def wrapper(*args, **kwargs):
+ try:
+ # Try to run the test
+ return f(*args, **kwargs)
+ except exception:
+ pytest.skip(reason)
+
+ return wrapper
+
+ return decorator_func
+
+
+def cosine_similarity(x, y) -> float:
+ if not isinstance(x, np.ndarray):
+ x = x.detach().numpy()
+ if not isinstance(y, np.ndarray):
+ y = y.detach().numpy()
+
+    # Cast to float32 before the dot product to avoid integer overflow and precision issues
+ x = x.astype(np.float32).flatten()
+ y = y.astype(np.float32).flatten()
+ return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
+
+
+def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1):
+ """Test if the embeddings of text and image are semantically similar
+ using a small set of example text-image pairs."""
+
+ texts = [
+ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
+ "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
+ "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
+ "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
+ ]
+
+ image_urls = [
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true",
+ "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true",
+ ]
+ assert len(texts) == len(image_urls), "Number of texts and images should be the same."
+
+ images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls]
+ count_pairs = len(texts)
+
+ # Ensure we have a sufficiently large batch
+ texts = texts * batch_size_multiple
+ images = images * batch_size_multiple
+
+ # Compute the embedding in a batch fashion
+ text_embeddings = text_to_embedding(texts)
+ image_embeddings = image_to_embedding(images)
+
+ # Evaluate cosine similarity
+ for i in range(count_pairs):
+ pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i])
+ other_text_similarities = [
+ cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i
+ ]
+ other_image_similarities = [
+ cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i
+ ]
+
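+        # The matching pair should score higher than every mismatched combination, in both directions.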
+ assert pair_similarity > max(
+ other_text_similarities
+ ), "Text should be more similar to its corresponding image than to other images."
+ assert pair_similarity > max(
+ other_image_similarities
+ ), "Image should be more similar to its corresponding text than to other texts."
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models)
+def test_torch_one_embedding(model_name: str):
+ processors, models = get_model(model_name, token=token, backend="torch")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ text = "a small red panda in a zoo"
+ image_path = "assets/unum.png"
+
+ image = Image.open(image_path)
+ image_data = processor_image(image)
+ text_data = processor_text(text)
+
+ image_features, image_embedding = model_image.encode(image_data, return_features=True)
+ text_features, text_embedding = model_text.encode(text_data, return_features=True)
+
+ assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
+ assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(
+ lambda text: model_text(processor_text(text)),
+ lambda image: model_image(processor_image(image)),
+ )
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models)
+@pytest.mark.parametrize("batch_size", [1, 2])
+def test_torch_many_embeddings(model_name: str, batch_size: int):
+
+ processors, models = get_model(model_name, token=token, backend="torch")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ texts = ["a small red panda in a zoo"] * batch_size
+ image_paths = ["assets/unum.png"] * batch_size
+
+ images = [Image.open(path) for path in image_paths]
+ image_data = processor_image(images)
+ text_data = processor_text(texts)
+
+ image_embeddings = model_image.encode(image_data, return_features=False)
+ text_embeddings = model_text.encode(text_data, return_features=False)
+
+ assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
+ assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
+
+
+@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
+@pytest.mark.parametrize("model_name", onnx_models)
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
+def test_onnx_one_embedding(model_name: str, device: str):
+
+ processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ text = "a small red panda in a zoo"
+ image_path = "assets/unum.png"
+
+ image = Image.open(image_path)
+ image_data = processor_image(image)
+ text_data = processor_text(text)
+
+ image_features, image_embedding = model_image.encode(image_data)
+ text_features, text_embedding = model_text.encode(text_data)
+
+ assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
+ assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+
+    # Nested functions are easier to debug than lambdas
+ def get_image_embedding(image_data):
+ features, embedding = model_image.encode(processor_image(image_data))
+ return embedding
+
+ def get_text_embedding(text_data):
+ features, embedding = model_text.encode(processor_text(text_data))
+ return embedding
+
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
+
+
+@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
+@pytest.mark.parametrize("model_name", onnx_models)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
+def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
+
+ processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ texts = ["a small red panda in a zoo"] * batch_size
+ image_paths = ["assets/unum.png"] * batch_size
+
+ images = [Image.open(path) for path in image_paths]
+ image_data = processor_image(images)
+ text_data = processor_text(texts)
+
+ image_embeddings = model_image.encode(image_data, return_features=False)
+ text_embeddings = model_text.encode(text_data, return_features=False)
+
+ assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
+ assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models[:1])
+def test_torch_multi_gpu(model_name: str):
+
+ count_cuda_devices = torch.cuda.device_count()
+ if count_cuda_devices < 2:
+ pytest.skip("Not enough CUDA devices to run multi-GPU test")
+
+ processors, models = get_model(model_name, token=token, backend="torch", device="cuda")
+ model_text = models[Modality.TEXT_ENCODER]
+ model_image = models[Modality.IMAGE_ENCODER]
+ processor_text = processors[Modality.TEXT_ENCODER]
+ processor_image = processors[Modality.IMAGE_ENCODER]
+
+ import torch.nn as nn
+
+ model_text.return_features = False
+ model_image.return_features = False
+ model_text_parallel = nn.DataParallel(model_text)
+ model_image_parallel = nn.DataParallel(model_image)
+
+    # Nested functions are easier to debug than lambdas
+ def get_image_embedding(image_data):
+ preprocessed = processor_image(image_data)
+ embedding = model_image_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+
+ def get_text_embedding(text_data):
+ preprocessed = processor_text(text_data)
+ embedding = model_text_parallel.forward(preprocessed)
+ return embedding.detach().cpu().numpy()
+
+ # Test if the model outputs actually make sense
+ cross_references_image_and_text_embeddings(
+ get_text_embedding,
+ get_image_embedding,
+ batch_size_multiple=count_cuda_devices,
+ )
+
+
+if __name__ == "__main__":
+ # If you want to run this test file individually, you can do so by running:
+ # pytest.main(["-s", "-x", __file__])
+ pass
diff --git a/python/uform/__init__.py b/python/uform/__init__.py
index 1ecb242..7af8b75 100755
--- a/python/uform/__init__.py
+++ b/python/uform/__init__.py
@@ -1,59 +1,191 @@
-from json import load
-from os.path import join
-from typing import Mapping, Optional, Tuple
-
-from huggingface_hub import snapshot_download
-
-
-def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]:
- import torch
-
- model_path = snapshot_download(repo_id=model_name, token=token)
- config_path = join(model_path, "torch_config.json")
-
- state = torch.load(join(model_path, "torch_weight.pt"))
- return config_path, state, join(model_path, "tokenizer.json")
-
-
-def get_model(model_name: str, token: Optional[str] = None):
- from uform.torch_models import VLM
- from uform.torch_preprocessor import TorchProcessor
-
- config_path, state, tokenizer_path = get_checkpoint(model_name, token)
-
- with open(config_path) as f:
- config = load(f)
-
- model = VLM(config, tokenizer_path)
- model.image_encoder.load_state_dict(state["image_encoder"])
- model.text_encoder.load_state_dict(state["text_encoder"])
- processor = TorchProcessor(config, tokenizer_path)
-
- return model.eval(), processor
-
-
-def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None):
- from uform.onnx_models import VLM_ONNX
- from uform.numpy_preprocessor import NumPyProcessor
-
- assert device in (
- "cpu",
- "gpu",
- ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`"
- assert dtype in (
- "fp32",
- "fp16",
- ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)"
- assert (
- device == "cpu" and dtype == "fp32"
- ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported"
-
- model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token)
-
- with open(join(model_path, "config.json")) as f:
- config = load(f)
-
- model = VLM_ONNX(model_path, config, device, dtype)
- processor = NumPyProcessor(config, join(model_path, "tokenizer.json"))
-
- return model, processor
+from os.path import join, exists
+from typing import Dict, Optional, Tuple, Literal, Union, Callable
+
+from huggingface_hub import snapshot_download, utils
+
+from uform.shared import ExecutionProviderError, Modality
+
+
+def _normalize_modalities(modalities: Optional[Tuple[Union[str, Modality], ...]]) -> Tuple[Modality, ...]:
+ if modalities is None:
+ return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER)
+
+ return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities)
+
+
+def get_checkpoint(
+ model_name: str,
+ modalities: Tuple[str, Modality],
+ token: Optional[str] = None,
+ format: Literal[".pt", ".onnx"] = ".pt",
+) -> Tuple[str, Dict[Modality, str], Optional[str]]:
+ """Downloads a model checkpoint from the Hugging Face Hub.
+
+ :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small`
+ :param token: The Hugging Face API token, if required
+ :param modalities: The modalities to download, like `("text_encoder", "image_encoder")`
+ :param format: The format of the model checkpoint, either `.pt` or `.onnx`
+ :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path
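+
+    A minimal usage sketch (illustrative)::
+
+        config_path, modality_paths, tokenizer_path = get_checkpoint(
+            "unum-cloud/uform3-image-text-english-small",
+            modalities=("text_encoder", "image_encoder"),
+            format=".onnx",
+        )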
+ """
+
+ modalities = _normalize_modalities(modalities)
+
+ # It is not recommended to use `.pth` extension when checkpointing models
+ # because it collides with Python path (`.pth`) configuration files.
+ merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]]
+ separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities]
+ config_names = ["torch_config.json", "config.json"]
+ tokenizer_names = ["tokenizer.json"]
+
+ old_progress_behavior = utils.are_progress_bars_disabled()
+ utils.disable_progress_bars()
+
+ # The download stats depend on the number of times the `config.json` is pulled
+ # https://huggingface.co/docs/hub/models-download-stats
+ model_path = snapshot_download(
+ repo_id=model_name,
+ token=token,
+ allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names,
+ )
+
+    # Restore the previous behavior: re-enable progress bars only if they weren't disabled before
+    if not old_progress_behavior:
+        utils.enable_progress_bars()
+
+ # Find the first name in `config_names` that is present
+ config_path = None
+ for config_name in config_names:
+ if exists(join(model_path, config_name)):
+ config_path = join(model_path, config_name)
+ break
+
+ # Same for the tokenizer
+ tokenizer_path = None
+ for tokenizer_name in tokenizer_names:
+ if exists(join(model_path, tokenizer_name)):
+ tokenizer_path = join(model_path, tokenizer_name)
+ break
+
+    # Prefer a single merged checkpoint if one is present.
+    # Otherwise, fall back to the separate per-modality files.
+ modality_paths = None
+ for file_name in merged_model_names:
+ if exists(join(model_path, file_name)):
+ modality_paths = join(model_path, file_name)
+ break
+
+ if modality_paths is None:
+ modality_paths = {}
+ for separate_modality_name in separate_modality_names:
+ if exists(join(model_path, separate_modality_name)):
+ modality_name, _, _ = separate_modality_name.partition(".")
+ modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name)
+
+ return config_path, modality_paths, tokenizer_path
+
+
+def get_model_torch(
+ model_name: str,
+ *,
+ token: Optional[str] = None,
+ device: Literal["cpu", "cuda"] = "cpu",
+ modalities: Optional[Tuple[Union[str, Modality]]] = None,
+) -> Tuple[Dict[Modality, Callable], Dict]:
+ """
+ Fetches and constructs a PyTorch model with its processors based on provided modalities.
+
+ :param model_name: The identifier of the model on the Hugging Face Hub.
+ :param token: Optional API token for authenticated access to the model.
+ :param device: The device to load the model onto ('cpu' or 'cuda').
+ :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
+ :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
+ """
+ from uform.torch_encoders import TextEncoder, ImageEncoder
+ from uform.torch_processors import TextProcessor, ImageProcessor
+
+ modalities = _normalize_modalities(modalities)
+ config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt")
+
+ result_processors = {}
+ result_models = {}
+
+ if Modality.TEXT_ENCODER in modalities:
+ processor = TextProcessor(config_path, tokenizer_path)
+ encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER))
+ encoder = encoder.eval().to(device)
+ result_processors[Modality.TEXT_ENCODER] = processor
+ result_models[Modality.TEXT_ENCODER] = encoder
+
+ if Modality.IMAGE_ENCODER in modalities:
+ processor = ImageProcessor(config_path)
+ encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER))
+ encoder = encoder.eval().to(device)
+ result_processors[Modality.IMAGE_ENCODER] = processor
+ result_models[Modality.IMAGE_ENCODER] = encoder
+
+ return result_processors, result_models
+
+
+def get_model_onnx(
+ model_name: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ token: Optional[str] = None,
+ modalities: Optional[Tuple[str]] = None,
+):
+ """
+ Fetches and constructs an ONNX model with its processors based on provided modalities.
+
+ :param model_name: The identifier of the model on the Hugging Face Hub.
+ :param device: The device on which the model will operate ('cpu' or 'cuda').
+ :param token: Optional API token for authenticated access to the model.
+ :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder).
+ :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
+ """
+ from uform.onnx_encoders import TextEncoder, ImageEncoder
+ from uform.numpy_processors import TextProcessor, ImageProcessor
+
+ modalities = _normalize_modalities(modalities)
+ config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx")
+
+ result_processors = {}
+ result_models = {}
+
+ if Modality.TEXT_ENCODER in modalities:
+ processor = TextProcessor(config_path, tokenizer_path)
+ encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device)
+ result_processors[Modality.TEXT_ENCODER] = processor
+ result_models[Modality.TEXT_ENCODER] = encoder
+
+ if Modality.IMAGE_ENCODER in modalities:
+ processor = ImageProcessor(config_path)
+ encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device)
+ result_processors[Modality.IMAGE_ENCODER] = processor
+ result_models[Modality.IMAGE_ENCODER] = encoder
+
+ return result_processors, result_models
+
+
+def get_model(
+ model_name: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU
+ backend: Literal["onnx", "torch"] = "onnx", # lighter = better
+ modalities: Optional[Tuple[str, Modality]] = None, # all by default
+ token: Optional[str] = None, # optional HuggingFace Hub token for private models
+) -> Tuple[Dict[Modality, Callable], Dict]:
+ """
+ Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend.
+
+ :param model_name: The identifier of the model on the Hugging Face Hub.
+ :param device: The device to load the model onto ('cpu' or 'cuda').
+ :param backend: The backend framework to use ('onnx' or 'torch').
+ :param modalities: A tuple specifying the types of model components to fetch.
+ :param token: Optional API token for authenticated access to the model.
+ :return: A tuple containing dictionaries for processors and models keyed by their respective modalities.
+ """
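+
+    A minimal usage sketch (illustrative, mirroring the tests)::
+
+        processors, models = get_model("unum-cloud/uform3-image-text-english-small", backend="onnx")
+        text_processor = processors[Modality.TEXT_ENCODER]
+        text_encoder = models[Modality.TEXT_ENCODER]
+        features, embedding = text_encoder.encode(text_processor("a small red panda in a zoo"))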
+ if backend == "onnx":
+ return get_model_onnx(model_name, device=device, token=token, modalities=modalities)
+ elif backend == "torch":
+ return get_model_torch(model_name, device=device, token=token, modalities=modalities)
+ else:
+ raise ValueError(f"Unknown backend: {backend}")
diff --git a/python/uform/chat.py b/python/uform/chat.py
index 5ef44b7..b9e4423 100644
--- a/python/uform/chat.py
+++ b/python/uform/chat.py
@@ -3,20 +3,16 @@
import requests
import torch
from PIL import Image
-from transformers import TextStreamer
-
-from uform.gen_model import VLMForCausalLM, VLMProcessor
-
-EOS_TOKEN = 32001
+from transformers import TextStreamer, AutoModel, AutoProcessor
def parse_args():
parser = ArgumentParser(description="Chat with UForm generative model")
- parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat")
- parser.add_argument("--image", type=str, help="", required=True)
- parser.add_argument("--device", type=str, required=True)
- parser.add_argument("--fp16", action="store_true")
+ parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path")
+ parser.add_argument("--image", type=str, required=True, help="Path to image or URL")
+ parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`")
+ parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference")
return parser.parse_args()
@@ -30,22 +26,18 @@ def run_chat(opts, model, processor):
messages = [{"role": "system", "content": "You are a helpful assistant."}]
is_first_message = True
+
if opts.image.startswith("http"):
- image = (
- processor.image_processor(
- Image.open(requests.get(opts.image, stream=True).raw),
- )
- .unsqueeze(0)
- .to(torch.bfloat16 if opts.fp16 else torch.float32)
- .to(opts.device)
- )
+ image = Image.open(requests.get(opts.image, stream=True).raw)
else:
- image = (
- processor.image_processor(Image.open(opts.image))
- .unsqueeze(0)
- .to(torch.bfloat16 if opts.fp16 else torch.float32)
- .to(opts.device)
- )
+ image = Image.open(opts.image)
+
+ image = (
+ processor.feature_extractor(image) #
+ .unsqueeze(0)
+ .to(torch.bfloat16 if opts.fp16 else torch.float32)
+ .to(opts.device)
+ )
while True:
if messages[-1]["role"] in ("system", "assistant"):
@@ -68,7 +60,7 @@ def run_chat(opts, model, processor):
1,
input_ids.shape[1] + processor.num_image_latents - 1,
).to(opts.device)
- x = {
+ inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
"images": image,
@@ -76,18 +68,19 @@ def run_chat(opts, model, processor):
print("Assistant: ", end="")
with torch.inference_mode():
- y = model.generate(
- **x,
+ output = model.generate(
+ **inputs,
do_sample=False,
use_cache=True,
max_new_tokens=1024,
- eos_token_id=EOS_TOKEN,
+ eos_token_id=151645,
pad_token_id=processor.tokenizer.pad_token_id,
streamer=streamer,
)
print()
- message = processor.batch_decode(y[:, x["input_ids"].shape[1] : -1])[0]
+ prompt_len = inputs["input_ids"].shape[1]
+ message = processor.batch_decode(output[:, prompt_len:-1])[0]
messages.append({"role": "assistant", "content": message})
@@ -95,16 +88,17 @@ def run_chat(opts, model, processor):
def main():
try:
opts = parse_args()
-
+ processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True)
model = (
- VLMForCausalLM.from_pretrained(
+ AutoModel.from_pretrained(
opts.model,
torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32,
+ ignore_mismatched_sizes=True,
+ trust_remote_code=True,
)
.eval()
.to(opts.device)
)
- processor = VLMProcessor.from_pretrained(opts.model)
run_chat(opts, model, processor)
diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py
index c03b6eb..6792120 100644
--- a/python/uform/gen_model.py
+++ b/python/uform/gen_model.py
@@ -1,464 +1 @@
-from typing import List, Optional, Tuple, Union
-
-import torch
-import torch.nn.functional as F
-from torch import nn
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, RandomResizedCrop, Resize,
- ToTensor)
-from transformers import AutoConfig, AutoTokenizer
-from transformers.configuration_utils import PretrainedConfig
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.auto.modeling_auto import (AutoModel,
- AutoModelForCausalLM)
-from transformers.processing_utils import ProcessorMixin
-from transformers.tokenization_utils_base import BatchEncoding
-
-from uform.torch_models import VisualEncoder
-
-IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
-IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
-
-
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class LayerScale(nn.Module):
- def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
- super().__init__()
- self.weight = nn.Parameter(init_values * torch.ones(dim))
- self.inplace = inplace
-
- def forward(self, x):
- return x.mul_(self.weight) if self.inplace else x * self.weight
-
-
-class ImageFeaturesPooler(nn.Module):
- def __init__(
- self,
- input_size,
- hidden_size,
- num_attn_heads,
- intermediate_size,
- num_latents,
- initializer_range,
- ):
- super().__init__()
- self.projection = nn.Linear(input_size, hidden_size)
-
- self.pooler = nn.TransformerDecoderLayer(
- hidden_size,
- num_attn_heads,
- intermediate_size,
- activation=nn.functional.silu,
- batch_first=True,
- norm_first=True,
- )
- self.image_latents = nn.Parameter(
- torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
- )
-
- def forward(self, features):
- features = self.projection(features)
- return self.pooler(
- self.image_latents.expand(features.shape[0], -1, -1),
- features,
- )
-
-
-class VLMConfig(PretrainedConfig):
- model_type = "vlm"
-
- def __init__(
- self,
- text_decoder_name_or_path: str = "",
- tokenizer_name_or_path: str = "",
- image_size: int = 224,
- image_encoder_hidden_size: int = 768,
- image_encoder_patch_size: int = 16,
- image_encoder_num_layers: int = 12,
- image_encoder_num_heads: int = 12,
- image_encoder_embedding_dim: int = 256,
- image_encoder_pooling: str = "cls",
- image_pooler_num_attn_heads: int = 16,
- image_pooler_intermediate_size: int = 5504,
- image_pooler_num_latents: int = 196,
- image_token_id: int = 32002,
- initializer_range: float = 0.02,
- use_cache: bool = True,
- center_crop: bool = True,
- **kwargs,
- ):
- self.text_decoder_name_or_path = text_decoder_name_or_path
- self.tokenizer_name_or_path = tokenizer_name_or_path
-
- self.image_size = image_size
- self.image_encoder_hidden_size = image_encoder_hidden_size
- self.image_encoder_patch_size = image_encoder_patch_size
- self.image_encoder_num_layers = image_encoder_num_layers
- self.image_encoder_num_heads = image_encoder_num_heads
- self.image_encoder_embedding_dim = image_encoder_embedding_dim
- self.image_encoder_pooling = image_encoder_pooling
-
- self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
- self.image_pooler_intermediate_size = image_pooler_intermediate_size
- self.image_pooler_num_latents = image_pooler_num_latents
-
- self.image_token_id = image_token_id
-
- self.initializer_range = initializer_range
- self.use_cache = use_cache
- self.center_crop = center_crop
-
- super().__init__(**kwargs)
-
-
-class VLMPreTrainedModel(PreTrainedModel):
- config_class = VLMConfig
- base_model_prefix = "vlm"
- supports_gradient_checkpointing = True
- _no_split_modules = []
- _skip_keys_device_placement = "past_key_values"
-
- def _init_weights(self, module):
- pass
-
- def _initialize_weights(self, module):
- pass
-
-
-class VLMForCausalLM(VLMPreTrainedModel):
- def __init__(self, config: VLMConfig):
- super().__init__(config)
-
- self.config = config
- self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
- self.text_config.vocab_size += 3
- self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
-
- self.image_encoder = VisualEncoder(
- self.config.image_encoder_hidden_size,
- self.config.image_encoder_patch_size,
- self.config.image_size,
- self.config.image_encoder_num_layers,
- self.config.image_encoder_num_heads,
- self.config.image_encoder_embedding_dim,
- self.config.image_encoder_pooling,
- )
-
- # replace models' layerscales because `transformers` automatically renames keys in state_dict
- for i in range(len(self.image_encoder.blocks)):
- self.image_encoder.blocks[i].ls1 = LayerScale(
- self.image_encoder.blocks[i].ls1.dim,
- )
- self.image_encoder.blocks[i].ls2 = LayerScale(
- self.image_encoder.blocks[i].ls2.dim,
- )
-
- self.image_pooler = ImageFeaturesPooler(
- self.config.image_encoder_hidden_size,
- self.text_config.hidden_size,
- self.config.image_pooler_num_attn_heads,
- self.config.image_pooler_intermediate_size,
- self.config.image_pooler_num_latents,
- self.config.initializer_range,
- )
-
- def get_input_embeddings(self):
- return self.text_decoder.get_input_embeddings()
-
- def set_input_embeddings(self, value):
- self.text_decoder.set_input_embeddings(value)
-
- def get_images_embeddings(self, images):
- features = self.image_encoder.forward_features(images)
- return self.image_pooler(features)
-
- def gather_continuous_embeddings(
- self,
- input_ids: torch.Tensor,
- word_embeddings: torch.Tensor,
- image_embeddings: torch.Tensor,
- ) -> torch.Tensor:
- start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
- embeddings = []
-
- for sample_idx, start_idx in enumerate(start_indices.tolist()):
- embeddings.append(
- torch.cat(
- (
- word_embeddings[sample_idx, :start_idx],
- image_embeddings[sample_idx],
- word_embeddings[sample_idx, start_idx + 1 :],
- ),
- dim=0,
- ),
- )
-
- return torch.stack(embeddings, dim=0)
-
- def forward(
- self,
- input_ids: torch.LongTensor = None,
- images: torch.Tensor = None,
- attention_mask: Optional[torch.Tensor] = None,
- position_ids: Optional[torch.LongTensor] = None,
- past_key_values: Optional[List[torch.FloatTensor]] = None,
- inputs_embeds: Optional[torch.FloatTensor] = None,
- use_cache: Optional[bool] = None,
- labels: Optional[torch.Tensor] = None,
- output_attentions: Optional[bool] = None,
- output_hidden_states: Optional[bool] = None,
- return_dict: Optional[bool] = None,
- ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
- output_attentions = (
- output_attentions
- if output_attentions is not None
- else self.config.output_attentions
- )
- output_hidden_states = (
- output_hidden_states
- if output_hidden_states is not None
- else self.config.output_hidden_states
- )
- use_cache = use_cache if use_cache is not None else self.config.use_cache
-
- return_dict = (
- return_dict if return_dict is not None else self.config.use_return_dict
- )
-
- if input_ids is not None and inputs_embeds is not None:
- raise ValueError(
- "You cannot specify both input_ids and inputs_embeds at the same time",
- )
- elif input_ids is None and inputs_embeds is None:
- raise ValueError("You have to specify either input_is or inputs_embeds")
-
- if inputs_embeds is None and past_key_values is None:
- inputs_embeds = self.get_input_embeddings()(input_ids)
-
- if images is not None:
- image_embeds = self.get_images_embeddings(images)
- inputs_embeds = self.gather_continuous_embeddings(
- input_ids,
- inputs_embeds,
- image_embeds,
- )
-
- if position_ids is None:
- seq_length = (
- inputs_embeds.shape[1]
- if inputs_embeds is not None
- else input_ids.shape[1]
- )
- past_key_values_length = 0
-
- if past_key_values is not None:
- past_key_values_length = past_key_values[0][0].shape[2]
-
- device = input_ids.device if input_ids is not None else inputs_embeds.device
- position_ids = torch.arange(
- past_key_values_length,
- seq_length + past_key_values_length,
- dtype=torch.long,
- device=device,
- )
- position_ids = position_ids.unsqueeze(0)
-
- outputs = self.text_decoder(
- inputs_embeds=inputs_embeds,
- input_ids=input_ids if past_key_values is not None else None,
- attention_mask=attention_mask,
- labels=labels,
- position_ids=position_ids,
- past_key_values=past_key_values,
- output_attentions=output_attentions,
- output_hidden_states=output_hidden_states,
- use_cache=use_cache,
- return_dict=return_dict,
- )
-
- return outputs
-
- def prepare_inputs_for_generation(
- self,
- input_ids,
- images=None,
- past_key_values=None,
- attention_mask=None,
- inputs_embeds=None,
- **kwargs,
- ):
- if past_key_values:
- input_ids = input_ids[:, -1:]
-
- position_ids = kwargs.get("position_ids", None)
- if attention_mask is not None and position_ids is None:
- # create position_ids on the fly for batch generation
- position_ids = attention_mask.long().cumsum(-1) - 1
- position_ids.masked_fill_(attention_mask == 0, 1)
- if past_key_values:
- position_ids = position_ids[:, -1].unsqueeze(-1)
-
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
- if inputs_embeds is not None and past_key_values is None:
- model_inputs = {"inputs_embeds": inputs_embeds}
- else:
- model_inputs = {"input_ids": input_ids}
-
- if images is not None:
- model_inputs["images"] = images
-
- model_inputs.update(
- {
- "position_ids": position_ids,
- "past_key_values": past_key_values,
- "use_cache": kwargs.get("use_cache"),
- "attention_mask": attention_mask,
- "images": images if past_key_values is None else None,
- },
- )
- return model_inputs
-
- @classmethod
- def from_config(cls, config, **kwargs):
- return cls._from_config(config, **kwargs)
-
-
-class VLMProcessor(ProcessorMixin):
- def __init__(self, config, **kwargs):
- self.feature_extractor = None
- self.config = config
-
- if config.center_crop:
- self.image_processor = Compose(
- [
- Resize(256, interpolation=InterpolationMode.BICUBIC),
- CenterCrop(config.image_size),
- convert_to_rgb,
- ToTensor(),
- Normalize(
- mean=IMAGENET_MEAN,
- std=IMAGENET_STD,
- ),
- ],
- )
- else:
- self.image_processor = Compose(
- [
- RandomResizedCrop(
- config.image_size,
- scale=(0.8, 1),
- interpolation=InterpolationMode.BICUBIC,
- ),
- convert_to_rgb,
- ToTensor(),
- Normalize(
- mean=IMAGENET_MEAN,
- std=IMAGENET_STD,
- ),
- ],
- )
-
- self.tokenizer = AutoTokenizer.from_pretrained(
- config.tokenizer_name_or_path,
- additional_special_tokens=["<|im_end|>"],
- )
- self.num_image_latents = config.image_pooler_num_latents
-
- def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
- if texts is not None:
- if isinstance(texts, str):
- texts = [texts]
-
- tokenized_texts = []
- for text in texts:
- messages = [
- {"role": "system", "content": "You are a helpful assistant."},
- {"role": "user", "content": f" {text}"},
- ]
- tokenized_prompt = self.tokenizer.apply_chat_template(
- messages,
- add_generation_prompt=True,
- return_tensors=return_tensors,
- )
-
- tokenized_texts.append(tokenized_prompt)
-
- max_len = max(len(t[0]) for t in tokenized_texts)
- input_ids = torch.full(
- (len(tokenized_texts), max_len),
- fill_value=self.tokenizer.pad_token_id,
- dtype=torch.int64,
- )
- attention_mask = torch.full(
- (len(tokenized_texts), max_len),
- fill_value=0,
- dtype=torch.int64,
- )
-
- for i, tokens in enumerate(tokenized_texts):
- input_ids[i, -len(tokens[0]) :] = tokens[0]
- attention_mask[i, -len(tokens[0]) :] = 1
-
- attention_mask = F.pad(
- attention_mask,
- pad=(0, self.num_image_latents - 1),
- value=1,
- )
-
- encoding = BatchEncoding(
- data={"input_ids": input_ids, "attention_mask": attention_mask},
- )
-
- if images is not None:
- if isinstance(images, (list, tuple)):
- image_features = torch.empty(
- (len(images), 3, self.config.image_size, self.config.image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- image_features[i] = self.image_processor(image)
- else:
- image_features = self.image_processor(images).unsqueeze(0)
-
- if texts is not None and images is not None:
- encoding["images"] = image_features
- return encoding
-
- if texts is not None:
- return encoding
-
- return BatchEncoding(
- data={
- "images": image_features,
- },
- tensor_type=return_tensors,
- )
-
- def batch_decode(self, *args, **kwargs):
- return self.tokenizer.batch_decode(*args, **kwargs)
-
- def decode(self, *args, **kwargs):
- return self.tokenizer.decode(*args, **kwargs)
-
- @classmethod
- def from_pretrained(
- cls,
- pretrained_model_name_or_path,
- cache_dir=None,
- force_download: bool = False,
- local_files_only: bool = False,
- token=None,
- revision: str = "main",
- **kwargs,
- ):
- config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
- return cls(config)
-
-
-AutoConfig.register("vlm", VLMConfig)
-AutoModel.register(VLMConfig, VLMForCausalLM)
+from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path
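With `gen_model.py` reduced to a re-export, both import paths resolve to the same classes. A minimal sketch, assuming the shim stays in place for backward compatibility:

```python
# Legacy path, kept working through the one-line shim in `uform/gen_model.py`:
from uform.gen_model import VLMForCausalLM as LegacyVLM

# Preferred path going forward:
from uform.torch_decoders import VLMForCausalLM, VLMProcessor

assert LegacyVLM is VLMForCausalLM  # same class, two import paths
```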
diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py
similarity index 62%
rename from python/uform/numpy_preprocessor.py
rename to python/uform/numpy_processors.py
index a556db4..166ecf4 100644
--- a/python/uform/numpy_preprocessor.py
+++ b/python/uform/numpy_processors.py
@@ -1,29 +1,31 @@
from os import PathLike
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Sequence
+import json
from PIL.Image import Image, BICUBIC
from tokenizers import Tokenizer
import numpy as np
+from uform.shared import read_config
-class NumPyProcessor:
- def __init__(self, config: Dict, tokenizer_path: PathLike):
+
+class TextProcessor:
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
- self._image_size = config["image_encoder"]["image_size"]
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
+ config = read_config(config_path)
+ if "text_encoder" in config:
+ config = config["text_encoder"]
+
+ self._max_seq_len = config["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
- self.image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)[None, None]
- self.image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)[None, None]
+ self._pad_token_idx = config["padding_idx"]
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]:
+ def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
:param texts: text of list of texts to tokenizer
@@ -34,7 +36,7 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]
input_ids = np.full(
(len(texts), self._max_seq_len),
fill_value=self._pad_token_idx,
- dtype=np.int64,
+ dtype=np.int32,
)
attention_mask = np.zeros(
@@ -51,13 +53,37 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]
return {"input_ids": input_ids, "attention_mask": attention_mask}
- def preprocess_image(self, images: Union[Image, List[Image]]) -> np.ndarray:
+
+class ImageProcessor:
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None):
+ """
+        :param config_path: path to the model config, or the config object itself
+        :param tokenizer_path: not used by the image processor; accepted so both
+            processors can be constructed with the same arguments
+ """
+
+ config = read_config(config_path)
+ if "image_encoder" in config:
+ config = config["image_encoder"]
+
+ self._image_size = config["image_size"]
+ self._normalization_means = config["normalization_means"]
+ self._normalization_deviations = config["normalization_deviations"]
+
+ assert isinstance(self._image_size, int) and self._image_size > 0
+ assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
+ assert len(self._normalization_means) == len(self._normalization_deviations) == 3
+
+ self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None]
+ self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None]
+
+ def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
"""
- if isinstance(images, list):
+ if isinstance(images, Sequence):
batch_images = np.empty(
(len(images), 3, self._image_size, self._image_size),
dtype=np.float32,
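A minimal usage sketch for the renamed NumPy processors; the `config.json` and `tokenizer.json` paths are placeholders for whatever files the downloaded checkpoint provides:

```python
from PIL import Image

from uform.numpy_processors import ImageProcessor, TextProcessor

# Placeholder paths; real checkpoints ship their own config and tokenizer files.
text_processor = TextProcessor("config.json", "tokenizer.json")
image_processor = ImageProcessor("config.json")

text_batch = text_processor(["a cheesy pizza", "a sandy beach"])  # input_ids + attention_mask
image_batch = image_processor([Image.open("photo.jpg")])  # float32 array of shape (1, 3, H, W)
```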
diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py
new file mode 100644
index 0000000..b9c4cc4
--- /dev/null
+++ b/python/uform/onnx_encoders.py
@@ -0,0 +1,139 @@
+from os import PathLike
+from typing import Dict, Optional, Tuple, Union, Literal
+import json
+
+import onnxruntime as ort
+from numpy import ndarray
+
+from uform.shared import ExecutionProviderError
+
+
+def available_providers(device: Optional[str]) -> Tuple[str, ...]:
+ """Returns a tuple of available execution providers based on the requested device.
+ https://onnxruntime.ai/docs/execution-providers/
+
+ :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name.
+ :return: Tuple of available execution providers.
+ :raises ExecutionProviderError: If the requested device is not available.
+ """
+
+ gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider")
+ cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider")
+ available = ort.get_available_providers()
+
+ # If no target device is specified, let's sort all the available ones with respect to our preference
+ if device is None:
+ preferences = gpu_providers + cpu_providers
+ filtered_preferences = tuple(provider for provider in preferences if provider in available)
+ if len(filtered_preferences):
+ return filtered_preferences
+ if len(available):
+            return tuple(available)
+ raise ExecutionProviderError("No execution providers are available")
+
+ # If a GPU is requested, but no GPU providers are available, raise an error
+ if device == "gpu" or device == "cuda":
+ if all(provider not in available for provider in gpu_providers):
+            raise ExecutionProviderError(
+                f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure CUDA is available on your system. Currently installed: {available}"
+            )
+        return tuple(x for x in gpu_providers if x in available)
+
+ # If a CPU is requested, but no CPU providers are available, raise an error
+ if device == "cpu":
+ if all(provider not in available for provider in cpu_providers):
+            raise ExecutionProviderError(
+                f"CPU providers are not available, consider installing `onnxruntime` and make sure OpenVINO or CoreML is available on your system. Currently installed: {available}"
+            )
+        return tuple(x for x in cpu_providers if x in available)
+
+    if device not in available:
+        available_names = ", ".join(available)
+        raise ExecutionProviderError(
+            f"Execution provider {device} is not available. Currently installed: {available_names}"
+        )
+
+ return (device,)
+
+
+class ImageEncoder:
+ def __init__(
+ self,
+ model_path: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ return_features: bool = True,
+ ):
+ """
+ :param model_path: Path to onnx model
+ :param device: Device name, either cpu or gpu
+ """
+
+ session_options = ort.SessionOptions()
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+ self.return_features = return_features
+ self.session = ort.InferenceSession(
+ model_path,
+ sess_options=session_options,
+ providers=available_providers(device),
+ )
+
+ def encode(
+ self, images: ndarray, return_features: Optional[bool] = None
+ ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
+ features, embeddings = self.session.run(None, {"images": images})
+ return_features = return_features if return_features is not None else self.return_features
+ if return_features:
+ return features, embeddings
+ return embeddings
+
+
+class TextEncoder:
+ def __init__(
+ self,
+ model_path: str,
+ *,
+ device: Literal["cpu", "cuda"] = "cpu",
+ return_features: bool = True,
+ ):
+ """
+        :param model_path: Path to the ONNX model of the text encoder
+ :param device: Device name, either cpu or gpu
+ """
+
+ session_options = ort.SessionOptions()
+ session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
+
+ self.return_features = return_features
+ self.text_encoder_session = ort.InferenceSession(
+ model_path,
+ sess_options=session_options,
+ providers=available_providers(device),
+ )
+
+ def encode(
+ self,
+ x: Union[ndarray, dict],
+ attention_mask: Optional[ndarray] = None,
+ return_features: Optional[bool] = None,
+ ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
+ if isinstance(x, dict):
+ assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
+ attention_mask = x["attention_mask"]
+ input_ids = x["input_ids"]
+ else:
+ input_ids = x
+
+ features, embeddings = self.text_encoder_session.run(
+ None,
+ {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ },
+ )
+
+ return_features = return_features if return_features is not None else self.return_features
+ if return_features:
+ return features, embeddings
+ return embeddings
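A sketch of how the new ONNX encoders pair with the NumPy processors; the `.onnx` and `.json` file names are placeholders for the exported checkpoint artifacts:

```python
from PIL import Image

from uform.numpy_processors import ImageProcessor, TextProcessor
from uform.onnx_encoders import ImageEncoder, TextEncoder

# Placeholder artifact names; use whatever files the exported checkpoint provides.
image_encoder = ImageEncoder("image_encoder.onnx", device="cpu")
text_encoder = TextEncoder("text_encoder.onnx", device="cpu")
image_processor = ImageProcessor("config.json")
text_processor = TextProcessor("config.json", "tokenizer.json")

# Both encoders return `(features, embeddings)` by default, embeddings only otherwise.
image_features, image_embeddings = image_encoder.encode(image_processor([Image.open("photo.jpg")]))
text_features, text_embeddings = text_encoder.encode(text_processor("a sandy beach"))
text_embeddings_only = text_encoder.encode(text_processor("a sandy beach"), return_features=False)
```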
diff --git a/python/uform/onnx_models.py b/python/uform/onnx_models.py
deleted file mode 100644
index 8e2a87a..0000000
--- a/python/uform/onnx_models.py
+++ /dev/null
@@ -1,231 +0,0 @@
-from os.path import join
-from typing import Dict, Optional, Tuple, Union
-
-import onnxruntime as ort
-from numpy import ndarray
-
-
-class ExecutionProviderError(Exception):
- """Exception raised when a requested execution provider is not available."""
-
-
-def available_providers(device: str) -> Tuple[str, ...]:
- gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider")
- cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider")
- available = ort.get_available_providers()
- if device == "gpu":
- if all(provider not in available for provider in gpu_providers):
- raise ExecutionProviderError(
- f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}"
- )
- return gpu_providers
-
- return cpu_providers
-
-
-class VisualEncoderONNX:
- def __init__(self, model_path: str, device: str):
- """
- :param model_path: Path to onnx model
- :param device: Device name, either cpu or gpu
- """
-
- session_options = ort.SessionOptions()
- session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-
- self.session = ort.InferenceSession(
- model_path,
- sess_options=session_options,
- providers=available_providers(device),
- )
-
- def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]:
- return self.session.run(None, {"images": images})
-
-
-class TextEncoderONNX:
- def __init__(self, text_encoder_path: str, reranker_path: str, device: str):
- """
- :param text_encoder_path: Path to onnx of text encoder
- :param reranker_path: Path to onnx of reranker
- :param device: Device name, either cpu or gpu
- """
-
- session_options = ort.SessionOptions()
- session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
-
- self.text_encoder_session = ort.InferenceSession(
- text_encoder_path,
- sess_options=session_options,
- providers=available_providers(device),
- )
-
- self.reranker_session = ort.InferenceSession(
- reranker_path,
- sess_options=session_options,
- providers=available_providers(device),
- )
-
- def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]:
- return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask})
-
- def forward_multimodal(
- self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray
- ) -> Tuple[ndarray, ndarray]:
- return self.reranker_session.run(
- None,
- {
- "text_features": text_features,
- "attention_mask": attention_mask,
- "image_features": image_features,
- },
- )
-
-
-class VLM_ONNX:
- def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str):
- assert device in (
- "cpu",
- "gpu",
- ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`"
- assert dtype in (
- "fp32",
- "fp16",
- ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)"
- assert (
- device == "cpu" and dtype == "fp32"
- ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported"
-
- self.device = device
- self.dtype = dtype
-
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
- self._text_encoder_dim = config["text_encoder"]["dim"]
- self._image_encoder_dim = config["image_encoder"]["dim"]
-
- self.text_encoder = TextEncoderONNX(
- join(checkpoint_path, f"text_encoder.onnx"),
- join(checkpoint_path, f"reranker.onnx"),
- device,
- )
-
- self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device)
-
- def encode_image(
- self,
- images: ndarray,
- return_features: bool = False,
- ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
- """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings.
-
- :param images: Preprocessed image
- :param return_features: Whether to return images features or return only embeddings
- """
-
- features, embeddings = self.image_encoder(images)
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_text(
- self,
- texts: Dict[str, ndarray],
- return_features: bool = False,
- ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
- """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings.
-
- :param texts: Dictionary with tokenized texts and attention masks
- :param return_features: Whether to return texts features or return only embeddings
- """
-
- features, embeddings = self.text_encoder(**texts)
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_multimodal(
- self,
- image: Optional[ndarray] = None,
- text: Dict[str, ndarray] = None,
- image_features: Optional[ndarray] = None,
- text_features: Optional[ndarray] = None,
- attention_mask: Optional[ndarray] = None,
- return_scores: bool = False,
- ) -> Union[ndarray, Tuple[ndarray, ndarray]]:
- """Passes preprocessed texts (or precomputed texts features) and
- preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings.
-
- :param image: Preprocessed images
- :param text: Preprocessed texts
- :param image_features: Precomputed images features
- :param text_features: Precomputed text features
- :param attention_mask: Attention masks, not required if pass `text` instead of text_features
- """
-
- assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None"
- assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None"
-
- if text_features is not None:
- assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`"
-
- if image_features is None:
- image_features = self.image_encoder(image)
-
- if text_features is None:
- text_features = self.text_encoder(
- text["input_ids"],
- text["attention_mask"],
- )
-
- matching_scores, embeddings = self.text_encoder.forward_multimodal(
- text_features,
- attention_mask if attention_mask is not None else text["attention_mask"],
- image_features,
- )
-
- if return_scores:
- return matching_scores, embeddings
-
- return embeddings
-
- def forward(
- self,
- images: ndarray,
- texts: Dict[str, ndarray],
- ) -> Union[ndarray, ndarray]:
- """Inference forward method
-
- :param images: Preprocessed images
- :param texts: Preprocessed texts
- :return: embeddings for images and texts
- """
- _, image_embeddings = self.image_encoder(images)
- _, text_embeddings = self.text_encoder(texts)
- return image_embeddings, text_embeddings
-
- @property
- def text_features_dim(self) -> int:
- """Dimensionality of the text encoder features."""
-
- return self._text_encoder_dim
-
- @property
- def image_features_dim(self) -> int:
- """Dimensionality of the image encoder features."""
-
- return self._image_encoder_dim
-
- @property
- def embedding_dim(self) -> int:
- """Dimensionality of shared space embedding."""
-
- return self._embedding_dim
-
- @property
- def multimodal_embedding_dim(self) -> int:
- """Dimensionality of multimodal joint embedding."""
- return self._text_encoder_dim
diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py
deleted file mode 100644
index d3d833e..0000000
--- a/python/uform/preprocessing.py
+++ /dev/null
@@ -1,105 +0,0 @@
-from os import PathLike
-from typing import Dict, List, Union
-
-import torch
-from PIL import Image
-from tokenizers import Tokenizer
-from torch import Tensor
-from torchvision.transforms import (CenterCrop, Compose, InterpolationMode,
- Normalize, Resize, ToTensor)
-
-
-# lambda is not pickable
-def convert_to_rgb(image):
- return image.convert("RGB")
-
-
-class Processor:
- def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"):
- """
- :param config: model config
- :param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
- """
-
- assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`"
-
- self._image_size = config["image_encoder"]["image_size"]
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
- self._tokenizer = Tokenizer.from_file(tokenizer_path)
- self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
- self.tensor_type = tensor_type
-
- self._image_transform = Compose(
- [
- Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
- convert_to_rgb,
- CenterCrop(self._image_size),
- ToTensor(),
- Normalize(
- mean=(0.48145466, 0.4578275, 0.40821073),
- std=(0.26862954, 0.26130258, 0.27577711),
- ),
- ],
- )
-
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
- """Transforms one or more strings into dictionary with tokenized strings and attention masks.
-
- :param texts: text of list of texts to tokenizer
- """
- if isinstance(texts, str):
- texts = [texts]
-
- input_ids = torch.full(
- (len(texts), self._max_seq_len),
- fill_value=self._pad_token_idx,
- dtype=torch.int64,
- )
-
- attention_mask = torch.zeros(
- len(texts),
- self._max_seq_len,
- dtype=torch.int32,
- )
- encoded = self._tokenizer.encode_batch(texts)
-
- for i, seq in enumerate(encoded):
- seq_len = min(len(seq), self._max_seq_len)
- input_ids[i, :seq_len] = torch.LongTensor(
- seq.ids[: self._max_seq_len],
- )
- attention_mask[i, :seq_len] = 1
-
- if self.tensor_type == "np":
- return {
- "input_ids": input_ids.numpy(),
- "attention_mask": attention_mask.numpy(),
- }
-
- return {"input_ids": input_ids, "attention_mask": attention_mask}
-
- def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
- """Transforms one or more Pillow images into Torch Tensors.
-
- :param images: image or list of images to preprocess
- """
-
- if isinstance(images, list):
- batch_images = torch.empty(
- (len(images), 3, self._image_size, self._image_size),
- dtype=torch.float32,
- )
-
- for i, image in enumerate(images):
- batch_images[i] = self._image_transform(image)
-
- else:
- batch_images = self._image_transform(images).unsqueeze(0)
-
- if self.tensor_type == "np":
- return batch_images.numpy()
-
- return batch_images
diff --git a/python/uform/shared.py b/python/uform/shared.py
new file mode 100644
index 0000000..37d256b
--- /dev/null
+++ b/python/uform/shared.py
@@ -0,0 +1,26 @@
+from enum import Enum
+from typing import Union
+from os import PathLike
+import json
+
+
+class Modality(Enum):
+ TEXT_ENCODER = "text_encoder"
+ IMAGE_ENCODER = "image_encoder"
+ VIDEO_ENCODER = "video_encoder"
+ TEXT_DECODER = "text_decoder"
+
+
+class ExecutionProviderError(Exception):
+ """Exception raised when a requested execution provider is not available."""
+
+
+ConfigOrPath = Union[PathLike, str, object]
+
+
+def read_config(path_or_object: ConfigOrPath) -> object:
+ if isinstance(path_or_object, (PathLike, str)):
+ with open(path_or_object, "r") as f:
+ return json.load(f)
+ else:
+ return path_or_object
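`read_config` deliberately accepts either a path or an in-memory object, which is what lets the processors and encoders above take a shared config in both forms. A short sketch:

```python
from uform.shared import Modality, read_config

# A dictionary passes through untouched; a path (placeholder here) is parsed as JSON.
inline_config = read_config({"image_encoder": {"image_size": 224}})
assert inline_config["image_encoder"]["image_size"] == 224
file_config = read_config("config.json")  # placeholder path

print(Modality.TEXT_ENCODER.value)  # "text_encoder"
```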
diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py
new file mode 100644
index 0000000..475f5b0
--- /dev/null
+++ b/python/uform/torch_decoders.py
@@ -0,0 +1,469 @@
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torchvision.transforms import (
+ CenterCrop,
+ Compose,
+ InterpolationMode,
+ Normalize,
+ RandomResizedCrop,
+ Resize,
+ ToTensor,
+)
+from transformers import AutoConfig, AutoTokenizer
+from transformers.configuration_utils import PretrainedConfig
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers.modeling_utils import PreTrainedModel
+from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM
+from transformers.processing_utils import ProcessorMixin
+from transformers.tokenization_utils_base import BatchEncoding
+
+from uform.torch_encoders import ImageEncoder
+
+IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
+
+
+def convert_to_rgb(image):
+ return image.convert("RGB")
+
+
+class LayerScale(nn.Module):
+ def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False):
+ super().__init__()
+ self.weight = nn.Parameter(init_values * torch.ones(dim))
+ self.inplace = inplace
+
+ def forward(self, x):
+ return x.mul_(self.weight) if self.inplace else x * self.weight
+
+
+class ImageFeaturesPooler(nn.Module):
+ def __init__(
+ self,
+ input_size,
+ hidden_size,
+ num_attn_heads,
+ intermediate_size,
+ num_latents,
+ initializer_range,
+ ):
+ super().__init__()
+ self.projection = nn.Linear(input_size, hidden_size)
+
+ self.pooler = nn.TransformerDecoderLayer(
+ hidden_size,
+ num_attn_heads,
+ intermediate_size,
+ activation=nn.functional.silu,
+ batch_first=True,
+ norm_first=True,
+ )
+ self.image_latents = nn.Parameter(
+ torch.randn(1, num_latents, hidden_size) * initializer_range**0.5,
+ )
+
+ def forward(self, features):
+ features = self.projection(features)
+ return self.pooler(
+ self.image_latents.expand(features.shape[0], -1, -1),
+ features,
+ )
+
+
+class VLMConfig(PretrainedConfig):
+ model_type = "vlm"
+
+ def __init__(
+ self,
+ text_decoder_name_or_path: str = "",
+ tokenizer_name_or_path: str = "",
+ image_size: int = 224,
+ image_encoder_hidden_size: int = 768,
+ image_encoder_patch_size: int = 16,
+ image_encoder_num_layers: int = 12,
+ image_encoder_num_heads: int = 12,
+ image_encoder_embedding_dim: int = 256,
+ image_encoder_pooling: str = "cls",
+ image_pooler_num_attn_heads: int = 16,
+ image_pooler_intermediate_size: int = 5504,
+ image_pooler_num_latents: int = 196,
+ image_token_id: int = 32002,
+ initializer_range: float = 0.02,
+ use_cache: bool = True,
+ center_crop: bool = True,
+ **kwargs,
+ ):
+ self.text_decoder_name_or_path = text_decoder_name_or_path
+ self.tokenizer_name_or_path = tokenizer_name_or_path
+
+ self.image_size = image_size
+ self.image_encoder_hidden_size = image_encoder_hidden_size
+ self.image_encoder_patch_size = image_encoder_patch_size
+ self.image_encoder_num_layers = image_encoder_num_layers
+ self.image_encoder_num_heads = image_encoder_num_heads
+ self.image_encoder_embedding_dim = image_encoder_embedding_dim
+ self.image_encoder_pooling = image_encoder_pooling
+
+ self.image_pooler_num_attn_heads = image_pooler_num_attn_heads
+ self.image_pooler_intermediate_size = image_pooler_intermediate_size
+ self.image_pooler_num_latents = image_pooler_num_latents
+
+ self.image_token_id = image_token_id
+
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.center_crop = center_crop
+
+ super().__init__(**kwargs)
+
+
+class VLMPreTrainedModel(PreTrainedModel):
+ config_class = VLMConfig
+ base_model_prefix = "vlm"
+ supports_gradient_checkpointing = True
+ _no_split_modules = []
+ _skip_keys_device_placement = "past_key_values"
+
+ def _init_weights(self, module):
+ pass
+
+ def _initialize_weights(self, module):
+ pass
+
+
+class VLMForCausalLM(VLMPreTrainedModel):
+ def __init__(self, config: VLMConfig):
+ super().__init__(config)
+
+ self.config = config
+ self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path)
+ self.text_config.vocab_size += 3
+ self.text_decoder = AutoModelForCausalLM.from_config(self.text_config)
+
+ self.image_encoder = ImageEncoder(
+ self.config.image_encoder_hidden_size,
+ self.config.image_encoder_patch_size,
+ self.config.image_size,
+ self.config.image_encoder_num_layers,
+ self.config.image_encoder_num_heads,
+ self.config.image_encoder_embedding_dim,
+ self.config.image_encoder_pooling,
+ )
+
+ # replace models' layerscales because `transformers` automatically renames keys in `state_dict`
+ for i in range(len(self.image_encoder.blocks)):
+ self.image_encoder.blocks[i].ls1 = LayerScale(
+ self.image_encoder.blocks[i].ls1.dim,
+ )
+ self.image_encoder.blocks[i].ls2 = LayerScale(
+ self.image_encoder.blocks[i].ls2.dim,
+ )
+
+ self.image_pooler = ImageFeaturesPooler(
+ self.config.image_encoder_hidden_size,
+ self.text_config.hidden_size,
+ self.config.image_pooler_num_attn_heads,
+ self.config.image_pooler_intermediate_size,
+ self.config.image_pooler_num_latents,
+ self.config.initializer_range,
+ )
+
+ def get_input_embeddings(self):
+ return self.text_decoder.get_input_embeddings()
+
+ def set_input_embeddings(self, value):
+ self.text_decoder.set_input_embeddings(value)
+
+ def get_images_embeddings(self, images):
+ features = self.image_encoder.forward_features(images)
+ return self.image_pooler(features)
+
+ def gather_continuous_embeddings(
+ self,
+ input_ids: torch.Tensor,
+ word_embeddings: torch.Tensor,
+ image_embeddings: torch.Tensor,
+ ) -> torch.Tensor:
+ start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1]
+ embeddings = []
+
+ for sample_idx, start_idx in enumerate(start_indices.tolist()):
+ embeddings.append(
+ torch.cat(
+ (
+ word_embeddings[sample_idx, :start_idx],
+ image_embeddings[sample_idx],
+ word_embeddings[sample_idx, start_idx + 1 :],
+ ),
+ dim=0,
+ ),
+ )
+
+ return torch.stack(embeddings, dim=0)
+
+ def forward(
+ self,
+ input_ids: torch.LongTensor = None,
+ images: torch.Tensor = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[dict, Tuple, CausalLMOutputWithPast]:
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError(
+ "You cannot specify both input_ids and inputs_embeds at the same time",
+ )
+ elif input_ids is None and inputs_embeds is None:
+ raise ValueError("You have to specify either input_is or inputs_embeds")
+
+ if inputs_embeds is None and past_key_values is None:
+ inputs_embeds = self.get_input_embeddings()(input_ids)
+
+ if images is not None:
+ image_embeds = self.get_images_embeddings(images)
+ inputs_embeds = self.gather_continuous_embeddings(
+ input_ids,
+ inputs_embeds,
+ image_embeds,
+ )
+
+ if position_ids is None:
+ seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1]
+ past_key_values_length = 0
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values[0][0].shape[2]
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length,
+ seq_length + past_key_values_length,
+ dtype=torch.long,
+ device=device,
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ outputs = self.text_decoder(
+ inputs_embeds=inputs_embeds,
+ input_ids=input_ids if past_key_values is not None else None,
+ attention_mask=attention_mask,
+ labels=labels,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ use_cache=use_cache,
+ return_dict=return_dict,
+ )
+
+ return outputs
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ images=None,
+ past_key_values=None,
+ attention_mask=None,
+ inputs_embeds=None,
+ **kwargs,
+ ):
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+
+ position_ids = kwargs.get("position_ids", None)
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
+ if past_key_values:
+ position_ids = position_ids[:, -1].unsqueeze(-1)
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ if images is not None:
+ model_inputs["images"] = images
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ "images": images if past_key_values is None else None,
+ },
+ )
+ return model_inputs
+
+ @classmethod
+ def from_config(cls, config, **kwargs):
+ return cls._from_config(config, **kwargs)
+
+
+class VLMProcessor(ProcessorMixin):
+ def __init__(self, config, **kwargs):
+ self.feature_extractor = None
+ self.config = config
+
+ if config.center_crop:
+ self.image_processor = Compose(
+ [
+ Resize(256, interpolation=InterpolationMode.BICUBIC),
+ CenterCrop(config.image_size),
+ convert_to_rgb,
+ ToTensor(),
+ Normalize(
+ mean=IMAGENET_MEAN,
+ std=IMAGENET_STD,
+ ),
+ ],
+ )
+ else:
+ self.image_processor = Compose(
+ [
+ RandomResizedCrop(
+ config.image_size,
+ scale=(0.8, 1),
+ interpolation=InterpolationMode.BICUBIC,
+ ),
+ convert_to_rgb,
+ ToTensor(),
+ Normalize(
+ mean=IMAGENET_MEAN,
+ std=IMAGENET_STD,
+ ),
+ ],
+ )
+
+ self.tokenizer = AutoTokenizer.from_pretrained(
+ config.tokenizer_name_or_path,
+ additional_special_tokens=["<|im_end|>"],
+ )
+ self.num_image_latents = config.image_pooler_num_latents
+
+ def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs):
+ if texts is not None:
+ if isinstance(texts, str):
+ texts = [texts]
+
+ tokenized_texts = []
+ for text in texts:
+ messages = [
+ {"role": "system", "content": "You are a helpful assistant."},
+ {"role": "user", "content": f" {text}"},
+ ]
+ tokenized_prompt = self.tokenizer.apply_chat_template(
+ messages,
+ add_generation_prompt=True,
+ return_tensors=return_tensors,
+ )
+
+ tokenized_texts.append(tokenized_prompt)
+
+ max_len = max(len(t[0]) for t in tokenized_texts)
+ input_ids = torch.full(
+ (len(tokenized_texts), max_len),
+ fill_value=self.tokenizer.pad_token_id,
+ dtype=torch.int64,
+ )
+ attention_mask = torch.full(
+ (len(tokenized_texts), max_len),
+ fill_value=0,
+ dtype=torch.int64,
+ )
+
+ for i, tokens in enumerate(tokenized_texts):
+ input_ids[i, -len(tokens[0]) :] = tokens[0]
+ attention_mask[i, -len(tokens[0]) :] = 1
+
+ attention_mask = F.pad(
+ attention_mask,
+ pad=(0, self.num_image_latents - 1),
+ value=1,
+ )
+
+ encoding = BatchEncoding(
+ data={
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ },
+ )
+
+ if images is not None:
+ if isinstance(images, (list, tuple)):
+ image_features = torch.empty(
+ (len(images), 3, self.config.image_size, self.config.image_size),
+ dtype=torch.float32,
+ )
+
+ for i, image in enumerate(images):
+ image_features[i] = self.image_processor(image)
+ else:
+ image_features = self.image_processor(images).unsqueeze(0)
+
+ if texts is not None and images is not None:
+ encoding["images"] = image_features
+ return encoding
+
+ if texts is not None:
+ return encoding
+
+ return BatchEncoding(
+ data={
+ "images": image_features,
+ },
+ tensor_type=return_tensors,
+ )
+
+ def batch_decode(self, *args, **kwargs):
+ return self.tokenizer.batch_decode(*args, **kwargs)
+
+ def decode(self, *args, **kwargs):
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @classmethod
+ def from_pretrained(
+ cls,
+ pretrained_model_name_or_path,
+ cache_dir=None,
+ force_download: bool = False,
+ local_files_only: bool = False,
+ token=None,
+ revision: str = "main",
+ **kwargs,
+ ):
+ config = AutoConfig.from_pretrained(
+ pretrained_model_name_or_path,
+ cache_dir=cache_dir,
+ force_download=force_download,
+ local_files_only=local_files_only,
+ revision=revision,
+ token=token,
+ **kwargs,
+ )
+ return cls(config)
+
+
+AutoConfig.register("vlm", VLMConfig)
+AutoModel.register(VLMConfig, VLMForCausalLM)
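Because `VLMConfig` and `VLMForCausalLM` are registered with `AutoConfig`/`AutoModel`, the decoder loads through the generic `transformers` entry points, mirroring the updated chat demo earlier in this patch. A minimal sketch, with the checkpoint name and image path as placeholders:

```python
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

model_name = "unum-cloud/uform-gen"  # placeholder checkpoint name
processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
model = (
    AutoModel.from_pretrained(
        model_name,
        torch_dtype=torch.float32,
        ignore_mismatched_sizes=True,
        trust_remote_code=True,
    )
    .eval()
)

inputs = processor(texts=["Describe the picture"], images=Image.open("photo.jpg"), return_tensors="pt")
with torch.inference_mode():
    output = model.generate(**inputs, do_sample=False, max_new_tokens=64)
prompt_len = inputs["input_ids"].shape[1]
print(processor.batch_decode(output[:, prompt_len:-1])[0])
```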
diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py
similarity index 63%
rename from python/uform/torch_models.py
rename to python/uform/torch_encoders.py
index ab86622..89f6631 100644
--- a/python/uform/torch_models.py
+++ b/python/uform/torch_encoders.py
@@ -1,11 +1,23 @@
+from __future__ import annotations
+
from dataclasses import dataclass
from os import PathLike
-from typing import Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Union, Mapping, Any, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
+from PIL.Image import Image
+
+from uform.shared import read_config
+
+
+def _is_on_gpu(model: nn.Module) -> bool:
+ try:
+ return next(model.parameters()).device.type == "cuda"
+ except StopIteration:
+ return False
@dataclass(eq=False)
@@ -132,7 +144,7 @@ def forward(
@dataclass(eq=False)
-class VisualEncoderBlock(nn.Module):
+class ImageEncoderBlock(nn.Module):
dim: int
num_heads: int
@@ -219,36 +231,14 @@ def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
return x
- def forward_multimodal(
- self,
- x: Tensor,
- attn_mask: Tensor,
- context: Tensor,
- ) -> Tensor:
- context = self.context_projection(context)
- expanded_attn_mask = self.get_attention_mask(attn_mask, x.dtype)
- for block in self.blocks:
- if block.cross_attention:
- x = block(x, expanded_attn_mask, context)
-
- return self.pool_features(x, attn_mask)
-
def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor:
return self.embedding_projection(self.pool_features(x, attn_mask))
- def forward_matching(self, x: Tensor) -> Tensor:
- logits = self.matching_head(x)
- if self.head_one_neuron:
- return torch.sigmoid(logits)[:, 0]
-
- return F.softmax(logits, dim=1)[:, 1]
-
def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor:
if self.pooling == "cls":
return x[:, 0]
attn_mask = attn_mask.unsqueeze(2).type_as(x)
-
return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1)
def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor:
@@ -273,7 +263,8 @@ def forward(
x: Union[Tensor, dict],
attention_mask: Optional[Tensor] = None,
return_features: Optional[bool] = None,
- ) -> Tensor:
+ ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+
if isinstance(x, dict):
assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None"
attention_mask = x["attention_mask"]
@@ -282,6 +273,11 @@ def forward(
# If no attention mask is provided - create one with all ones
attention_mask = torch.ones_like(x)
+ # If the model is on the GPU and the input matrices are not, shift them there
+ if _is_on_gpu(self) and not x.is_cuda:
+ x = x.cuda()
+ attention_mask = attention_mask.cuda()
+
features = self.forward_features(x, attention_mask)
embeddings = self.forward_embedding(features, attention_mask)
@@ -290,9 +286,48 @@ def forward(
return features, embeddings
return embeddings
+ def encode(
+ self,
+ x: Union[Tensor, dict],
+ attention_mask: Optional[Tensor] = None,
+ return_features: Optional[bool] = None,
+ ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
+
+ result = self.forward(x, attention_mask, return_features)
+ if isinstance(result, tuple):
+ return result[0].detach(), result[1].detach()
+ else:
+ return result.detach()
+
+ @staticmethod
+ def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> TextEncoder:
+ """Load the image encoder from the given configuration and model path.
+
+ :param config: the configuration dictionary or path to the JSON configuration file
+ :param model: the model state dictionary or path to the `.pt` model file
+ """
+ config = read_config(config)
+ if "text_encoder" in config:
+ config = config["text_encoder"]
+
+ # We must strip all the non-member attributes before initializing the classes.
+ text_fields = TextEncoder.__dataclass_fields__
+ config = {k: v for k, v in config.items() if k in text_fields}
+ encoder = TextEncoder(**config)
+
+ # Load from disk
+ if isinstance(model, (PathLike, str)):
+ state = torch.load(model)
+ else:
+ state = model
+ if "text_encoder" in state:
+ state = state["text_encoder"]
+ encoder.load_state_dict(state)
+ return encoder
+
@dataclass(eq=False)
-class VisualEncoder(nn.Module):
+class ImageEncoder(nn.Module):
dim: int
patch_size: int
image_size: int
@@ -314,26 +349,23 @@ def __post_init__(self):
self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim))
self.blocks = nn.Sequential(
- *[VisualEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
+ *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)],
)
self.norm = nn.LayerNorm(self.dim, eps=1e-6)
self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False)
self.return_features = False
- def forward_features(self, x: Tensor) -> Tensor:
+ def forward_features(self, x: Union[Tensor, dict]) -> Tensor:
x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1)
x = x + self.pos_embed
-
special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)]
if self.num_reg_tokens > 0:
special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1))
x = torch.cat(special_tokens + [x], dim=1)
-
x = self.blocks(x)
-
return self.norm(x)
def forward_embedding(self, x: Tensor) -> Tensor:
@@ -344,7 +376,14 @@ def forward_embedding(self, x: Tensor) -> Tensor:
return self.embedding_projection(x)
- def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
+ def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor:
+ if isinstance(x, dict):
+ x = x["images"]
+
+ # If the model is on the GPU and the input matrices are not, shift them there
+ if _is_on_gpu(self) and not x.is_cuda:
+ x = x.cuda()
+
features = self.forward_features(x)
embeddings = self.forward_embedding(features)
return_features = return_features if return_features is not None else self.return_features
@@ -352,154 +391,38 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor:
return features, embeddings
return embeddings
+ def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor:
+ result = self.forward(x, return_features)
+ if isinstance(result, tuple):
+ return result[0].detach(), result[1].detach()
+ else:
+ return result.detach()
-class VLM(nn.Module):
- """
- Vision-Language Model for Multimodal embeddings.
- """
-
- def __init__(self, config: Dict, tokenizer_path: PathLike):
- """
- :param config: Model config
- """
-
- super().__init__()
- self._embedding_dim = config["text_encoder"]["embedding_dim"]
-
- self.text_encoder = TextEncoder(**config["text_encoder"])
- self.image_encoder = VisualEncoder(**config["image_encoder"])
-
- def encode_image(
- self,
- images: Tensor,
- return_features: bool = False,
- ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
- """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings.
-
- :param images: Preprocessed image
- :param return_features: Whether to return images features or return only embeddings
- """
-
- features = self.image_encoder.forward_features(images)
- embeddings = self.image_encoder.forward_embedding(features)
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_text(
- self,
- texts: Dict[str, Tensor],
- return_features: bool = False,
- ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
- """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings.
-
- :param texts: Dictionary with tokenized texts and attention masks
- :param return_features: Whether to return texts features or return only embeddings
- """
-
- features = self.text_encoder.forward_features(
- texts["input_ids"],
- texts["attention_mask"],
- )
- embeddings = self.text_encoder.forward_embedding(
- features,
- texts["attention_mask"],
- )
-
- if return_features:
- return features, embeddings
-
- return embeddings
-
- def encode_multimodal(
- self,
- image: Optional[Tensor] = None,
- text: Optional[Dict] = None,
- image_features: Optional[Tensor] = None,
- text_features: Optional[Tensor] = None,
- attention_mask: Optional[Tensor] = None,
- return_scores: bool = False,
- ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
- """Passes preprocessed texts (or precomputed texts features) and
- preprocessed images (or precomputed images features) through multimodal encoded to produce multimodal joint embeddings.
-
- :param image: Preprocessed images
- :param text: Preprocessed texts
- :param image_features: Precomputed images features
- :param text_features: Precomputed text features
- :param attention_mask: Attention masks, not required if pass `text` instead of text_features
- """
-
- assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None"
- assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None"
-
- if text_features is not None:
- assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`"
-
- if image_features is None:
- image_features = self.image_encoder.forward_features(image)
-
- if text_features is None:
- text_features = self.text_encoder.forward_features(
- text["input_ids"],
- text["attention_mask"],
- )
-
- embeddings = self.text_encoder.forward_multimodal(
- text_features,
- attention_mask if attention_mask is not None else text["attention_mask"],
- image_features,
- )
-
- if return_scores:
- return self.get_matching_scores(embeddings), embeddings
-
- return embeddings
-
- def get_matching_scores(self, embeddings: Tensor) -> Tensor:
- """Computes the probability that there is a match between images and texts based on their multimodal embeddings
-
- :param embeddings: multimodal joint embeddings
- """
-
- return self.text_encoder.forward_matching(embeddings)
+ @staticmethod
+ def from_pretrained(
+ config: Union[PathLike, str, object],
+ model: Union[PathLike, str, Mapping[str, Any]],
+ ) -> ImageEncoder:
+ """Load the image encoder from the given configuration and model path.
- def forward(
- self,
- images: Tensor,
- texts: Dict[str, Tensor],
- ) -> Union[Tensor, Tensor]:
- """Inference forward method
-
- :param images: Preprocessed images
- :param texts: Preprocessed texts
- :return: embeddings for images and texts
+ :param config: the configuration dictionary or path to the JSON configuration file
+ :param model: the model state dictionary or path to the `.pt` model file
"""
- _, image_embeddings = self.image_encoder(images)
- _, text_embeddings = self.text_encoder(texts)
- return image_embeddings, text_embeddings
-
- @property
- def text_features_dim(self) -> int:
- """Dimensionality of the text encoder features."""
-
- return self.text_encoder.dim
-
- @property
- def image_features_dim(self) -> int:
- """Dimensionality of the image encoder features."""
-
- return self.image_encoder.dim
-
- @property
- def embedding_dim(self) -> int:
- """Dimensionality of shared space embedding."""
-
- return self._embedding_dim
-
- @property
- def multimodal_embedding_dim(self) -> int:
- """Dimensionality of multimodal joint embedding."""
- return self.text_encoder.dim
+ config = read_config(config)
+ if "image_encoder" in config:
+ config = config["image_encoder"]
+
+ # We must strip all the non-member attributes before initializing the classes.
+ image_fields = ImageEncoder.__dataclass_fields__
+ config = {k: v for k, v in config.items() if k in image_fields}
+ encoder = ImageEncoder(**config)
+
+ # Load from disk
+ if isinstance(model, (PathLike, str)):
+ state = torch.load(model)
+ else:
+ state = model
+ if "image_encoder" in state:
+ state = state["image_encoder"]
+ encoder.load_state_dict(state)
+ return encoder
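The new `from_pretrained` static methods load a single tower from a config plus a state dict. A minimal sketch, with placeholder file names:

```python
from uform.torch_encoders import ImageEncoder, TextEncoder

# Placeholder paths; `from_pretrained` also accepts an in-memory config dict
# and an already-loaded state dict instead of file paths.
text_encoder = TextEncoder.from_pretrained("config.json", "text_encoder.pt").eval()
image_encoder = ImageEncoder.from_pretrained("config.json", "image_encoder.pt").eval()

# `.encode(...)` is the inference entry point: it runs `forward` and detaches the outputs.
```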
diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py
similarity index 57%
rename from python/uform/torch_preprocessor.py
rename to python/uform/torch_processors.py
index 8bdc70b..79c7e87 100644
--- a/python/uform/torch_preprocessor.py
+++ b/python/uform/torch_processors.py
@@ -1,5 +1,6 @@
from os import PathLike
-from typing import Dict, List, Union
+from typing import Dict, List, Union, Sequence
+import json
import torch
from PIL.Image import Image
@@ -14,43 +15,35 @@
ToTensor,
)
+from uform.shared import read_config
-# lambda is not pickable
+
+# lambda is not pickle-able
def convert_to_rgb(image):
return image.convert("RGB")
-class TorchProcessor:
- def __init__(self, config: Dict, tokenizer_path: PathLike):
+class TextProcessor:
+ def __init__(self, config_path: PathLike, tokenizer_path: PathLike):
"""
:param config: model config
:param tokenizer_path: path to tokenizer file
- :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy)
"""
- self._image_size = config["image_encoder"]["image_size"]
- self._max_seq_len = config["text_encoder"]["max_position_embeddings"]
+ config = read_config(config_path)
+ if "text_encoder" in config:
+ config = config["text_encoder"]
+
+ self._max_seq_len = config["max_position_embeddings"]
self._tokenizer = Tokenizer.from_file(tokenizer_path)
self._tokenizer.no_padding()
- self._pad_token_idx = config["text_encoder"]["padding_idx"]
-
- self._image_transform = Compose(
- [
- Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
- convert_to_rgb,
- CenterCrop(self._image_size),
- ToTensor(),
- Normalize(
- mean=(0.48145466, 0.4578275, 0.40821073),
- std=(0.26862954, 0.26130258, 0.27577711),
- ),
- ],
- )
+ self._pad_token_idx = config["padding_idx"]
- def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
+ def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
"""Transforms one or more strings into dictionary with tokenized strings and attention masks.
:param texts: text of list of texts to tokenizer
+ :return: dictionary with tokenized strings and attention masks as values
"""
if isinstance(texts, str):
texts = [texts]
@@ -77,13 +70,46 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]:
return {"input_ids": input_ids, "attention_mask": attention_mask}
- def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
+
+class ImageProcessor:
+ def __init__(self, config_path: PathLike):
+ """
+        :param config_path: path to the model config file
+ """
+
+ config = read_config(config_path)
+ if "image_encoder" in config:
+ config = config["image_encoder"]
+
+ self._image_size = config["image_size"]
+ self._normalization_means = config["normalization_means"]
+ self._normalization_deviations = config["normalization_deviations"]
+
+ assert isinstance(self._image_size, int) and self._image_size > 0
+ assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list)
+ assert len(self._normalization_means) == len(self._normalization_deviations) == 3
+
+ self._image_transform = Compose(
+ [
+ Resize(self._image_size, interpolation=InterpolationMode.BICUBIC),
+ convert_to_rgb,
+ CenterCrop(self._image_size),
+ ToTensor(),
+ Normalize(
+ mean=tuple(self._normalization_means),
+ std=tuple(self._normalization_deviations),
+ ),
+ ],
+ )
+
+ def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]:
"""Transforms one or more Pillow images into Torch Tensors.
:param images: image or list of images to preprocess
+        :return: dictionary with preprocessed images as float tensors
"""
- if isinstance(images, list):
+ if isinstance(images, Sequence):
batch_images = torch.empty(
(len(images), 3, self._image_size, self._image_size),
dtype=torch.float32,
@@ -95,4 +121,4 @@ def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor:
else:
batch_images = self._image_transform(images).unsqueeze(0)
- return batch_images
+ return {"images": batch_images}
diff --git a/swift/Embeddings.swift b/swift/Embeddings.swift
deleted file mode 100644
index 6d973ac..0000000
--- a/swift/Embeddings.swift
+++ /dev/null
@@ -1,403 +0,0 @@
-//
-// Embeddings.swift
-//
-//
-// Created by Ash Vardanian on 3/27/24.
-//
-import Accelerate
-import CoreGraphics
-import CoreML
-import Foundation
-import Hub // `Config`
-import Tokenizers // `AutoTokenizer`
-
-public enum Embedding {
- case i32s([Int32])
- case f16s([Float16])
- case f32s([Float32])
- case f64s([Float64])
-
- init?(from multiArray: MLMultiArray) {
- switch multiArray.dataType {
- case .float64:
- self = .f64s(
- Array(
- UnsafeBufferPointer(
- start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self),
- count: Int(truncating: multiArray.shape[1])
- )
- )
- )
- case .float32:
- self = .f32s(
- Array(
- UnsafeBufferPointer(
- start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self),
- count: Int(truncating: multiArray.shape[1])
- )
- )
- )
- case .float16:
- self = .f16s(
- Array(
- UnsafeBufferPointer(
- start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self),
- count: Int(truncating: multiArray.shape[1])
- )
- )
- )
- case .int32:
- self = .i32s(
- Array(
- UnsafeBufferPointer(
- start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self),
- count: Int(truncating: multiArray.shape[1])
- )
- )
- )
- @unknown default:
- return nil // return nil for unsupported data types
- }
- }
-
- public func asFloats() -> [Float] {
- switch self {
- case .f32s(let array):
- return array
- case .i32s(let array):
- return array.map { Float($0) }
- case .f16s(let array):
- return array.map { Float($0) }
- case .f64s(let array):
- return array.map { Float($0) }
- }
- }
-}
-
-// MARK: - Helpers
-
-func readConfig(fromPath path: String) throws -> [String: Any] {
- // If it's not an absolute path, let's assume it's a path relative to the current working directory
- let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
- let data = try Data(contentsOf: URL(fileURLWithPath: absPath))
- return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any]
-}
-
-func readModel(fromURL modelURL: URL) throws -> MLModel {
- let compiledModelURL = try MLModel.compileModel(at: modelURL)
- return try MLModel(contentsOf: compiledModelURL)
-}
-
-func readModel(fromPath path: String) throws -> MLModel {
- // If it's not an absolute path, let's assume it's a path relative to the current working directory
- let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
- let modelURL = URL(fileURLWithPath: absPath, isDirectory: true)
- return try readModel(fromURL: modelURL)
-}
-
-// MARK: - Encoders
-
-public class TextEncoder {
- let model: MLModel
- let processor: TextProcessor
-
- public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws {
- let finalConfigPath = configPath ?? modelPath + "/config.json"
- let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
- self.model = try readModel(fromPath: modelPath)
- self.processor = try TextProcessor(configPath: finalConfigPath, tokenizerPath: finalTokenizerPath, model: self.model)
- }
-
-
- public init(modelName: String, hubApi: HubApi = .shared) async throws {
- let repo = Hub.Repo(id: modelName)
- let modelURL = try await hubApi.snapshot(from: repo, matching: ["text.mlpackage/*", "config.json", "tokenizer.json"])
- let configPath = modelURL.appendingPathComponent("config.json").path
- let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("text.mlpackage", isDirectory: true))
- self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
- }
-
- public func forward(with text: String) throws -> Embedding {
- let inputFeatureProvider = try self.processor.preprocess(text)
- let prediction = try self.model.prediction(from: inputFeatureProvider)
- guard let predictionFeature = prediction.featureValue(for: "embeddings"),
- let output = predictionFeature.multiArrayValue,
- let embedding = Embedding(from: output)
- else {
- throw NSError(
- domain: "TextEncoder",
- code: 0,
- userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."]
- )
- }
- return embedding
- }
-}
-
-public class ImageEncoder {
- let model: MLModel
- let processor: ImageProcessor
-
- public init(modelPath: String, configPath: String? = nil) throws {
- let finalConfigPath = configPath ?? modelPath + "/config.json"
- self.model = try readModel(fromPath: modelPath)
- self.processor = try ImageProcessor(configPath: finalConfigPath)
- }
-
- public init(modelName: String, hubApi: HubApi = .shared) async throws {
- let repo = Hub.Repo(id: modelName)
- let modelURL = try await hubApi.snapshot(from: repo, matching: ["image.mlpackage/*", "config.json"])
- let configPath = modelURL.appendingPathComponent("config.json").path
- self.model = try readModel(fromURL: modelURL.appendingPathComponent("image.mlpackage", isDirectory: true))
- self.processor = try ImageProcessor(configPath: configPath)
- }
-
- public func forward(with image: CGImage) throws -> Embedding {
- let inputFeatureProvider = try self.processor.preprocess(image)
- let prediction = try self.model.prediction(from: inputFeatureProvider)
- guard let predictionFeature = prediction.featureValue(for: "embeddings"),
- let output = predictionFeature.multiArrayValue,
- let embedding = Embedding(from: output)
- else {
- throw NSError(
- domain: "ImageEncoder",
- code: 0,
- userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."]
- )
- }
- return embedding
- }
-}
-
-// MARK: - Processors
-
-class TextProcessor {
- let tokenizer: Tokenizer
- let minContextLength: Int
- let maxContextLength: Int
-
- public init(configPath: String, tokenizerPath: String, model: MLModel) throws {
- var configDict = try readConfig(fromPath: configPath)
- let tokenizerDict = try readConfig(fromPath: tokenizerPath)
-
- // Check if there's a specific 'text_encoder' configuration within the main configuration
- if let textEncoderConfig = configDict["text_encoder"] as? [String: Any] {
- configDict = textEncoderConfig // Use the specific 'text_encoder' configuration
- }
-
- let config = Config(configDict)
- let tokenizerData = Config(tokenizerDict)
- self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData)
-
- let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"]
- guard let shapeConstraint = inputDescription?.multiArrayConstraint?.shapeConstraint else {
- fatalError("Cannot obtain shape information")
- }
-
- switch shapeConstraint.type {
- case .enumerated:
- minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue
- maxContextLength = minContextLength
- case .range:
- let range = inputDescription?.multiArrayConstraint?.shapeConstraint.sizeRangeForDimension[1] as? NSRange
- minContextLength = range?.location ?? 1
- maxContextLength = range?.length ?? 128
- case .unspecified:
- minContextLength = 128
- maxContextLength = 128
- @unknown default:
- minContextLength = 128
- maxContextLength = 128
- }
- }
-
- public func preprocess(_ text: String) throws -> MLFeatureProvider {
- let inputIDs = self.tokenizer.encode(text: text)
- return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength)
- }
-}
-
-class ImageProcessor {
- let imageSize: Int
- let mean: [Float] = [0.485, 0.456, 0.406] // Common mean values for normalization
- let std: [Float] = [0.229, 0.224, 0.225] // Common std values for normalization
-
- init(configPath: String) throws {
- var configDict = try readConfig(fromPath: configPath)
- // Check if there's a specific 'image_encoder' configuration within the main configuration
- if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
- configDict = imageEncoderConfig
- }
-
- let config = Config(configDict)
- self.imageSize = config.imageSize!.intValue!
- }
-
- func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider {
- // Populate a tensor of size 3 x `imageSize` x `imageSize`,
- // by resizing the image, then performing a center crop.
- // Then normalize with the `mean` and `std` and export as a provider.
- let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize)!
- let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)!
- let featureValue = MLFeatureValue(multiArray: normalized)
- return try ImageInput(precomputedFeature: featureValue)
- }
-
- private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? {
- let originalWidth = CGFloat(image.width)
- let originalHeight = CGFloat(image.height)
-
- // Calculate new size preserving the aspect ratio
- let widthRatio = CGFloat(imageSize) / originalWidth
- let heightRatio = CGFloat(imageSize) / originalHeight
- let scaleFactor = max(widthRatio, heightRatio)
-
- let scaledWidth = originalWidth * scaleFactor
- let scaledHeight = originalHeight * scaleFactor
-
- // Calculate the crop rectangle
- let dx = (scaledWidth - CGFloat(imageSize)) / 2.0
- let dy = (scaledHeight - CGFloat(imageSize)) / 2.0
- guard
- let context = CGContext(
- data: nil,
- width: imageSize,
- height: imageSize,
- bitsPerComponent: image.bitsPerComponent,
- bytesPerRow: 0,
- space: image.colorSpace ?? CGColorSpaceCreateDeviceRGB(),
- bitmapInfo: image.bitmapInfo.rawValue
- )
- else { return nil }
-
- // Draw the scaled and cropped image in the context
- context.interpolationQuality = .high
- context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight))
- return context.makeImage()
- }
-
- private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? {
- let width = image.width
- let height = image.height
-
- // Prepare the bitmap context for drawing the image.
- var pixelData = [UInt8](repeating: 0, count: width * height * 4)
- let colorSpace = CGColorSpaceCreateDeviceRGB()
- let context = CGContext(
- data: &pixelData,
- width: width,
- height: height,
- bitsPerComponent: 8,
- bytesPerRow: 4 * width,
- space: colorSpace,
- bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
- )
- context?.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
-
- // Normalize the pixel data
- var floatPixels = [Float](repeating: 0, count: width * height * 3)
- for c in 0 ..< 3 {
- for i in 0 ..< (width * height) {
- floatPixels[i * 3 + c] = (Float(pixelData[i * 4 + c]) / 255.0 - mean[c]) / std[c]
- }
- }
-
- // Create the tensor array
- var tensor = [Float](repeating: 0, count: 3 * width * height)
- for i in 0 ..< (width * height) {
- for c in 0 ..< 3 {
- tensor[c * width * height + i] = floatPixels[i * 3 + c]
- }
- }
-
- let multiArray = try? MLMultiArray(
- shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
- dataType: .float32
- )
- for i in 0 ..< tensor.count {
- multiArray?[i] = NSNumber(value: tensor[i])
- }
- return multiArray
- }
-
-}
-
-// MARK: - Feature Providers
-
-class TextInput: MLFeatureProvider {
- var inputIDs: [Int]
- var sequenceLength: Int
- var paddingID: Int
-
- init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) {
- self.inputIDs = inputIDs
- self.sequenceLength = sequenceLength
- self.paddingID = paddingID
- }
-
- var featureNames: Set<String> {
- return Set(["input_ids", "attention_mask"])
- }
-
- // The model expects the input IDs to be an array of integers
- // of length `sequenceLength`, padded with `paddingID` if necessary
- func featureValue(for featureName: String) -> MLFeatureValue? {
- switch featureName {
- case "input_ids", "attention_mask":
- return createFeatureValue(for: featureName)
- default:
- return nil
- }
- }
-
- private func createFeatureValue(for featureName: String) -> MLFeatureValue? {
- let count = min(inputIDs.count, sequenceLength)
- let totalElements = sequenceLength
- guard let multiArray = try? MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else {
- return nil
- }
-
- if featureName == "input_ids" {
- for i in 0 ..< count {
- multiArray[i] = NSNumber(value: inputIDs[i])
- }
- for i in count ..< totalElements {
- multiArray[i] = NSNumber(value: paddingID)
- }
- }
- else if featureName == "attention_mask" {
- for i in 0 ..< count {
- multiArray[i] = NSNumber(value: 1)
- }
- for i in count ..< totalElements {
- multiArray[i] = NSNumber(value: 0)
- }
- }
-
- return MLFeatureValue(multiArray: multiArray)
- }
-}
-
-class ImageInput: MLFeatureProvider {
- var precomputedFeature: MLFeatureValue
-
- init(precomputedFeature: MLFeatureValue) throws {
- self.precomputedFeature = precomputedFeature
- }
-
- var featureNames: Set<String> {
- return Set(["input"])
- }
-
- // The model expects the input IDs to be an array of integers
- // of length `sequenceLength`, padded with `paddingID` if necessary
- func featureValue(for featureName: String) -> MLFeatureValue? {
- switch featureName {
- case "input":
- return precomputedFeature
- default:
- return nil
- }
- }
-}
diff --git a/swift/Encoders.swift b/swift/Encoders.swift
new file mode 100644
index 0000000..509ad11
--- /dev/null
+++ b/swift/Encoders.swift
@@ -0,0 +1,505 @@
+//
+// Embeddings.swift
+//
+//
+// Created by Ash Vardanian on 3/27/24.
+//
+import Accelerate
+import CoreGraphics
+import CoreML
+import Foundation
+import Hub // `Config`
+import Tokenizers // `AutoTokenizer`
+
+/// Defines custom errors related to the encoder's functionality.
+enum EncoderError: Error {
+ case downloadError(String)
+ case loadingError(String)
+ case invalidInput(String)
+ case modelPredictionFailed(String)
+ case unknownError(String)
+}
+
+/// Represents different types of embeddings as arrays of different numeric types.
+public enum Embedding {
+ case i32s([Int32])
+ case f16s([Float16])
+ case f32s([Float32])
+ case f64s([Float64])
+
+ /// Initializes an embedding from a `MLMultiArray`.
+ /// - Parameter multiArray: The MLMultiArray to convert into an Embedding.
+ /// - Returns: nil if the data type is unsupported.
+ init?(from multiArray: MLMultiArray) {
+ switch multiArray.dataType {
+ case .float64:
+ self = .f64s(
+ Array(
+ UnsafeBufferPointer(
+ start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self),
+ count: Int(truncating: multiArray.shape[1])
+ )
+ )
+ )
+ case .float32:
+ self = .f32s(
+ Array(
+ UnsafeBufferPointer(
+ start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self),
+ count: Int(truncating: multiArray.shape[1])
+ )
+ )
+ )
+ case .float16:
+ self = .f16s(
+ Array(
+ UnsafeBufferPointer(
+ start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self),
+ count: Int(truncating: multiArray.shape[1])
+ )
+ )
+ )
+ case .int32:
+ self = .i32s(
+ Array(
+ UnsafeBufferPointer(
+ start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self),
+ count: Int(truncating: multiArray.shape[1])
+ )
+ )
+ )
+ @unknown default:
+ return nil
+ }
+ }
+
+ /// Converts the embedding to an array of `Float`.
+ public func asFloats() -> [Float] {
+ switch self {
+ case .f32s(let array): return array
+ case .i32s(let array): return array.map(Float.init)
+ case .f16s(let array): return array.map(Float.init)
+ case .f64s(let array): return array.map(Float.init)
+ }
+ }
+}
+
+/// Reads a JSON configuration file and returns its contents as a dictionary.
+/// - Parameter path: The file path where the configuration file is located.
+/// - Returns: A dictionary containing the configuration data.
+func readConfig(fromPath path: String) throws -> [String: Any] {
+ let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
+ let data = try Data(contentsOf: URL(fileURLWithPath: absPath))
+ return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any]
+}
+
+/// Compiles and loads a machine learning model from a URL.
+/// - Parameter modelURL: The URL where the model package is located.
+/// - Returns: An instance of `MLModel`.
+func readModel(fromURL modelURL: URL) throws -> MLModel {
+ let compiledModelURL = try MLModel.compileModel(at: modelURL)
+ return try MLModel(contentsOf: compiledModelURL)
+}
+
+/// Loads a machine learning model from a local file path.
+/// - Parameter path: The file path where the model file is located.
+/// - Returns: An instance of `MLModel`.
+func readModel(fromPath path: String) throws -> MLModel {
+ let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path
+ let modelURL = URL(fileURLWithPath: absPath, isDirectory: true)
+ return try readModel(fromURL: modelURL)
+}
+
+/// Encodes text input into embeddings using a machine learning model.
+public class TextEncoder {
+ let model: MLModel
+ let processor: TextProcessor
+
+ /// Initializes a `TextEncoder` using paths for the model and configuration.
+ /// - Parameters:
+ /// - modelPath: The path to the directory containing the machine learning model.
+ /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
+ /// - tokenizerPath: Optional. The path to the tokenizer file. Defaults to tokenizer.json in the model directory.
+ public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws {
+ let finalConfigPath = configPath ?? modelPath + "/config.json"
+ let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json"
+ self.model = try readModel(fromPath: modelPath)
+ self.processor = try TextProcessor(
+ configPath: finalConfigPath,
+ tokenizerPath: finalTokenizerPath,
+ model: self.model
+ )
+ }
+
+ /// Initializes a `TextEncoder` using a model name and an API for fetching models.
+ /// - Parameters:
+ /// - modelName: The identifier for the model repository.
+ /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
+ public init(modelName: String, hubApi: HubApi = .shared) async throws {
+ let repo = Hub.Repo(id: modelName)
+ let modelURL = try await hubApi.snapshot(
+ from: repo,
+ matching: ["text_encoder.mlpackage/*", "config.json", "tokenizer.json"]
+ )
+ let configPath = modelURL.appendingPathComponent("config.json").path
+ let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path
+ self.model = try readModel(
+ fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true)
+ )
+ self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model)
+ }
+
+ /// Processes text and returns embeddings. Throws an error if processing fails.
+ /// - Parameter text: The text input to encode.
+ /// - Returns: An `Embedding` object containing the model output.
+ public func encode(_ text: String) throws -> Embedding {
+ let inputFeatureProvider = try self.processor.preprocess(text)
+ guard let prediction = try? self.model.prediction(from: inputFeatureProvider),
+ let predictionFeature = prediction.featureValue(for: "embeddings"),
+ let output = predictionFeature.multiArrayValue,
+ let embedding = Embedding(from: output)
+ else {
+ throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.")
+ }
+ return embedding
+ }
+}
+
+/// Encodes image input into embeddings using a machine learning model.
+public class ImageEncoder {
+ let model: MLModel
+ let processor: ImageProcessor
+
+ /// Initializes an `ImageEncoder` using a path for the model and optionally a configuration file.
+ /// - Parameters:
+ /// - modelPath: The path to the directory containing the machine learning model.
+ /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory.
+ public init(modelPath: String, configPath: String? = nil) throws {
+ let finalConfigPath = configPath ?? modelPath + "/config.json"
+ self.model = try readModel(fromPath: modelPath)
+ self.processor = try ImageProcessor(configPath: finalConfigPath)
+ }
+
+ /// Initializes an `ImageEncoder` using a model name and an API for fetching models.
+ /// - Parameters:
+ /// - modelName: The identifier for the model repository.
+ /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance.
+ public init(modelName: String, hubApi: HubApi = .shared) async throws {
+ let repo = Hub.Repo(id: modelName)
+ let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"])
+ let configPath = modelURL.appendingPathComponent("config.json").path
+ self.model = try readModel(
+ fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true)
+ )
+ self.processor = try ImageProcessor(configPath: configPath)
+ }
+
+ /// Processes an image and returns embeddings. Throws an error if processing fails.
+ /// - Parameter image: The `CGImage` to encode.
+ /// - Returns: An `Embedding` object containing the model output.
+ public func encode(_ image: CGImage) throws -> Embedding {
+ let inputFeatureProvider = try self.processor.preprocess(image)
+ guard let prediction = try? self.model.prediction(from: inputFeatureProvider),
+ let predictionFeature = prediction.featureValue(for: "embeddings"),
+ let output = predictionFeature.multiArrayValue,
+ let embedding = Embedding(from: output)
+ else {
+ throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.")
+ }
+ return embedding
+ }
+}
+
+// MARK: - Processors
+
+/// Handles the preprocessing of text data to be used by a machine learning model.
+class TextProcessor {
+ let tokenizer: Tokenizer
+ let minContextLength: Int
+ let maxContextLength: Int
+
+ /// Initializes a `TextProcessor` with specific configuration.
+ /// - Parameters:
+ /// - configPath: The path to the configuration file specifying tokenizer and model configurations.
+ /// - tokenizerPath: The path to the tokenizer configuration.
+ /// - model: The machine learning model to be used with this processor.
+ /// - Throws: An error if the configuration is invalid or missing necessary components.
+ public init(configPath: String, tokenizerPath: String, model: MLModel) throws {
+ var configDict = try readConfig(fromPath: configPath)
+ let tokenizerDict = try readConfig(fromPath: tokenizerPath)
+
+ // Check if there's a specific 'text_encoder' configuration within the main configuration
+ if let textEncoderConfig = configDict["text_encoder"] as? [String: Any] {
+ configDict = textEncoderConfig // Use the specific 'text_encoder' configuration
+ }
+
+ // Initialize the tokenizer with its configuration.
+ let config = Config(configDict)
+ let tokenizerData = Config(tokenizerDict)
+ self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData)
+
+ // Extract the model's input shape constraints.
+ guard let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"],
+ let multiArrayConstraint = inputDescription.multiArrayConstraint
+ else {
+ throw EncoderError.invalidInput("Cannot obtain shape information from the model.")
+ }
+
+ // Determine the context length constraints based on the model's input shape constraint.
+ let shapeConstraint = multiArrayConstraint.shapeConstraint
+ switch shapeConstraint.type {
+ case .enumerated:
+ minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue
+ maxContextLength = minContextLength
+ case .range:
+ guard let range = shapeConstraint.sizeRangeForDimension[1] as? NSRange else {
+ throw EncoderError.unknownError("Model input shape has a range constraint that cannot be interpreted.")
+ }
+ minContextLength = range.location
+ maxContextLength = range.length
+ case .unspecified:
+ throw EncoderError.unknownError("Model input shape is unspecified.")
+ @unknown default:
+ throw EncoderError.unknownError("Unknown model input shape constraint type.")
+ }
+ }
+
+ /// Preprocesses a string of text into a format suitable for model prediction.
+ /// - Parameter text: The text to preprocess.
+ /// - Returns: A `MLFeatureProvider` containing the processed text ready for the model.
+ /// - Throws: An error if the text encoding fails.
+ public func preprocess(_ text: String) throws -> MLFeatureProvider {
+ let inputIDs = self.tokenizer.encode(text: text)
+ return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength)
+ }
+}
+
+/// Handles the preprocessing of image data to be used by a machine learning model.
+class ImageProcessor {
+ let imageSize: Int
+ let mean: [Float]
+ let std: [Float]
+
+ /// Initializes an `ImageProcessor` with specific configuration.
+ /// - Parameter configPath: The path to the configuration file specifying image size, mean, and std.
+ init(configPath: String) throws {
+ var configDict = try readConfig(fromPath: configPath)
+ if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] {
+ configDict = imageEncoderConfig
+ }
+
+ let config = Config(configDict)
+ guard let imageSize = config.imageSize?.value as? Int else {
+ throw EncoderError.invalidInput("Invalid or missing image size.")
+ }
+ self.imageSize = imageSize
+
+ guard let meanArray = config.normalizationMeans?.value as? [Any],
+ let stdArray = config.normalizationDeviations?.value as? [Any]
+ else {
+ throw EncoderError.invalidInput("Normalization means or deviations are missing.")
+ }
+
+ self.mean = try meanArray.compactMap({
+ guard let doubleValue = $0 as? Double else {
+ throw EncoderError.invalidInput("Normalization means should be an array of floats.")
+ }
+ return Float(doubleValue)
+ })
+
+ self.std = try stdArray.compactMap({
+ guard let doubleValue = $0 as? Double else {
+ throw EncoderError.invalidInput("Normalization deviations should be an array of floats.")
+ }
+ return Float(doubleValue)
+ })
+
+ // Check if the arrays have 3 values for the 3 channels
+ if self.mean.count != 3 || self.std.count != 3 {
+ throw EncoderError.invalidInput("Normalization means should contain 3 values.")
+ }
+ }
+
+ /// Preprocesses a `CGImage` into a format suitable for model prediction.
+ /// - Parameter cgImage: The image to preprocess.
+ /// - Returns: An `MLFeatureProvider` containing the preprocessed image data.
+ func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider {
+ guard let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize),
+ let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)
+ else {
+ throw EncoderError.invalidInput("Image preprocessing failed.")
+ }
+ let featureValue = MLFeatureValue(multiArray: normalized)
+ return try ImageInput(precomputedFeature: featureValue)
+ }
+
+ private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? {
+ let originalWidth = CGFloat(image.width)
+ let originalHeight = CGFloat(image.height)
+
+ let widthRatio = CGFloat(imageSize) / originalWidth
+ let heightRatio = CGFloat(imageSize) / originalHeight
+ let scaleFactor = max(widthRatio, heightRatio)
+
+ let scaledWidth = originalWidth * scaleFactor
+ let scaledHeight = originalHeight * scaleFactor
+
+ let dx = (scaledWidth - CGFloat(imageSize)) / 2.0
+ let dy = (scaledHeight - CGFloat(imageSize)) / 2.0
+ guard
+ let context = CGContext(
+ data: nil,
+ width: imageSize,
+ height: imageSize,
+ bitsPerComponent: image.bitsPerComponent,
+ bytesPerRow: 0,
+ space: image.colorSpace ?? CGColorSpaceCreateDeviceRGB(),
+ bitmapInfo: image.bitmapInfo.rawValue
+ )
+ else { return nil }
+
+ // Draw the scaled and cropped image in the context
+ context.interpolationQuality = .high
+ context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight))
+ return context.makeImage()
+ }
+
+ private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? {
+ let width = image.width
+ let height = image.height
+
+ // Prepare the bitmap context for drawing the image.
+ var pixelData = [UInt8](repeating: 0, count: width * height * 4)
+ let colorSpace = CGColorSpaceCreateDeviceRGB()
+ guard
+ let context = CGContext(
+ data: &pixelData,
+ width: width,
+ height: height,
+ bitsPerComponent: 8,
+ bytesPerRow: 4 * width,
+ space: colorSpace,
+ bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue
+ )
+ else { return nil }
+ context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))
+
+ // While normalizing the pixels, let's also transpose them from HWC to CHW
+ let channelSize = width * height
+ var floatPixels = [Float](repeating: 0, count: channelSize * 3)
+ for i in 0 ..< channelSize {
+ floatPixels[channelSize * 0 + i] = (Float(pixelData[i * 4 + 0]) / 255.0 - mean[0]) / std[0]
+ floatPixels[channelSize * 1 + i] = (Float(pixelData[i * 4 + 1]) / 255.0 - mean[1]) / std[1]
+ floatPixels[channelSize * 2 + i] = (Float(pixelData[i * 4 + 2]) / 255.0 - mean[2]) / std[2]
+ }
+
+ // We need to wrap the constructor that may fail
+ do {
+ let tensor = try MLMultiArray(
+ shape: [1, 3, NSNumber(value: height), NSNumber(value: width)],
+ dataType: .float32
+ )
+ for i in 0 ..< floatPixels.count {
+ tensor[i] = NSNumber(value: floatPixels[i])
+ }
+ return tensor
+ }
+ catch {
+ return nil
+ }
+ }
+}
+
+// MARK: - Feature Providers
+
+/// Provides features for text input to a machine learning model, handling padding and attention mask generation.
+class TextInput: MLFeatureProvider {
+ var inputIDs: [Int]
+ var sequenceLength: Int
+ var paddingID: Int
+
+ /// Initializes a new instance for providing text input features.
+ /// - Parameters:
+ /// - inputIDs: Array of integer IDs representing the encoded text.
+ /// - sequenceLength: The fixed length to which the input sequence should be padded.
+ /// - paddingID: The integer ID used for padding shorter sequences. Defaults to 0.
+ init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) {
+ self.inputIDs = inputIDs
+ self.sequenceLength = sequenceLength
+ self.paddingID = paddingID
+ }
+
+    var featureNames: Set<String> {
+ return Set(["input_ids", "attention_mask"])
+ }
+
+ /// Returns the feature value for the specified feature name.
+ /// - Parameter featureName: The name of the feature for which the value is requested.
+ /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature.
+ func featureValue(for featureName: String) -> MLFeatureValue? {
+ switch featureName {
+ case "input_ids", "attention_mask":
+ return createFeatureValue(for: featureName)
+ default:
+ return nil
+ }
+ }
+
+ /// Creates the feature value for input IDs or attention mask based on the specified feature name.
+ /// - Parameter featureName: The name of the feature.
+ /// - Returns: An `MLFeatureValue` if the array can be created, otherwise nil.
+ private func createFeatureValue(for featureName: String) -> MLFeatureValue? {
+ let count = min(inputIDs.count, sequenceLength)
+ let totalElements = sequenceLength
+ guard let multiArray = try? MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else {
+ return nil
+ }
+
+ if featureName == "input_ids" {
+ for i in 0 ..< count {
+ multiArray[i] = NSNumber(value: inputIDs[i])
+ }
+ for i in count ..< totalElements {
+ multiArray[i] = NSNumber(value: paddingID)
+ }
+ }
+ else if featureName == "attention_mask" {
+ for i in 0 ..< count {
+ multiArray[i] = NSNumber(value: 1)
+ }
+ for i in count ..< totalElements {
+ multiArray[i] = NSNumber(value: 0)
+ }
+ }
+
+ return MLFeatureValue(multiArray: multiArray)
+ }
+}
+
+/// Provides a precomputed feature for image inputs to a machine learning model.
+class ImageInput: MLFeatureProvider {
+ var precomputedFeature: MLFeatureValue
+
+ /// Initializes a new instance with a precomputed feature.
+ /// - Parameter precomputedFeature: The `MLFeatureValue` containing the precomputed feature data.
+ /// - Throws: An error if the precomputed feature is not valid for the model.
+ init(precomputedFeature: MLFeatureValue) throws {
+ self.precomputedFeature = precomputedFeature
+ }
+
+    var featureNames: Set<String> {
+ return Set(["images"])
+ }
+
+ /// Returns the feature value for the specified feature name.
+ /// - Parameter featureName: The name of the feature for which the value is requested.
+ /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature.
+ func featureValue(for featureName: String) -> MLFeatureValue? {
+ switch featureName {
+ case "images":
+ return precomputedFeature
+ default:
+ return nil
+ }
+ }
+}
diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift
similarity index 75%
rename from swift/EmbeddingsTests.swift
rename to swift/EncodersTests.swift
index 5efb87f..645d531 100644
--- a/swift/EmbeddingsTests.swift
+++ b/swift/EncodersTests.swift
@@ -1,11 +1,26 @@
import CoreGraphics
+import Hub
import ImageIO
import UForm
-import Hub
import XCTest
final class TokenizerTests: XCTestCase {
+ var hfToken: String?
+
+ override func setUp() {
+ super.setUp()
+ // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory
+ let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token")
+ if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines)
+ {
+ hfToken = token
+ }
+
+ hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"]
+ hfToken = hfToken ?? "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"
+ }
+
func cosineSimilarity<T: FloatingPoint>(between vectorA: [T], and vectorB: [T]) -> T {
guard vectorA.count == vectorB.count else {
fatalError("Vectors must be of the same length.")
@@ -23,11 +38,11 @@ final class TokenizerTests: XCTestCase {
return dotProduct / (magnitudeA * magnitudeB)
}
- func testTextEmbeddings() async throws {
+ func testTextEmbeddings(forModel modelName: String) async throws {
- let api = HubApi(hfToken: "xxx")
+ let api = HubApi(hfToken: hfToken)
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: "unum-cloud/uform3-image-text-english-small",
hubApi: api
)
@@ -40,7 +55,7 @@ final class TokenizerTests: XCTestCase {
var textEmbeddings: [[Float32]] = []
for text in texts {
- let embedding: [Float32] = try textModel.forward(with: text).asFloats()
+ let embedding: [Float32] = try textModel.encode(text).asFloats()
textEmbeddings.append(embedding)
}
@@ -60,36 +75,47 @@ final class TokenizerTests: XCTestCase {
)
}
- func testImageEmbeddings() async throws {
+ func testTextEmbeddings() async throws {
+ for model in [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+ ] {
+ try await testTextEmbeddings(forModel: model)
+ }
+ }
+
+ func testImageEmbeddings(forModel modelName: String) async throws {
// One option is to use a local model repository.
//
// let root = "uform/"
// let textModel = try TextEncoder(
- // modelPath: root + "uform-vl-english-large-text.mlpackage",
+ // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage",
// configPath: root + "uform-vl-english-large-text.json",
// tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json"
// )
// let imageModel = try ImageEncoder(
- // modelPath: root + "uform-vl-english-large-image.mlpackage",
+ // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage",
// configPath: root + "uform-vl-english-large-image.json"
// )
//
// A better option is to fetch directly from HuggingFace, similar to how users would do that:
- let api = HubApi(hfToken: "xxx")
+ let api = HubApi(hfToken: hfToken)
let textModel = try await TextEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: modelName,
hubApi: api
)
let imageModel = try await ImageEncoder(
- modelName: "unum-cloud/uform-vl2-english-small",
+ modelName: modelName,
hubApi: api
)
let texts = [
"A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.",
"A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.",
- "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
+ "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.",
"This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.",
"The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.",
]
@@ -115,9 +141,9 @@ final class TokenizerTests: XCTestCase {
)
}
- let textEmbedding: [Float32] = try textModel.forward(with: text).asFloats()
+ let textEmbedding: [Float32] = try textModel.encode(text).asFloats()
textEmbeddings.append(textEmbedding)
- let imageEmbedding: [Float32] = try imageModel.forward(with: cgImage).asFloats()
+ let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats()
imageEmbeddings.append(imageEmbedding)
}
@@ -143,4 +169,15 @@ final class TokenizerTests: XCTestCase {
}
}
+ func testImageEmbeddings() async throws {
+ for model in [
+ "unum-cloud/uform3-image-text-english-small",
+ "unum-cloud/uform3-image-text-english-base",
+ "unum-cloud/uform3-image-text-english-large",
+ "unum-cloud/uform3-image-text-multilingual-base",
+ ] {
+ try await testImageEmbeddings(forModel: model)
+ }
+ }
+
}
diff --git a/swift/README.md b/swift/README.md
new file mode 100644
index 0000000..8fa0eb8
--- /dev/null
+++ b/swift/README.md
@@ -0,0 +1,73 @@
+# UForm Swift SDK
+
+UForm offers first-party support for Swift.
+To get started, add UForm to your project using Swift Package Manager.
+
+```bash
+swift package init --type executable
+swift package add uform
+```
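+
+If the `swift package add` subcommand is not available in your toolchain, the dependency can be declared in `Package.swift` instead. A minimal sketch, assuming the package is hosted at `https://github.com/unum-cloud/uform` and exposes a `UForm` library product; pin whichever release you actually need:
+
+```swift
+// swift-tools-version:5.9
+import PackageDescription
+
+let package = Package(
+    name: "MyApp",
+    dependencies: [
+        // Assumed repository URL and version; adjust both to your setup.
+        .package(url: "https://github.com/unum-cloud/uform", from: "2.0.0")
+    ],
+    targets: [
+        .executableTarget(
+            name: "MyApp",
+            dependencies: [.product(name: "UForm", package: "uform")]
+        )
+    ]
+)
+```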
+
+Then, import UForm in your Swift code:
+
+```swift
+import UForm
+```
+
+## Embeddings
+
+### Text Embeddings
+
+```swift
+let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
+let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie."
+let textEmbedding: Embedding = try textModel.encode(text)
+let textVector: [Float32] = textEmbedding.asFloats()
+```
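+
+The same `TextEncoder` instance can be reused across many strings. A minimal sketch of embedding a small batch of captions (the captions below are just placeholders):
+
+```swift
+let captions = [
+    "A dog runs along a sandy beach at sunset.",
+    "A bowl of fresh raspberries on a wooden table.",
+]
+var captionVectors: [[Float32]] = []
+for caption in captions {
+    // `encode` tokenizes, pads, and runs the Core ML text model under the hood.
+    captionVectors.append(try textModel.encode(caption).asFloats())
+}
+```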
+
+### Image Embeddings
+
+```swift
+let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small")
+let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true"
+guard let url = URL(string: imageURL),
+ let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil),
+    let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil)
+else {
+    fatalError("Could not load image from URL: \(imageURL)")
+}
+
+let imageEmbedding: Embedding = try imageModel.encode(cgImage)
+let imageVector: [Float32] = imageEmbedding.asFloats()
+```
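+
+If the image lives on disk rather than behind a URL, the same `CGImageSource` APIs apply. A short sketch with a placeholder file path:
+
+```swift
+let fileURL = URL(fileURLWithPath: "images/bbq-on-beach.jpg")
+guard let source = CGImageSourceCreateWithURL(fileURL as CFURL, nil),
+    let localImage = CGImageSourceCreateImageAtIndex(source, 0, nil)
+else {
+    fatalError("Could not load image from disk.")
+}
+let localVector: [Float32] = try imageModel.encode(localImage).asFloats()
+```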
+
+### Computing Distances
+
+There are several ways to compute distances between embeddings, once you have them.
+Naive Swift code might look like this:
+
+```swift
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+ let dotProduct = zip(a, b).map(*).reduce(0, +)
+ let normA = sqrt(a.map { $0 * $0 }.reduce(0, +))
+ let normB = sqrt(b.map { $0 * $0 }.reduce(0, +))
+ return dotProduct / (normA * normB)
+}
+```
+
+A faster way to compute distances is to use the Accelerate framework:
+
+```swift
+import Accelerate
+
+func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 {
+ var result: Float32 = 0
+ var aNorm: Float32 = 0
+ var bNorm: Float32 = 0
+ vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count))
+ vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count))
+ vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count))
+ return result / sqrt(aNorm * bNorm)
+}
+```
+
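+With either implementation in place, the text and image vectors from the sections above can be compared directly:
+
+```swift
+let similarity = cosineSimilarity(textVector, imageVector)
+print("Text-to-image similarity: \(similarity)")
+```
+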
+An even faster approach is to use USearch or SimSIMD, which work not only with `Float32` and `Float64`, but also with `Float16`, `Int8`, and binary embeddings.
diff --git a/yarn.lock b/yarn.lock
new file mode 100644
index 0000000..5ab5bbe
--- /dev/null
+++ b/yarn.lock
@@ -0,0 +1,594 @@
+# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY.
+# yarn lockfile v1
+
+
+"@huggingface/hub@^0.14.8":
+ version "0.14.8"
+ resolved "https://registry.npmjs.org/@huggingface/hub/-/hub-0.14.8.tgz"
+ integrity sha512-vdJRham99E5Uzsc4rO0gTz0ykafmx6V78pgPpJ7LGz5X+P2exe/izPFndqczAzy8jVWN55Jjtnuqg+Y0zrjc+Q==
+ dependencies:
+ hash-wasm "^4.9.0"
+
+"@huggingface/jinja@^0.2.2":
+ version "0.2.2"
+ resolved "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz"
+ integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==
+
+"@protobufjs/aspromise@^1.1.1", "@protobufjs/aspromise@^1.1.2":
+ version "1.1.2"
+ resolved "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz"
+ integrity sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==
+
+"@protobufjs/base64@^1.1.2":
+ version "1.1.2"
+ resolved "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz"
+ integrity sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==
+
+"@protobufjs/codegen@^2.0.4":
+ version "2.0.4"
+ resolved "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz"
+ integrity sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==
+
+"@protobufjs/eventemitter@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz"
+ integrity sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==
+
+"@protobufjs/fetch@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz"
+ integrity sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==
+ dependencies:
+ "@protobufjs/aspromise" "^1.1.1"
+ "@protobufjs/inquire" "^1.1.0"
+
+"@protobufjs/float@^1.0.2":
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz"
+ integrity sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==
+
+"@protobufjs/inquire@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz"
+ integrity sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==
+
+"@protobufjs/path@^1.1.2":
+ version "1.1.2"
+ resolved "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz"
+ integrity sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==
+
+"@protobufjs/pool@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz"
+ integrity sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==
+
+"@protobufjs/utf8@^1.1.0":
+ version "1.1.0"
+ resolved "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz"
+ integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==
+
+"@types/long@^4.0.1":
+ version "4.0.2"
+ resolved "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz"
+ integrity sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA==
+
+"@types/node@>=13.7.0":
+ version "20.12.7"
+ resolved "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz"
+ integrity sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg==
+ dependencies:
+ undici-types "~5.26.4"
+
+"@xenova/transformers@^2.17.0":
+ version "2.17.0"
+ resolved "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.0.tgz"
+ integrity sha512-usmDut7hwnrc4EqP59cboYqE6C8up63SqMy3E9RjG9nCsOhrsLndEU7DMu+bZ9R+HcAI8jRGabTIxH+B6agBVA==
+ dependencies:
+ "@huggingface/jinja" "^0.2.2"
+ onnxruntime-web "1.14.0"
+ sharp "^0.32.0"
+ optionalDependencies:
+ onnxruntime-node "1.14.0"
+
+b4a@^1.6.4:
+ version "1.6.6"
+ resolved "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz"
+ integrity sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg==
+
+bare-events@^2.0.0, bare-events@^2.2.0:
+ version "2.2.2"
+ resolved "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz"
+ integrity sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ==
+
+bare-fs@^2.1.1:
+ version "2.2.3"
+ resolved "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz"
+ integrity sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw==
+ dependencies:
+ bare-events "^2.0.0"
+ bare-path "^2.0.0"
+ streamx "^2.13.0"
+
+bare-os@^2.1.0:
+ version "2.2.1"
+ resolved "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz"
+ integrity sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w==
+
+bare-path@^2.0.0, bare-path@^2.1.0:
+ version "2.1.1"
+ resolved "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz"
+ integrity sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A==
+ dependencies:
+ bare-os "^2.1.0"
+
+base64-js@^1.3.1:
+ version "1.5.1"
+ resolved "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz"
+ integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==
+
+bl@^4.0.3:
+ version "4.1.0"
+ resolved "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz"
+ integrity sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==
+ dependencies:
+ buffer "^5.5.0"
+ inherits "^2.0.4"
+ readable-stream "^3.4.0"
+
+buffer@^5.5.0:
+ version "5.7.1"
+ resolved "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz"
+ integrity sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==
+ dependencies:
+ base64-js "^1.3.1"
+ ieee754 "^1.1.13"
+
+chownr@^1.1.1:
+ version "1.1.4"
+ resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz"
+ integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==
+
+color-convert@^2.0.1:
+ version "2.0.1"
+ resolved "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz"
+ integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==
+ dependencies:
+ color-name "~1.1.4"
+
+color-name@^1.0.0, color-name@~1.1.4:
+ version "1.1.4"
+ resolved "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz"
+ integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==
+
+color-string@^1.9.0:
+ version "1.9.1"
+ resolved "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz"
+ integrity sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==
+ dependencies:
+ color-name "^1.0.0"
+ simple-swizzle "^0.2.2"
+
+color@^4.2.3:
+ version "4.2.3"
+ resolved "https://registry.npmjs.org/color/-/color-4.2.3.tgz"
+ integrity sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A==
+ dependencies:
+ color-convert "^2.0.1"
+ color-string "^1.9.0"
+
+decompress-response@^6.0.0:
+ version "6.0.0"
+ resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz"
+ integrity sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ==
+ dependencies:
+ mimic-response "^3.1.0"
+
+deep-extend@^0.6.0:
+ version "0.6.0"
+ resolved "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz"
+ integrity sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA==
+
+detect-libc@^2.0.0, detect-libc@^2.0.2:
+ version "2.0.3"
+ resolved "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz"
+ integrity sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw==
+
+end-of-stream@^1.1.0, end-of-stream@^1.4.1:
+ version "1.4.4"
+ resolved "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz"
+ integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==
+ dependencies:
+ once "^1.4.0"
+
+expand-template@^2.0.3:
+ version "2.0.3"
+ resolved "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz"
+ integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg==
+
+fast-fifo@^1.1.0, fast-fifo@^1.2.0:
+ version "1.3.2"
+ resolved "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz"
+ integrity sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==
+
+flatbuffers@^1.12.0:
+ version "1.12.0"
+ resolved "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz"
+ integrity sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ==
+
+fs-constants@^1.0.0:
+ version "1.0.0"
+ resolved "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz"
+ integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==
+
+github-from-package@0.0.0:
+ version "0.0.0"
+ resolved "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz"
+ integrity sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw==
+
+guid-typescript@^1.0.9:
+ version "1.0.9"
+ resolved "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz"
+ integrity sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==
+
+hash-wasm@^4.9.0:
+ version "4.11.0"
+ resolved "https://registry.npmjs.org/hash-wasm/-/hash-wasm-4.11.0.tgz"
+ integrity sha512-HVusNXlVqHe0fzIzdQOGolnFN6mX/fqcrSAOcTBXdvzrXVHwTz11vXeKRmkR5gTuwVpvHZEIyKoePDvuAR+XwQ==
+
+ieee754@^1.1.13:
+ version "1.2.1"
+ resolved "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz"
+ integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==
+
+inherits@^2.0.3, inherits@^2.0.4:
+ version "2.0.4"
+ resolved "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz"
+ integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==
+
+ini@~1.3.0:
+ version "1.3.8"
+ resolved "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz"
+ integrity sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew==
+
+is-arrayish@^0.3.1:
+ version "0.3.2"
+ resolved "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz"
+ integrity sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==
+
+long@^4.0.0:
+ version "4.0.0"
+ resolved "https://registry.npmjs.org/long/-/long-4.0.0.tgz"
+ integrity sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA==
+
+long@^5.0.0:
+ version "5.2.3"
+ resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz"
+ integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==
+
+long@^5.2.3:
+ version "5.2.3"
+ resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz"
+ integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q==
+
+lru-cache@^6.0.0:
+ version "6.0.0"
+ resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz"
+ integrity sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA==
+ dependencies:
+ yallist "^4.0.0"
+
+mimic-response@^3.1.0:
+ version "3.1.0"
+ resolved "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz"
+ integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==
+
+minimist@^1.2.0, minimist@^1.2.3:
+ version "1.2.8"
+ resolved "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz"
+ integrity sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==
+
+mkdirp-classic@^0.5.2, mkdirp-classic@^0.5.3:
+ version "0.5.3"
+ resolved "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz"
+ integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==
+
+napi-build-utils@^1.0.1:
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz"
+ integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg==
+
+node-abi@^3.3.0:
+ version "3.57.0"
+ resolved "https://registry.npmjs.org/node-abi/-/node-abi-3.57.0.tgz"
+ integrity sha512-Dp+A9JWxRaKuHP35H77I4kCKesDy5HUDEmScia2FyncMTOXASMyg251F5PhFoDA5uqBrDDffiLpbqnrZmNXW+g==
+ dependencies:
+ semver "^7.3.5"
+
+node-addon-api@^6.1.0:
+ version "6.1.0"
+ resolved "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz"
+ integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA==
+
+once@^1.3.1, once@^1.4.0:
+ version "1.4.0"
+ resolved "https://registry.npmjs.org/once/-/once-1.4.0.tgz"
+ integrity sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==
+ dependencies:
+ wrappy "1"
+
+onnx-proto@^4.0.4:
+ version "4.0.4"
+ resolved "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz"
+ integrity sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA==
+ dependencies:
+ protobufjs "^6.8.8"
+
+onnxruntime-common@~1.14.0:
+ version "1.14.0"
+ resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz"
+ integrity sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew==
+
+onnxruntime-common@1.17.3:
+ version "1.17.3"
+ resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.17.3.tgz"
+ integrity sha512-IkbaDelNVX8cBfHFgsNADRIq2TlXMFWW+nG55mwWvQT4i0NZb32Jf35Pf6h9yjrnK78RjcnlNYaI37w394ovMw==
+
+onnxruntime-node@1.14.0:
+ version "1.14.0"
+ resolved "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz"
+ integrity sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w==
+ dependencies:
+ onnxruntime-common "~1.14.0"
+
+onnxruntime-web@^1.17.3:
+ version "1.17.3"
+ resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.17.3.tgz"
+ integrity sha512-MSDrNUWgc1biP0YzY488OJ9n/jTMS9EXysgm9Aw4CUj2A836ALbO2J1sgzguWJeVUHTlM6p7tRzo8IGAgaXWKw==
+ dependencies:
+ flatbuffers "^1.12.0"
+ guid-typescript "^1.0.9"
+ long "^5.2.3"
+ onnxruntime-common "1.17.3"
+ platform "^1.3.6"
+ protobufjs "^7.2.4"
+
+onnxruntime-web@1.14.0:
+ version "1.14.0"
+ resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz"
+ integrity sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw==
+ dependencies:
+ flatbuffers "^1.12.0"
+ guid-typescript "^1.0.9"
+ long "^4.0.0"
+ onnx-proto "^4.0.4"
+ onnxruntime-common "~1.14.0"
+ platform "^1.3.6"
+
+platform@^1.3.6:
+ version "1.3.6"
+ resolved "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz"
+ integrity sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==
+
+prebuild-install@^7.1.1:
+ version "7.1.2"
+ resolved "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.2.tgz"
+ integrity sha512-UnNke3IQb6sgarcZIDU3gbMeTp/9SSU1DAIkil7PrqG1vZlBtY5msYccSKSHDqa3hNg436IXK+SNImReuA1wEQ==
+ dependencies:
+ detect-libc "^2.0.0"
+ expand-template "^2.0.3"
+ github-from-package "0.0.0"
+ minimist "^1.2.3"
+ mkdirp-classic "^0.5.3"
+ napi-build-utils "^1.0.1"
+ node-abi "^3.3.0"
+ pump "^3.0.0"
+ rc "^1.2.7"
+ simple-get "^4.0.0"
+ tar-fs "^2.0.0"
+ tunnel-agent "^0.6.0"
+
+protobufjs@^6.8.8:
+ version "6.11.4"
+ resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz"
+ integrity sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw==
+ dependencies:
+ "@protobufjs/aspromise" "^1.1.2"
+ "@protobufjs/base64" "^1.1.2"
+ "@protobufjs/codegen" "^2.0.4"
+ "@protobufjs/eventemitter" "^1.1.0"
+ "@protobufjs/fetch" "^1.1.0"
+ "@protobufjs/float" "^1.0.2"
+ "@protobufjs/inquire" "^1.1.0"
+ "@protobufjs/path" "^1.1.2"
+ "@protobufjs/pool" "^1.1.0"
+ "@protobufjs/utf8" "^1.1.0"
+ "@types/long" "^4.0.1"
+ "@types/node" ">=13.7.0"
+ long "^4.0.0"
+
+protobufjs@^7.2.4:
+ version "7.2.6"
+ resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.6.tgz"
+ integrity sha512-dgJaEDDL6x8ASUZ1YqWciTRrdOuYNzoOf27oHNfdyvKqHr5i0FV7FSLU+aIeFjyFgVxrpTOtQUi0BLLBymZaBw==
+ dependencies:
+ "@protobufjs/aspromise" "^1.1.2"
+ "@protobufjs/base64" "^1.1.2"
+ "@protobufjs/codegen" "^2.0.4"
+ "@protobufjs/eventemitter" "^1.1.0"
+ "@protobufjs/fetch" "^1.1.0"
+ "@protobufjs/float" "^1.0.2"
+ "@protobufjs/inquire" "^1.1.0"
+ "@protobufjs/path" "^1.1.2"
+ "@protobufjs/pool" "^1.1.0"
+ "@protobufjs/utf8" "^1.1.0"
+ "@types/node" ">=13.7.0"
+ long "^5.0.0"
+
+pump@^3.0.0:
+ version "3.0.0"
+ resolved "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz"
+ integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==
+ dependencies:
+ end-of-stream "^1.1.0"
+ once "^1.3.1"
+
+queue-tick@^1.0.1:
+ version "1.0.1"
+ resolved "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz"
+ integrity sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag==
+
+rc@^1.2.7:
+ version "1.2.8"
+ resolved "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz"
+ integrity sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw==
+ dependencies:
+ deep-extend "^0.6.0"
+ ini "~1.3.0"
+ minimist "^1.2.0"
+ strip-json-comments "~2.0.1"
+
+readable-stream@^3.1.1, readable-stream@^3.4.0:
+ version "3.6.2"
+ resolved "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz"
+ integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==
+ dependencies:
+ inherits "^2.0.3"
+ string_decoder "^1.1.1"
+ util-deprecate "^1.0.1"
+
+safe-buffer@^5.0.1, safe-buffer@~5.2.0:
+ version "5.2.1"
+ resolved "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz"
+ integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==
+
+semver@^7.3.5, semver@^7.5.4:
+ version "7.6.0"
+ resolved "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz"
+ integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg==
+ dependencies:
+ lru-cache "^6.0.0"
+
+sharp@^0.32.0:
+ version "0.32.6"
+ resolved "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz"
+ integrity sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w==
+ dependencies:
+ color "^4.2.3"
+ detect-libc "^2.0.2"
+ node-addon-api "^6.1.0"
+ prebuild-install "^7.1.1"
+ semver "^7.5.4"
+ simple-get "^4.0.1"
+ tar-fs "^3.0.4"
+ tunnel-agent "^0.6.0"
+
+simple-concat@^1.0.0:
+ version "1.0.1"
+ resolved "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz"
+ integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==
+
+simple-get@^4.0.0, simple-get@^4.0.1:
+ version "4.0.1"
+ resolved "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz"
+ integrity sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA==
+ dependencies:
+ decompress-response "^6.0.0"
+ once "^1.3.1"
+ simple-concat "^1.0.0"
+
+simple-swizzle@^0.2.2:
+ version "0.2.2"
+ resolved "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz"
+ integrity sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==
+ dependencies:
+ is-arrayish "^0.3.1"
+
+streamx@^2.13.0, streamx@^2.15.0:
+ version "2.16.1"
+ resolved "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz"
+ integrity sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ==
+ dependencies:
+ fast-fifo "^1.1.0"
+ queue-tick "^1.0.1"
+ optionalDependencies:
+ bare-events "^2.2.0"
+
+string_decoder@^1.1.1:
+ version "1.3.0"
+ resolved "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz"
+ integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==
+ dependencies:
+ safe-buffer "~5.2.0"
+
+strip-json-comments@~2.0.1:
+ version "2.0.1"
+ resolved "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz"
+ integrity sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ==
+
+tar-fs@^2.0.0:
+ version "2.1.1"
+ resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz"
+ integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==
+ dependencies:
+ chownr "^1.1.1"
+ mkdirp-classic "^0.5.2"
+ pump "^3.0.0"
+ tar-stream "^2.1.4"
+
+tar-fs@^3.0.4:
+ version "3.0.5"
+ resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz"
+ integrity sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg==
+ dependencies:
+ pump "^3.0.0"
+ tar-stream "^3.1.5"
+ optionalDependencies:
+ bare-fs "^2.1.1"
+ bare-path "^2.1.0"
+
+tar-stream@^2.1.4:
+ version "2.2.0"
+ resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz"
+ integrity sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==
+ dependencies:
+ bl "^4.0.3"
+ end-of-stream "^1.4.1"
+ fs-constants "^1.0.0"
+ inherits "^2.0.3"
+ readable-stream "^3.1.1"
+
+tar-stream@^3.1.5:
+ version "3.1.7"
+ resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz"
+ integrity sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==
+ dependencies:
+ b4a "^1.6.4"
+ fast-fifo "^1.2.0"
+ streamx "^2.15.0"
+
+tunnel-agent@^0.6.0:
+ version "0.6.0"
+ resolved "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz"
+ integrity sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w==
+ dependencies:
+ safe-buffer "^5.0.1"
+
+undici-types@~5.26.4:
+ version "5.26.5"
+ resolved "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz"
+ integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==
+
+util-deprecate@^1.0.1:
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz"
+ integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==
+
+wrappy@1:
+ version "1.0.2"
+ resolved "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz"
+ integrity sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==
+
+yallist@^4.0.0:
+ version "4.0.0"
+ resolved "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz"
+ integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==