diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 4170c99..512b641 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -113,10 +113,14 @@ jobs: uses: actions/checkout@v4 with: ref: "main" + - name: Install dependencies + run: | + sudo apt update && + sudo apt install -y doxygen graphviz dia git && + pip install sphinx==5.3.0 sphinx-js==3.2.1 breathe==4.35.0 furo==2023.3.27 m2r2==0.3.3.post2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery==4.1 && + npm install -g jsdoc - name: Setup GitHub Pages uses: actions/configure-pages@v2 - - name: Install dependencies - run: sudo apt update && sudo apt install -y doxygen graphviz dia git && pip install sphinx==7.1.2 breathe furo m2r2 sphinxcontrib-googleanalytics==0.2.dev20220708 sphinxcontrib-jquery toml - name: Install UForm from PyPi run: pip install uform - name: Build documentation diff --git a/.gitignore b/.gitignore index af7d4af..1bbdc30 100755 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,21 @@ test build/ package-lock.json *.egg-info -*.onnx __pycache__ .build -.swiftpm \ No newline at end of file +.swiftpm +.hf_token + +dictionary* +vocab* + +# Tensors & ML Model +*.onnx +*.pt +*.safetensors +*.mlpackage + +# NodeJS +node_modules +node_build +yarn-error.log diff --git a/.vscode/launch.json b/.vscode/launch.json index 59eb78c..92a1844 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -5,11 +5,29 @@ "version": "0.2.0", "configurations": [ { - "name": "Python Debugger: Current File with Arguments", + "name": "Python Debugger", "type": "debugpy", "request": "launch", "program": "${file}", "console": "integratedTerminal", + }, + { + "name": "PyTest Debugger", + "type": "debugpy", + "request": "launch", + "program": "pytest", + "console": "integratedTerminal", + "args": [ + "${file}", + "-s", + "-x", + ], + }, + { + "name": "NodeJS Debugger", + "type": "node-terminal", + "request": "launch", + "command": "npm run test", } ] } \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index a6cceb8..3275f93 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,8 +1,10 @@ { "cSpell.words": [ "arange", + "astype", "CFURL", "coreml", + "crossattn", "cumsum", "dtype", "embs", @@ -19,26 +21,37 @@ "ndarray", "numpy", "ONNX", + "onnxconverter", "onnxruntime", + "opset", "packbits", "preprocess", "pretrained", "probs", "pypi", + "pytest", + "randn", "rerank", "reranker", "reranking", + "sandbeach", "sess", "SIMD", "softmax", + "Tensorrt", + "torchvision", "transfromers", "uform", "unimodal", "unsqueeze", - "Vardanian" + "Vardanian", + "whitespaces" ], "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" }, - "python.formatting.provider": "none" + "python.formatting.provider": "none", + "window.autoDetectColorScheme": true, + "workbench.colorTheme": "Default Dark+", + "workbench.preferredDarkColorTheme": "Default Dark+" } \ No newline at end of file diff --git a/BENCHMARKS.md b/BENCHMARKS.md new file mode 100644 index 0000000..07ff0bb --- /dev/null +++ b/BENCHMARKS.md @@ -0,0 +1,182 @@ +# UForm Model Benchmarks + +## Accuracy + +### Embedding Models + +Few retrieval benchmarks exist for multimodal embeddings. +The most famous ones for English are "MS-COCO" and "Flickr30k". +Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. 
+ +| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | +| :-------- | ---------: | ---------: | ----------: | +| Flickr | 0.727 | 0.915 | 0.949 | +| MS-COCO ¹ | 0.510 | 0.761 | 0.838 | + +For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². +Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. + +| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | +| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: | +| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | +| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | +| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | +| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | +| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | +| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | + + +All languages: + +| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | +| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | +| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | +| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M | +| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | +| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | +| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | +| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M | +| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M | +| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | +| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M | +| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M | +| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M | +| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M | +| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M | +| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M | +| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M | +| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M | +| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | +| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M | +| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M | +| Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M | +| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M | +| | | | | | | | | +| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - | +| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - | +| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | +| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | + +### 
Generative Models + +| Model | LLM Size | SQA | MME | MMBench | Average¹ | +| :------------------- | -------: | ---: | -----: | ------: | -------: | +| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 | +| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 | +| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 | + +For captioning evaluation we measure CLIPScore and RefCLIPScore³. + +| Model | Size | Caption Length | CLIPScore | RefCLIPScore | +| :---------------------------------- | ---: | -------------: | --------: | -----------: | +| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | +| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | +| | | | | | +| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | +| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | +| | | | | | +| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | +| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | +| | | | | | +| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | +| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | + +Results for VQAv2 evaluation. + +| Model | Size | Accuracy | +| :------------------------- | ---: | -------: | +| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 | +| `unum-cloud/uform-gen` | 1.5B | 66.5 | + +
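+
+For intuition, CLIPScore is a rescaled, clipped cosine similarity between a CLIP image embedding and the candidate caption embedding, while RefCLIPScore additionally factors in similarity to reference captions.
+Below is a minimal sketch of the per-image metric, assuming the Hugging Face `transformers` CLIP API and the conventional weight of 2.5; it is an illustration, not the exact evaluation script used here.
+
+```python
+import torch
+from PIL import Image
+from transformers import CLIPModel, CLIPProcessor
+
+# Any CLIP checkpoint works for this sketch; the numbers above rely on a larger model (see note ³ below).
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+def clip_score(image: Image.Image, caption: str, weight: float = 2.5) -> float:
+    # Embed the image and the caption, then rescale the clipped cosine similarity.
+    inputs = processor(text=[caption], images=[image], return_tensors="pt", padding=True, truncation=True)
+    with torch.inference_mode():
+        image_embedding = model.get_image_features(pixel_values=inputs["pixel_values"])
+        text_embedding = model.get_text_features(
+            input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
+    cosine = torch.nn.functional.cosine_similarity(image_embedding, text_embedding).item()
+    return weight * max(cosine, 0.0)
+```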
+ +> ¹ Train split was in training data.
+> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
+> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model. + +## Speed + +### Embedding Models + +UForm comes pre-packaged with speed benchmarks for the models. + +```bash +$ python python/scripts/bench_encoders.py --help +usage: bench_encoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] + +options: + -h, --help show this help message and exit + --filter-out FILTER_OUT + Filter out models, backends, or devices with a Regular Expression. + --batch-size BATCH_SIZE + Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. +``` + +Running that script for a fairly small batch size of 50 on an Nvidia H100 GPU and + +| Model Name | Device | Backend | Images Preprocessed/s | Images Encoded/s | Texts Preprocessed/s | Texts Encoded/s | +| :--------------------------------------------- | :----- | :------ | --------------------: | :--------------- | :------------------- | :-------------- | +| unum-cloud/uform3-image-text-english-base | cpu | torch | 23.03 | 76.57 | 15,978.03 | 562.28 | +| unum-cloud/uform3-image-text-english-base | cpu | onnx | 23.11 | 77.75 | 13,880.27 | 1,067.40 | +| unum-cloud/uform3-image-text-english-base | cuda | torch | 22.87 | 1,060.40 | 12,348.94 | 13,242.83 | +| unum-cloud/uform3-image-text-english-large | cpu | torch | 22.41 | 10.84 | 13,350.45 | 145.12 | +| unum-cloud/uform3-image-text-english-large | cpu | onnx | 23.13 | 19.60 | 18,031.85 | 960.09 | +| unum-cloud/uform3-image-text-english-large | cuda | torch | 22.78 | 244.86 | 13,226.40 | 10,204.04 | +| unum-cloud/uform3-image-text-english-small | cpu | torch | 20.08 | 71.68 | 12,147.05 | 249.63 | +| unum-cloud/uform3-image-text-english-small | cpu | onnx | 22.84 | 195.27 | 13,636.99 | 1,385.25 | +| unum-cloud/uform3-image-text-english-small | cuda | torch | 22.63 | 2,662.16 | 14,731.18 | 14,694.87 | +| unum-cloud/uform3-image-text-multilingual-base | cpu | torch | 22.98 | 64.28 | 10,129.27 | 209.76 | +| unum-cloud/uform3-image-text-multilingual-base | cpu | onnx | 23.06 | 66.81 | 8,963.13 | 1,104.32 | +| unum-cloud/uform3-image-text-multilingual-base | cuda | torch | 22.88 | 1,051.95 | 15,639.72 | 12,416.12 | + +If you are interested in performance numbers on consumer grade hardware, compared to third-party models, here are some rough estimates. +On Nvidia RTX 3090: + +| Model | Multilingual | Speed | Speedup | +| :----------------------------------------------- | -----------: | ---------------------: | ---------: | +| `bert-base-uncased` | No | 1'612 sequences/second | | +| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 | +| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | +| `unum-cloud/uform3-image-text-multilingual-base` | __Yes__ | 6'809 sequences/second | __x 4.22__ | + +Given the small size of the model it also work well on mobile devices. +On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. 
+ +| Device | Speed | Device TDP | Efficiency | +| :--------------------- | ------------------: | ---------: | ----------------: | +| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule | +| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | +| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | +| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | + +### Generative Models + +```bash +$ python python/scripts/bench_decoders.py --help +usage: bench_decoders.py [-h] [--filter-out FILTER_OUT] [--batch-size BATCH_SIZE] + +options: + -h, --help show this help message and exit + --batch-size BATCH_SIZE + Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU. + --max-length MAX_LENGTH + Maximum length of the generated text in tokens. +``` + +On Nvidia H100 GPU, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. + +| Model | Size | Decoding Speed | Decoding Parallel Streams | +| :---------------------------------- | ----: | -------------: | ---------------------------: | +| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 141 tokens/s | ~ 4 K tokens/s (32 streams) | +| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 211 tokens/s | ~ 2 K tokens/s (32 streams) | +| `unum-cloud/uform-gen` | 1.5 B | ~ 252 tokens/s | ~ 3 K tokens/s (128 streams) | +| `unum-cloud/uform-gen2-dpo` | 1.2 B | ~ 372 tokens/s | ~ 10 K tokens/s (64 streams) | + +On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. + +| Model | Size | Decoding Speed | Speedup | +| :---------------------------------- | ----: | -------------: | --------: | +| `llava-hf/llava-1.5-7b-hf` | 7 B | ~ 40 tokens/s | | +| `Salesforce/instructblip-vicuna-7b` | 7 B | ~ 40 tokens/s | | +| `unum-cloud/uform-gen` | 1.5 B | ~ 140 tokens/s | __x 3.5__ | + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 181d9e2..65e0b26 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -7,12 +7,11 @@ We welcome contributions to UForm! Before submitting any changes, please make sure that the tests pass. ```sh -pip install -e . # For core dependencies - +pip install -e ".[dev]" # For development dependencies pip install -e ".[torch]" # For PyTorch pip install -e ".[onnx]" # For ONNX on CPU pip install -e ".[onnx-gpu]" # For ONNX on GPU, available for some platforms -pip install -e ".[torch,onnx]" # For PyTorch and ONNX Python tests +pip install -e ".[torch,onnx,onnx-gpu,dev]" # For all pytest python/scripts/ -s -x -Wd -v pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch @@ -20,6 +19,13 @@ pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loa ## Swift +To build and test the Swift package, use the following command: + +```bash +swift build +swift test +``` + Swift formatting is enforced with `swift-format` default utility from Apple. To install and run it on all the files in the project, use the following command: @@ -30,3 +36,31 @@ swift-format . -i -r The style is controlled by the `.swift-format` JSON file in the root of the repository. As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings. 
+ +## JavaScript + +For rapid development you can avoid the TypeScript precompilation step: + +```sh +npm install -g ts-node +ts-node javascript/embeddings.mts +``` + +Before submitting any changes, please make sure that the tests pass. + +```sh +npm install +npm run test +``` + +## Benchmarking + +If you want to double check, how fast the model may work on your hardware, you can clone the library and repeat the benchmarks locally. +The following benchmark will exclude PyTorch backend, CUDA-capable devices, and all the `-base` and `-large` models, running only the ONNX benchmarks on the CPU. + +```sh +git clone https://github.com/unum-cloud/uform --depth 1 # Clone the repository +cd uform && pip install -e ".[torch,onnx,onnx-gpu,dev]" # Install all dependencies +python python/scripts/bench_encoders.py --filter-out "torch|cuda|base|large" +``` + diff --git a/Package.resolved b/Package.resolved index fe63c94..6e3b1f7 100644 --- a/Package.resolved +++ b/Package.resolved @@ -14,7 +14,7 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/ashvardanian/swift-transformers", "state" : { - "revision" : "9ef46a51eca46978b62773f8887926dfe72b0ab4" + "revision" : "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" } } ], diff --git a/Package.swift b/Package.swift index 6ac8372..c2f7fe7 100644 --- a/Package.swift +++ b/Package.swift @@ -19,7 +19,7 @@ let package = Package( dependencies: [ .package( url: "https://github.com/ashvardanian/swift-transformers", - revision: "9ef46a51eca46978b62773f8887926dfe72b0ab4" + revision: "89fb5d97e1df347f9f588f62fc538dcad6fdb16c" ) ], targets: [ @@ -29,13 +29,13 @@ let package = Package( .product(name: "Transformers", package: "swift-transformers") ], path: "swift", - exclude: ["EmbeddingsTests.swift"] + exclude: ["EncodersTests.swift"] ), .testTarget( name: "UFormTests", dependencies: ["UForm"], path: "swift", - sources: ["EmbeddingsTests.swift"] + sources: ["EncodersTests.swift"] ), ] ) diff --git a/README.md b/README.md index 031c484..8484b0f 100755 --- a/README.md +++ b/README.md @@ -20,18 +20,24 @@ For Content Understanding and Generation

Multimodal Embeddings from 64 to 768 Dimensions • 1B Parameter Chat
-Short Texts • Images • 🔜 Video Clips +Short Texts • Images • 🔜 Video Clips • 🔜 Long Documents
-PyTorch • ONNX +ONNX • CoreML • PyTorch +
+Python + • +JavaScript + • +Swift

--- -![](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true) +![UForm Chat Preview](https://github.com/ashvardanian/usearch-images/blob/main/assets/uform-gen-preview.jpg?raw=true) Welcome to UForm, a __multimodal__ AI library that's as versatile as it is efficient. UForm [tiny embedding models](#encoder) will help you understand and search visual and textual content across various languages. -UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are also capable of image captioning and Visual Question Answering (VQA). +UForm [small generative models](#decoder), on the other hand, don't only support conversational and chat use-cases, but are great for fast image captioning and Visual Question Answering (VQA). With compact __custom pre-trained transformer models__, this can run anywhere from your server farm down to your smartphone. ## Features @@ -40,108 +46,167 @@ With compact __custom pre-trained transformer models__, this can run anywhere fr - __Throughput__: Thanks to the small size, the inference speed is [2-4x faster](#speed) than competitors. - __Portable__: Models come with native ONNX support, making them easy to deploy on any platform. - __Quantization Aware__: Down-cast embeddings from `f32` to `i8` without losing much recall. -- __Multilingual__: Trained on a balanced dataset, the recall is great across over [20 languages](#evaluation). +- __Multilingual__: Trained on a balanced dataset, the recall is great across over 20 languages. [usearch]: https://github.com/unum-cloud/usearch [matryoshka]: https://arxiv.org/abs/2205.13147 ## Models -### Embedding Models +For accuracy and speed benchmarks refer to the [evaluation page](https://github.com/unum-cloud/uform/blob/main/BENCHMARKS.md). -| Model | Parameters | Languages | Architecture | -| :--------------------------------------- | ---------: | --------: | -------------------------------------------: | -| [`uform-vl-english-large`][model-e-l] 🆕 | 365M | 1 | 6 text layers, ViT-L/14, 6 multimodal layers | -| [`uform-vl-english`][model-e] | 143M | 1 | 2 text layers, ViT-B/16, 2 multimodal layers | -| [`uform-vl-english-small`][model-e-s] 🆕 | 79M | 1 | 2 text layers, ViT-S/16, 2 multimodal layers | -| [`uform-vl-multilingual-v2`][model-m-v2] | 206M | 21 | 8 text layers, ViT-B/16, 4 multimodal layers | -| [`uform-vl-multilingual`][model-m] | 206M | 12 | 8 text layers, ViT-B/16, 4 multimodal layers | +### Embedding Models -[model-e-l]: https://huggingface.co/unum-cloud/uform-vl-english-large/ -[model-e]: https://huggingface.co/unum-cloud/uform-vl-english/ -[model-e-s]: https://huggingface.co/unum-cloud/uform-vl-english-small/ -[model-m]: https://huggingface.co/unum-cloud/uform-vl-multilingual/ -[model-m-v2]: https://huggingface.co/unum-cloud/uform-vl-multilingual-v2/ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Model                                 | Parameters | Languages | Architecture            |
+| :------------------------------------ | ---------: | --------: | ----------------------: |
+| `uform3-image-text-english-large` 🆕   | 365 M      | 1         | 12 layer BERT, ViT-L/14 |
+| `uform3-image-text-english-base`       | 143 M      | 1         | 4 layer BERT, ViT-B/16  |
+| `uform3-image-text-english-small` 🆕    | 79 M       | 1         | 4 layer BERT, ViT-S/16  |
+| `uform3-image-text-multilingual-base`  | 206 M      | 21        | 12 layer BERT, ViT-B/16 |
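+
+Regardless of the backend, the embeddings produced by the models above can be compared with cosine similarity, where values close to 1 indicate a likely match.
+A minimal NumPy sketch, assuming the image and text embeddings come back as 2D arrays, as in the Quick Start below:
+
+```python
+import numpy as np
+
+def cosine_similarity(image_embeddings: np.ndarray, text_embeddings: np.ndarray) -> np.ndarray:
+    # Normalize each row to unit length, then take the row-wise dot product.
+    image_embeddings = image_embeddings / np.linalg.norm(image_embeddings, axis=1, keepdims=True)
+    text_embeddings = text_embeddings / np.linalg.norm(text_embeddings, axis=1, keepdims=True)
+    return (image_embeddings * text_embeddings).sum(axis=1)
+```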
### Generative Models -| Model | Parameters | Purpose | Architecture | -| :--------------------------------- | ---------: | --------------------------: | ---------------------: | -| [`uform-gen2-dpo`][model-g2] 🆕 | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 | -| [`uform-gen2-qwen-500m`][model-g2] | 1.2B | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 | -| [`uform-gen`][model-g1] | 1.5B | Image Captioning, VQA | llama-1.3B, ViT-B/16 | + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| Model                  | Parameters | Purpose                     | Architecture           |
+| :--------------------- | ---------: | --------------------------: | ---------------------: |
+| `uform-gen2-dpo` 🆕     | 1.2 B      | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
+| `uform-gen2-qwen-500m` | 1.2 B      | Chat, Image Captioning, VQA | qwen1.5-0.5B, ViT-H/14 |
+| `uform-gen` ⚠️          | 1.5 B      | Image Captioning, VQA       | llama-1.3B, ViT-B/16   |
+ +## Quick Start Examples + +### Embedding Models -[model-g2]: https://huggingface.co/unum-cloud/uform-gen2-qwen-500m/ -[model-g1]: https://huggingface.co/unum-cloud/uform-gen/ +First, `pip install uform`. +Then, load the model: -## Producing Embeddings +```py +from uform import get_model, Modality -Add UForm to your dependencies list, or just install it locally: +processors, models = get_model('unum-cloud/uform3-image-text-english-small') -```bash -pip install uform +model_text = models[Modality.TEXT_ENCODER] +model_image = models[Modality.IMAGE_ENCODER] +processor_text = processors[Modality.TEXT_ENCODER] +processor_image = processors[Modality.IMAGE_ENCODER] ``` -Then, you can use the following code to get embeddings for text and images. -You can do that either with the PyTorch reference model or the lighter cross-platform ONNX weights. +Embed images: -```python -import uform +```py +import requests +from io import BytesIO from PIL import Image -# If you want to use the PyTorch model -model, processor = uform.get_model('unum-cloud/uform-vl-english-large') # Just English -model, processor = uform.get_model('unum-cloud/uform-vl-multilingual-v2') # 21 Languages +image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg' +image_url = Image.open(BytesIO(requests.get(image_url).content)) +image_data = processor_image(image) +image_features, image_embedding = model_image.encode(image_data, return_features=True) +``` -# If you want to use the light-weight portable ONNX model -# Available combinations: cpu & fp32, gpu & fp32, gpu & fp16 -# Check out Unum's Hugging Face space for more details: https://huggingface.co/unum-cloud -model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-small', 'cpu', 'fp32') -model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-large', 'gpu', 'fp16') +Embed queries: -text = 'a small red panda in a zoo' -image = Image.open('red_panda.jpg') +```py +text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background' +text_data = processor_text(text) +text_features, text_embedding = model_text.encode(text_data, return_features=True) +``` -image_data = processor.preprocess_image(image) -text_data = processor.preprocess_text(text) +For more details check out: -image_features, image_embedding = model.encode_image(image_data, return_features=True) -text_features, text_embedding = model.encode_text(text_data, return_features=True) -``` +- Python docs on embedding models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#embedding-models) +- JavaScript docs on embedding models in [javascript/README.md](https://github.com/unum-cloud/uform/blob/main/javascript/README.md#embedding-models) +- Swift docs on embedding models in [swift/README.md](https://github.com/unum-cloud/uform/blob/main/swift/README.md#embedding-models) + +### Generative Models -To search for similar items, the embeddings can be compared using cosine similarity. -The resulting value will fall within the range of `-1` to `1`, where `1` indicates a high likelihood of a match. -PyTorch provides a built-in function for calculating cosine similarity, while for ONNX, you can use NumPy. 
+The generative models are natively compatible with ```python -import torch.nn.functional as F +from transformers import AutoModel, AutoProcessor -similarity = F.cosine_similarity(image_embedding, text_embedding) -``` +model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) +processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True) -ONNX has no such function, but you can calculate the cosine similarity using [SimSIMD](https://github.com/ashvardanian/simsimd) or manually, with NumPy: +prompt = 'Question or Instruction' +image = Image.open('image.jpg') -```python -import numpy as np +inputs = processor(text=[prompt], images=[image], return_tensors='pt') -image_embedding = image_embedding / np.linalg.norm(image_embedding, keepdims=True, axis=1) -text_embedding = text_embedding / np.linalg.norm(text_embedding, keepdims=True, axis=1) -similarity = (image_embedding * text_embedding).sum(axis=1) +with torch.inference_mode(): + output = model.generate( + **inputs, + do_sample=False, + use_cache=True, + max_new_tokens=256, + eos_token_id=151645, + pad_token_id=processor.tokenizer.pad_token_id + ) +prompt_len = inputs['input_ids'].shape[1] +decoded_text = processor.batch_decode(output[:, prompt_len:])[0] ``` -### Reranking +For more details check out: -Once the list of nearest neighbors (best matches) is obtained, the joint multimodal embeddings, created from both text and image features, can be used to better rerank (reorder) the list. -The model can calculate a "matching score" that falls within the range of `[0, 1]`, where `1` indicates a high likelihood of a match. +- Python docs on generative models in [python/README.md](https://github.com/unum-cloud/uform/blob/main/python/README.md#generative-models) +- JavaScript docs on generative models 🔜 +- Swift docs on generative models 🔜 -```python -score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data['attention_mask'], - return_scores=True, -) -``` +## Technical Details ### Down-casting, Quantization, Matryoshka, and Slicing @@ -153,7 +218,7 @@ Similarly, for higher-dimensional embeddings (512 or 768), a common strategy is ```python import numpy as np -f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy() +f32_embedding: np.ndarray = model.encode_text(text_data, return_features=False) f16_embedding: np.ndarray = f32_embedding.astype(np.float16) i8_embedding: np.ndarray = (f32_embedding * 127).astype(np.int8) b1_embedding: np.ndarray = np.packbits((f32_embedding > 0).astype(np.uint8)) @@ -164,7 +229,7 @@ Alternative approach to quantization is to use the Matryoshka embeddings, where ```python import numpy as np -large_embedding: np.ndarray = model.encode_text(text_data, return_features=False).detach().cpu().numpy() +large_embedding: np.ndarray = model.encode_text(text_data, return_features=False) small_embedding: np.ndarray = large_embedding[:, :256] tiny_embedding: np.ndarray = large_embedding[:, :64] ``` @@ -219,253 +284,16 @@ You can pick one of many supported [ONNX execution providers][onnx-providers], w [onnx-providers]: https://onnxruntime.ai/docs/execution-providers/ ---- - -The configuration process may include a few additional steps, depending on the environment. 
-When using the CUDA and TensorRT backends with CUDA 12 or newer make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository. - -```sh -wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb -sudo dpkg -i cuda-keyring_1.1-1_all.deb -sudo apt-get update -sudo apt-get -y install cuda-toolkit-12 -pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ -export CUDA_PATH="/usr/local/cuda-12/bin" -export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}" -export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" -pytest python/scripts/ -s -x -Wd -v -k onnx -``` - -[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu - -## Chat, Image Captioning and Question Answering - -UForm generative models are fully compatible with the Hugging Face Transformers library, and can be used without installing the UForm library. -Those models can be used to caption images or power multimodal chat experiences. - -```python -from transformers import AutoModel, AutoProcessor - -model = AutoModel.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True) -processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-qwen-500m', trust_remote_code=True) +### Multimodal Chat in CLI -prompt = 'Question or Instruction' -image = Image.open('image.jpg') - -inputs = processor(text=[prompt], images=[image], return_tensors='pt') - -with torch.inference_mode(): - output = model.generate( - **inputs, - do_sample=False, - use_cache=True, - max_new_tokens=256, - eos_token_id=151645, - pad_token_id=processor.tokenizer.pad_token_id - ) -prompt_len = inputs['input_ids'].shape[1] -decoded_text = processor.batch_decode(output[:, prompt_len:])[0] -``` - -You can check examples of different prompts in our [demo space](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo) - - -### Image Captioning and Question Answering - -__It is the instruction for the first version of UForm-Gen model. We highly recommend you use the new model, instructions for which you can find above.__ - - -The generative model can be used to caption images, summarize their content, or answer questions about them. -The exact behavior is controlled by prompts. - -```python -from uform.gen_model import VLMForCausalLM, VLMProcessor - -model = VLMForCausalLM.from_pretrained('unum-cloud/uform-gen') -processor = VLMProcessor.from_pretrained('unum-cloud/uform-gen') - -# [cap] Narrate the contents of the image with precision. -# [cap] Summarize the visual content of the image. -# [vqa] What is the main subject of the image? -prompt = '[cap] Summarize the visual content of the image.' -image = Image.open('zebra.jpg') - -inputs = processor(texts=[prompt], images=[image], return_tensors='pt') -with torch.inference_mode(): - output = model.generate( - **inputs, - do_sample=False, - use_cache=True, - max_new_tokens=128, - eos_token_id=32001, - pad_token_id=processor.tokenizer.pad_token_id - ) - -prompt_len = inputs['input_ids'].shape[1] -decoded_text = processor.batch_decode(output[:, prompt_len:])[0] -``` - -### Multimodal Chat - -The generative models can be used for chat-like experiences, where the user can provide both text and images as input. 
-To use that feature, you can start with the following CLI command: +The generative models can be used for chat-like experiences in the command line. +For that, you can use the `uform-chat` CLI tool, which is available in the UForm package. ```bash -uform-chat --model unum-cloud/uform-gen-chat --image=zebra.jpg -uform-chat --model unum-cloud/uform-gen-chat \ - --image="https://bit.ly/3tIVg9M" \ - --device="cuda:0" \ - --fp16 +$ pip install uform +$ uform-chat --model unum-cloud/uform-gen2-dpo --image=zebra.jpg +$ uform-chat --model unum-cloud/uform-gen2-dpo \ +> --image="https://bit.ly/3tIVg9M" \ +> --device="cuda:0" \ +> --fp16 ``` - -### Multi-GPU - -To achieve higher throughput, you can launch UForm on multiple GPUs. -For that pick the encoder of the model you want to run in parallel (`text_encoder` or `image_encoder`), and wrap it in `nn.DataParallel` (or `nn.DistributedDataParallel`). - -```python -import uform - -model, processor = uform.get_model('unum-cloud/uform-vl-english') -model_image = nn.DataParallel(model.image_encoder) - -device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') -model_image.to(device) - -_, res = model_image(images, 0) -``` - -## Evaluation - -### Embedding Models - -Few retrieval benchmarks exist for multimodal embeddings. -The most famous ones for English are "MS-COCO" and "Flickr30k". -Evaluating `uform-vl-english` model, one can expect the following numbers for search quality. - -| Dataset | Recall @ 1 | Recall @ 5 | Recall @ 10 | -| :------- | ---------: | ---------: | ----------: | -| Flickr | 0.727 | 0.915 | 0.949 | -| MS-COCO¹ | 0.510 | 0.761 | 0.838 | - - -For multilingual benchmarks, we've created the [`unum-cloud/coco-sm`](https://github.com/unum-cloud/coco-sm) repository². -Evaluating the `unum-cloud/uform-vl-multilingual-v2` model, one can expect the following metrics for text-to-image search, compared against `xlm-roberta-base-ViT-B-32` [OpenCLIP](https://github.com/mlfoundations/open_clip) model. - -| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | -| :-------- | -----------: | --------: | -----------: | --------: | ------------: | ---------: | -------: | -| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | -| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | -| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | -| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | -| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | -| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | - - -
-All languages. -
- -| Language | OpenCLIP @ 1 | UForm @ 1 | OpenCLIP @ 5 | UForm @ 5 | OpenCLIP @ 10 | UForm @ 10 | Speakers | -| :------------------- | -----------: | -----------: | -----------: | -----------: | ------------: | -----------: | -------: | -| Arabic 🇸🇦 | 22.7 | __31.7__ | 44.9 | __57.8__ | 55.8 | __69.2__ | 274 M | -| Armenian 🇦🇲 | 5.6 | __22.0__ | 14.3 | __44.7__ | 20.2 | __56.0__ | 4 M | -| Chinese 🇨🇳 | 27.3 | __32.2__ | 51.3 | __59.0__ | 62.1 | __70.5__ | 1'118 M | -| English 🇺🇸 | __37.8__ | 37.7 | 63.5 | __65.0__ | 73.5 | __75.9__ | 1'452 M | -| French 🇫🇷 | 31.3 | __35.4__ | 56.5 | __62.6__ | 67.4 | __73.3__ | 274 M | -| German 🇩🇪 | 31.7 | __35.1__ | 56.9 | __62.2__ | 67.4 | __73.3__ | 134 M | -| Hebrew 🇮🇱 | 23.7 | __26.7__ | 46.3 | __51.8__ | 57.0 | __63.5__ | 9 M | -| Hindi 🇮🇳 | 20.7 | __31.3__ | 42.5 | __57.9__ | 53.7 | __69.6__ | 602 M | -| Indonesian 🇮🇩 | 26.9 | __30.7__ | 51.4 | __57.0__ | 62.7 | __68.6__ | 199 M | -| Italian 🇮🇹 | 31.3 | __34.9__ | 56.7 | __62.1__ | 67.1 | __73.1__ | 67 M | -| Japanese 🇯🇵 | 27.4 | __32.6__ | 51.5 | __59.2__ | 62.6 | __70.6__ | 125 M | -| Korean 🇰🇷 | 24.4 | __31.5__ | 48.1 | __57.8__ | 59.2 | __69.2__ | 81 M | -| Persian 🇮🇷 | 24.0 | __28.8__ | 47.0 | __54.6__ | 57.8 | __66.2__ | 77 M | -| Polish 🇵🇱 | 29.2 | __33.6__ | 53.9 | __60.1__ | 64.7 | __71.3__ | 41 M | -| Portuguese 🇵🇹 | 31.6 | __32.7__ | 57.1 | __59.6__ | 67.9 | __71.0__ | 257 M | -| Russian 🇷🇺 | 29.9 | __33.9__ | 54.8 | __60.9__ | 65.8 | __72.0__ | 258 M | -| Spanish 🇪🇸 | 32.6 | __35.6__ | 58.0 | __62.8__ | 68.8 | __73.7__ | 548 M | -| Thai 🇹🇭 | 21.5 | __28.7__ | 43.0 | __54.6__ | 53.7 | __66.0__ | 61 M | -| Turkish 🇹🇷 | 25.5 | __33.0__ | 49.1 | __59.6__ | 60.3 | __70.8__ | 88 M | -| Ukranian 🇺🇦 | 26.0 | __30.6__ | 49.9 | __56.7__ | 60.9 | __68.1__ | 41 M | -| Vietnamese 🇻🇳 | 25.4 | __28.3__ | 49.2 | __53.9__ | 60.3 | __65.5__ | 85 M | -| | | | | | | | | -| Mean | 26.5±6.4 | __31.8±3.5__ | 49.8±9.8 | __58.1±4.5__ | 60.4±10.6 | __69.4±4.3__ | - | -| Google Translate | 27.4±6.3 | __31.5±3.5__ | 51.1±9.5 | __57.8±4.4__ | 61.7±10.3 | __69.1±4.3__ | - | -| Microsoft Translator | 27.2±6.4 | __31.4±3.6__ | 50.8±9.8 | __57.7±4.7__ | 61.4±10.6 | __68.9±4.6__ | - | -| Meta NLLB | 24.9±6.7 | __32.4±3.5__ | 47.5±10.3 | __58.9±4.5__ | 58.2±11.2 | __70.2±4.3__ | - | - -
- -### Generative Models - -| Model | LLM Size | SQA | MME | MMBench | Average¹ | -| :------------------- | -------: | ---: | -----: | ------: | -------: | -| UForm-Gen2-Qwen-500m | 0.5B | 45.5 | 880.1 | 42.0 | 29.31 | -| MobileVLM v2 | 1.4B | 52.1 | 1302.8 | 57.7 | 36.81 | -| LLaVA-Phi | 2.7B | 68.4 | 1335.1 | 59.8 | 42.95 | - -For captioning evaluation we measure CLIPScore and RefCLIPScore³. - -| Model | Size | Caption Length | CLIPScore | RefCLIPScore | -| :---------------------------------- | ---: | -------------: | --------: | -----------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | Long | 0.878 | 0.529 | -| `llava-hf/llava-1.5-7b-hf` | 7B | Short | 0.886 | 0.531 | -| | -| `Salesforce/instructblip-vicuna-7b` | 7B | Long | 0.902 | 0.534 | -| `Salesforce/instructblip-vicuna-7b` | 7B | Short | 0.848 | 0.523 | -| | -| `unum-cloud/uform-gen` | 1.5B | Long | 0.847 | 0.523 | -| `unum-cloud/uform-gen` | 1.5B | Short | 0.842 | 0.522 | -| | -| `unum-cloud/uform-gen-chat` | 1.5B | Long | 0.860 | 0.525 | -| `unum-cloud/uform-gen-chat` | 1.5B | Short | 0.858 | 0.525 | - -Results for VQAv2 evaluation. - -| Model | Size | Accuracy | -| :------------------------- | ---: | -------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | 78.5 | -| `unum-cloud/uform-gen` | 1.5B | 66.5 | - -
- -> ¹ Train split was in training data.
-> ² Lacking a broad enough evaluation dataset, we translated the [COCO Karpathy test split](https://www.kaggle.com/datasets/shtvkumar/karpathy-splits) with multiple public and proprietary translation services, averaging the scores across all sets, and breaking them down in the bottom section.
-> ³ We used `apple/DFN5B-CLIP-ViT-H-14-378` CLIP model. - -## Speed - -On Nvidia RTX 3090, the following performance is expected on text encoding. - -| Model | Multilingual | Speed | Speedup | -| :---------------------------------------- | -----------: | ---------------------: | ---------: | -| `bert-base-uncased` | No | 1'612 sequences/second | | -| `distilbert-base-uncased` | No | 3'174 sequences/second | x 1.96 | -| `sentence-transformers/all-MiniLM-L12-v2` | __Yes__ | 3'604 sequences/second | x 2.24 | -| `unum-cloud/uform-vl-multilingual-v2` | __Yes__ | 6'809 sequences/second | __x 4.22__ | - -On Nvidia RTX 3090, the following performance is expected on text token generation using `float16`, equivalent PyTorch settings, and greedy decoding. - -| Model | Size | Speed | Speedup | -| :---------------------------------- | ---: | ------------------: | --------: | -| `llava-hf/llava-1.5-7b-hf` | 7B | ~ 40 tokens/second | | -| `Salesforce/instructblip-vicuna-7b` | 7B | ~ 40 tokens/second | | -| `unum-cloud/uform-gen` | 1.5B | ~ 140 tokens/second | __x 3.5__ | - -Given the small size of the model it also work well on mobile devices. -On Apple M2 Arm chips the energy efficiency of inference can exceed that of the RTX 3090 GPU and other Ampere-generation cards. - -| Device | Speed | Device TDP | Efficiency | -| :--------------------- | ------------------: | ---------: | ----------------: | -| Nvidia RTX 3090 | ~ 140 tokens/second | < 350W | 0.40 tokens/joule | -| Apple M2 Pro unplugged | ~ 19 tokens/second | < 20W | 0.95 tokens/joule | -| Apple M2 Max unplugged | ~ 38 tokens/second | < 36W | 1.06 tokens/joule | -| Apple M2 Max plugged | ~ 56 tokens/second | < 89W | 0.63 tokens/joule | - -> [!WARNING] -> The above numbers are for reference only and are not guaranteed to be accurate. - -## License - -All models come under the same license as the code - Apache 2.0. diff --git a/docs/_static/custom.js b/docs/_static/custom.js index b909a1d..3dd0974 100644 --- a/docs/_static/custom.js +++ b/docs/_static/custom.js @@ -3,5 +3,5 @@ $(document).ready(function () { ` - $(".sidebar-brand-text").html("Unum · UForm
$(VERSION)" + github_logo) + $(".sidebar-brand-text").html("Unum · UForm
2.1.1" + github_logo) }) diff --git a/docs/benchmarks.rst b/docs/benchmarks.rst new file mode 100644 index 0000000..7683788 --- /dev/null +++ b/docs/benchmarks.rst @@ -0,0 +1,5 @@ +==================== +Benchmarks +==================== + +.. mdinclude:: ../BENCHMARKS.md \ No newline at end of file diff --git a/docs/conf.py b/docs/conf.py index acc061e..f9061f5 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -5,12 +5,11 @@ # -- Project information ----------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information -import toml project = "Unum · UForm" copyright = "2023, Unum" author = "Unum" -release = toml.load("../pyproject.toml")["project"]["version"] +release = open("../VERSION", "r").read().strip() with open("_static/custom.js", "r+") as js: content = js.read() js.seek(0) @@ -24,6 +23,7 @@ "breathe", "m2r2", "sphinx.ext.autodoc", + "sphinx_js", "sphinx.ext.autosummary", "sphinx.ext.intersphinx", "sphinx.ext.napoleon", @@ -44,6 +44,9 @@ html_static_path = ["_static"] html_css_files = ["custom.css"] html_js_files = ["custom.js"] +html_baseurl = "/docs/uform/" breathe_projects = {"UForm": "../build/xml"} breathe_default_project = "UForm" + +js_source_path = "../javascript/" diff --git a/docs/contributing.rst b/docs/contributing.rst new file mode 100644 index 0000000..48893cf --- /dev/null +++ b/docs/contributing.rst @@ -0,0 +1,5 @@ +==================== +Contributing +==================== + +.. mdinclude:: ../CONTRIBUTING.md \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 162bbee..d3da0ec 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,11 +1,25 @@ -========== +==================== Overview -========== +==================== .. mdinclude:: ../README.md -.. toctree:: +.. toctree:: :hidden: + :caption: � + + python/index + javascript/index + swift/index + +.. toctree:: + :hidden: + :caption: � + + contributing + benchmarks + +.. toctree:: + :hidden: + :caption: � - self - reference genindex diff --git a/docs/javascript/index.rst b/docs/javascript/index.rst new file mode 100644 index 0000000..771081c --- /dev/null +++ b/docs/javascript/index.rst @@ -0,0 +1,9 @@ +==================== +JavaScript SDK +==================== + + +.. mdinclude:: ../../javascript/README.md + +.. toctree:: + :hidden: diff --git a/docs/javascript/reference.rst.txt b/docs/javascript/reference.rst.txt new file mode 100644 index 0000000..356176a --- /dev/null +++ b/docs/javascript/reference.rst.txt @@ -0,0 +1,18 @@ +API Reference +==================== + +==================== +Encoders +==================== + +.. js:autoclass:: ../javascript/encoders.TextProcessor + :members: + +.. js:autoclass:: ../javascript/encoders.ImageProcessor + :members: + +.. js:autoclass:: ../javascript/encoders.TextEncoder + :members: + +.. js:autoclass:: ../javascript/encoders.ImageEncoder + :members: diff --git a/docs/python/index.rst b/docs/python/index.rst new file mode 100644 index 0000000..5f870d1 --- /dev/null +++ b/docs/python/index.rst @@ -0,0 +1,11 @@ +==================== +Python SDK +==================== + + +.. mdinclude:: ../../python/README.md + +.. toctree:: + :hidden: + + reference \ No newline at end of file diff --git a/docs/python/reference.rst b/docs/python/reference.rst new file mode 100644 index 0000000..d580583 --- /dev/null +++ b/docs/python/reference.rst @@ -0,0 +1,42 @@ +API Reference +==================== + +==================== +Root +==================== + +.. 
automodule:: uform + :members: + :undoc-members: + +==================== +Torch Encoreds +==================== + +.. automodule:: uform.torch_encoders + :members: + :undoc-members: + +==================== +Torch Processors +==================== + +.. automodule:: uform.torch_processors + :members: + :undoc-members: + +==================== +ONNX Encoders +==================== + +.. automodule:: uform.onnx_encoders + :members: + :undoc-members: + +==================== +NumPy Processors +==================== + +.. automodule:: uform.numpy_processors + :members: + :undoc-members: diff --git a/docs/reference.rst b/docs/reference.rst deleted file mode 100644 index 5828f41..0000000 --- a/docs/reference.rst +++ /dev/null @@ -1,6 +0,0 @@ -API Reference -============== - -.. automodule:: uform - :members: - :undoc-members: diff --git a/docs/swift/index.rst b/docs/swift/index.rst new file mode 100644 index 0000000..5f2e213 --- /dev/null +++ b/docs/swift/index.rst @@ -0,0 +1,6 @@ +==================== +Swift SDK +==================== + + +.. mdinclude:: ../../swift/README.md diff --git a/javascript/README.md b/javascript/README.md new file mode 100644 index 0000000..0ef5c54 --- /dev/null +++ b/javascript/README.md @@ -0,0 +1,67 @@ +# UForm for JavaScript + +UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your JavaScript applications. +Built around ONNX, the SDK is supposed to work with most runtimes and almost any hardware. + +## Installation + +There are several ways to install the UForm JavaScript SDK from NPM. + +```bash +pnpm add uform +npm add uform +yarn add uform +``` + +## Quick Start + +### Embeddings + +```js +import { getModel, Modality } from 'uform'; +import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from 'uform'; + +const { configPath, modalityPaths, tokenizerPath } = await getModel({ + modelId: 'unum-cloud/uform3-image-text-english-small', + modalities: [Modality.TextEncoder, Modality.ImageEncoder], + token: null, // Optional Hugging Face token for private models + saveDir: null, // Optional directory to save the model to +}); + +const textProcessor = new TextProcessor(configPath, tokenizerPath); +await textProcessor.init(); +const processedTexts = await textProcessor.process("a small red panda in a zoo"); + +const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); +await textEncoder.init(); +const textOutput = await textEncoder.encode(processedTexts); +assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); +await textEncoder.dispose(); + +const imageProcessor = new ImageProcessor(configPath); +await imageProcessor.init(); +const processedImages = await imageProcessor.process("path/to/image.png"); + +const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); +await imageEncoder.init(); +const imageOutput = await imageEncoder.encode(processedImages); +assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); +``` + +The `textOutput` and `imageOutput` would contain `features` and `embeddings` properties, which are the same as the `features` and `embeddings` properties in the Python SDK. +The embeddings can later be compared using the cosine similarity or other distance metrics. + +### Generative Models + +Coming soon ... + +## Technical Details + +### Faster Search + +Depending on the application, the embeddings can be down-casted to smaller numeric representations without losing much recall. 
+Independent of the quantization level, native JavaScript functionality may be too slow for large-scale search. +In such cases, consider using [USearch][github-usearch] or [SimSimD][github-simsimd]. + +[github-usearch]: https://github.com/unum-cloud/usearch +[github-simsimd]: https://github.com/ashvardanian/simsimd diff --git a/javascript/encoders.mjs b/javascript/encoders.mjs new file mode 100644 index 0000000..3c41636 --- /dev/null +++ b/javascript/encoders.mjs @@ -0,0 +1,311 @@ +import { readFileSync } from 'fs'; +import { InferenceSession, Tensor } from 'onnxruntime-node'; +import { PreTrainedTokenizer } from '@xenova/transformers'; +import sharp from 'sharp'; + +/** + * A processor for text data that prepares input for the text encoder model. + */ +class TextProcessor { + + /** + * Constructs a new TextProcessor instance. + * + * @param {string} configPath - The path to the configuration file for the text encoder. + * @param {string} tokenizerPath - The path to the tokenizer configuration file. + */ + constructor(configPath, tokenizerPath) { + this.configPath = configPath; + this.tokenizerPath = tokenizerPath; + + this.maxSeqLen = 0; + this.padTokenIdx = 0; + this.tokenizer = null; + } + + /** + * Initializes the TextProcessor by loading configurations and setting up the tokenizer. + */ + async init() { + var config = JSON.parse(readFileSync(this.configPath, { encoding: 'utf8' })); + if (config.text_encoder !== undefined) { + config = config.text_encoder; + } + + this.maxSeqLen = config.max_position_embeddings; + this.padTokenIdx = config.padding_idx; + + const tokenizerConfig = JSON.parse(readFileSync(this.tokenizerPath, { encoding: 'utf8' })); + this.tokenizer = new PreTrainedTokenizer(tokenizerConfig, config); + this.tokenizer.model_max_length = this.maxSeqLen; + this.tokenizer.pad_token_id = this.padTokenIdx; + } + + /** + * Processes a list of text strings into model-ready format, including padding and attention masks. + * + * @param {Array} texts - An array of text strings to process. + * @return {Object} The processed texts as model input features. + */ + async process(texts) { + + const encoded = await this.tokenizer(texts, { + add_special_tokens: true, + padding: 'max_length', + max_length: this.maxSeqLen, + truncation: true, + }); + + return { + 'input_ids': encoded.input_ids, + 'attention_mask': encoded.attention_mask, + }; + } +} + +/** + * An encoder for text data that uses a pre-trained model to encode text. + */ +class TextEncoder { + + /** + * Constructs a new TextEncoder instance. + * + * @param {string} modelPath - The path to the pre-trained ONNX model. + */ + constructor(modelPath) { + this.modelPath = modelPath; + this.session = null; + } + + /** + * Initializes the ONNX session with the pre-trained model. + */ + async init() { + this.session = await InferenceSession.create(this.modelPath); + } + + /** + * Releases the ONNX session resources. + */ + async dispose() { + if (this.session) { + await this.session.release(); + this.session = null; + } + } + + /** + * Encodes the input data using the pre-trained model. + * + * @param {Object} inputs - The input data containing input_ids and attention_mask. + * @return {Object} The encoded outputs from the model. 
+ */ + async encode(inputs) { + if (!this.session) { + throw new Error("Session is not initialized."); + } + + // Helper function to convert BigInt64Array to Int32Array or validate Int32Array + function ensureInt32Array(data) { + if (data instanceof Int32Array) { + return data; // Use as is if already Int32Array + } + if (data instanceof BigInt64Array) { + // Convert BigInt64Array to Int32Array, ensuring all values are in range + return new Int32Array(Array.from(data).map(bigInt => { + if (bigInt > 2147483647n || bigInt < -2147483648n) { + throw new Error("Value out of range for Int32."); + } + return Number(bigInt); // Convert BigInt to Number + })); + } + // Additional case: handle conversion from generic Arrays or other typed arrays to Int32Array + if (Array.isArray(data) || data instanceof Uint32Array || data instanceof Uint8Array) { + return new Int32Array(data); // Convert directly + } + throw new Error("Unsupported data type for tensor conversion."); + } + + // Prepare tensor data + const inputIDsData = ensureInt32Array(inputs.input_ids.data); + const attentionMaskData = ensureInt32Array(inputs.attention_mask.data); + + // Create ONNX Tensors as 'int32' + const inputIDs = new Tensor('int32', inputIDsData, inputs.input_ids.dims); + const attentionMask = new Tensor('int32', attentionMaskData, inputs.attention_mask.dims); + + // Run model inference + return this.session.run({ + input_ids: inputIDs, + attention_mask: attentionMask, + }); + } + +} + +/** + * A processor for image data that prepares images for the image encoder model. + */ +class ImageProcessor { + constructor(configPath) { + this.configPath = configPath; + } + + /** + * Initializes the ImageProcessor by loading configuration settings for image preprocessing. + */ + async init() { + var config = JSON.parse(readFileSync(this.configPath, 'utf8')); + if (config.image_encoder !== undefined) { + config = config.image_encoder; + } + + this.imageSize = config.image_size; + this.normalizationMeans = config.normalization_means; + this.normalizationDeviations = config.normalization_deviations; + + this.imageMean = new Float32Array(this.normalizationMeans); + this.imageStd = new Float32Array(this.normalizationDeviations); + } + /** + * Processes raw image data into a model-ready format, including resizing, cropping, and normalizing. + * + * @param {Buffer|Array} images - A single image or an array of images to process. + * @return {Array} The processed image data as an array of Float32Arrays. + */ + async process(images) { + const processSingle = async (image) => { + let img = sharp(image).toColorspace('srgb'); + const metadata = await img.metadata(); + const scale = this.imageSize / Math.min(metadata.width, metadata.height); + const scaledWidth = Math.ceil(metadata.width * scale); + const scaledHeight = Math.ceil(metadata.height * scale); + img = img.resize({ + width: scaledWidth, + height: scaledHeight, + fit: sharp.fit.cover, + position: sharp.strategy.entropy, + options: sharp.interpolators.bicubic + }).extract({ + left: Math.max(0, Math.floor((scaledWidth - this.imageSize) / 2)), + top: Math.max(0, Math.floor((scaledHeight - this.imageSize) / 2)), + width: this.imageSize, + height: this.imageSize + }).removeAlpha(); + + let buffer = await img.raw().toBuffer(); + let array = new Float32Array(buffer.length); + + // When we export into the `array`, we reorder the dimensions of the tensor + // from HWC to CHW, and normalize the pixel values. 
+ let channelSize = this.imageSize * this.imageSize; + for (let i = 0; i < this.imageSize * this.imageSize; i++) { + let r = buffer[i * 3]; + let g = buffer[i * 3 + 1]; + let b = buffer[i * 3 + 2]; + array[i] = (r / 255.0 - this.imageMean[0]) / this.imageStd[0]; + array[channelSize + i] = (g / 255.0 - this.imageMean[1]) / this.imageStd[1]; + array[channelSize * 2 + i] = (b / 255.0 - this.imageMean[2]) / this.imageStd[2]; + } + + return array; + }; + + if (Array.isArray(images)) { + return Promise.all(images.map(img => processSingle(img))); + } else { + return [await processSingle(images)]; + } + } +} + +/** + * An encoder for image data that uses a pre-trained model to encode images. + */ +class ImageEncoder { + constructor(modelPath, processor) { + this.modelPath = modelPath; + this.imageSize = processor.imageSize; + } + + /** + * Initializes the ONNX session with the pre-trained model. + */ + async init() { + this.session = await InferenceSession.create(this.modelPath); + } + + /** + * Releases the ONNX session resources. + */ + async dispose() { + if (this.session) { + await this.session.release(); + this.session = null; + } + } + + /** + * Encodes the processed image data using the pre-trained model. + * + * @param {Float32Array|Array} images - The processed image data. + * @return {Object} The encoded outputs from the model. + */ + async encode(images) { + if (!this.session) { + throw new Error("Session is not initialized."); + } + + // Helper function to ensure data is a Float32Array. + const ensureFloat32Array = (data) => { + if (!(data instanceof Float32Array)) { + throw new Error("Unsupported data type for tensor conversion."); + } + return data; + }; + + // Helper function to concatenate multiple Float32Arrays into a single Float32Array. + const concatFloat32Arrays = (arrays) => { + const totalLength = arrays.reduce((acc, val) => acc + val.length, 0); + const result = new Float32Array(totalLength); + let offset = 0; + for (let arr of arrays) { + result.set(arr, offset); + offset += arr.length; + } + return result; + }; + + let imagesData; + let dims; + + if (Array.isArray(images)) { + // Assuming each image in the array is a Float32Array representing an image already processed to a fixed size. + const arrays = images.map(ensureFloat32Array); + imagesData = concatFloat32Arrays(arrays); + const numImages = arrays.length; + const numChannels = 3; + const height = this.imageSize; + const width = this.imageSize; + dims = [numImages, numChannels, height, width]; + } else { + // Single image images, which is already a Float32Array. 
+ imagesData = ensureFloat32Array(images); + const numChannels = 3; + const height = this.imageSize; + const width = this.imageSize; + dims = [1, numChannels, height, width]; + } + + // Create ONNX Tensor + const imagesTensor = new Tensor('float32', imagesData, dims); + + // Run model inference + return this.session.run({ + images: imagesTensor, + }); + } +} + +export { TextProcessor, TextEncoder, ImageProcessor, ImageEncoder }; diff --git a/javascript/encoders_test.js b/javascript/encoders_test.js new file mode 100644 index 0000000..30ea96a --- /dev/null +++ b/javascript/encoders_test.js @@ -0,0 +1,233 @@ +import { existsSync, readFileSync } from 'fs'; +import { fileURLToPath } from 'url'; +import path from 'path'; +import assert from 'assert'; +import fetch from 'node-fetch'; + +import { getModel, Modality } from "./hub.mjs"; +import { TextProcessor, TextEncoder, ImageEncoder, ImageProcessor } from "./encoders.mjs"; + +// Check if the HuggingFace Hub API token is set in the environment variable. +let hf_token = process.env.HUGGINGFACE_HUB_TOKEN; +if (!hf_token) { + const dirname = path.dirname(fileURLToPath(import.meta.url)); + const tokenPath = path.join(dirname, '../', '.hf_token'); + if (existsSync(tokenPath)) { + hf_token = readFileSync(tokenPath, 'utf8').trim(); + } +} + +async function tryGettingCheckpoint(modelId, modalities) { + const { configPath, modalityPaths, tokenizerPath } = await getModel( + modelId, + modalities, + hf_token, + '.onnx' + ); + + assert(configPath !== null, "Config path should not be null"); + assert(modalityPaths !== null, "Modality paths should not be null"); + assert(tokenizerPath !== null, "Tokenizer path should not be null"); + + // Check if the file actually exists + assert(existsSync(configPath), `Config file should exist at ${configPath}`); + assert(existsSync(tokenizerPath), `Tokenizer file should exist at ${tokenizerPath}`); + for (const modalityPath of Object.values(modalityPaths)) { + assert(existsSync(modalityPath), `Modality file should exist at ${modalityPath}`); + } +} + +async function testGetCheckpoint() { + console.log("- `testGetCheckpoint`: Start"); + + try { + const modalities = [Modality.TextEncoder, Modality.ImageEncoder]; + + for (const modelId of [ + 'unum-cloud/uform3-image-text-english-small', + 'unum-cloud/uform3-image-text-english-base', + 'unum-cloud/uform3-image-text-english-large', + 'unum-cloud/uform3-image-text-multilingual-base', + ]) { + await tryGettingCheckpoint(modelId, modalities, hf_token); + } + + console.log("- `testGetCheckpoint`: Success"); + } catch (error) { + console.error("- `testGetCheckpoint`: Failed", error); + } +} + +async function tryTextEncoderForwardPass(modelId) { + const modalities = [Modality.TextEncoder]; + const { configPath, modalityPaths, tokenizerPath } = await getModel( + modelId, + modalities, + hf_token, + '.onnx' + ); + + const textProcessor = new TextProcessor(configPath, tokenizerPath); + await textProcessor.init(); + const processedTexts = await textProcessor.process("a small red panda in a zoo"); + + const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); + await textEncoder.init(); + const textOutput = await textEncoder.encode(processedTexts); + assert(textOutput.embeddings.dims.length === 2, "Output should be 2D"); + + await textEncoder.dispose(); +} + +async function tryImageEncoderForwardPass(modelId) { + const modalities = [Modality.ImageEncoder]; + const { configPath, modalityPaths } = await getModel( + modelId, + modalities, + hf_token, + '.onnx' + ); + + 
const imageProcessor = new ImageProcessor(configPath); + await imageProcessor.init(); + const processedImages = await imageProcessor.process("assets/unum.png"); + + const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); + await imageEncoder.init(); + const imageOutput = await imageEncoder.encode(processedImages); + assert(imageOutput.embeddings.dims.length === 2, "Output should be 2D"); + + await imageEncoder.dispose(); +} + +function cosineSimilarity(vecA, vecB) { + // We may be receiving a complex tensor type, so let's check if it + // has an array member named `data`. + if (vecA.data) { + vecA = vecA.data; + } + if (vecB.data) { + vecB = vecB.data; + } + + let dotProduct = 0.0; + let normA = 0.0; + let normB = 0.0; + for (let i = 0; i < vecA.length; i++) { + dotProduct += vecA[i] * 1.0 * vecB[i]; + normA += vecA[i] * 1.0 * vecA[i]; + normB += vecB[i] * 1.0 * vecB[i]; + } + if (normA === 0 || normB === 0) { + return 0; + } else { + return dotProduct / (Math.sqrt(normA) * Math.sqrt(normB)); + } +} + +async function fetchImage(url) { + const response = await fetch(url); + const arrayBuffer = await response.arrayBuffer(); + const buffer = Buffer.from(arrayBuffer); + return buffer; +} + +async function tryCrossReferencingImageAndText(modelId) { + + const modalities = [Modality.ImageEncoder, Modality.TextEncoder]; + const { configPath, modalityPaths, tokenizerPath } = await getModel( + modelId, + modalities, + hf_token, + '.onnx' + ); + + const imageProcessor = new ImageProcessor(configPath); + await imageProcessor.init(); + const imageEncoder = new ImageEncoder(modalityPaths.image_encoder, imageProcessor); + await imageEncoder.init(); + const textProcessor = new TextProcessor(configPath, tokenizerPath); + await textProcessor.init(); + const textEncoder = new TextEncoder(modalityPaths.text_encoder, textProcessor); + await textEncoder.init(); + + const texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ]; + const imageUrls = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ]; + + const textEmbeddings = []; + const imageEmbeddings = []; + + for (let i = 
0; i < texts.length; i++) { + const text = texts[i]; + const imageUrl = imageUrls[i]; + const imageBuffer = await fetchImage(imageUrl); + + const processedText = await textProcessor.process(text); + const processedImage = await imageProcessor.process(imageBuffer); + + const textEmbedding = await textEncoder.encode(processedText); + const imageEmbedding = await imageEncoder.encode(processedImage); + + textEmbeddings.push(new Float32Array(textEmbedding.embeddings.data)); + imageEmbeddings.push(new Float32Array(imageEmbedding.embeddings.data)); + + // Print-based debugging at its best :) + // console.log(`Text: ${text}, Image: ${imageUrl}`); + // console.log(`Text embedding first components: ${textEmbeddings[i].slice(0, 5)}`); + // console.log(`Image embedding first components: ${imageEmbeddings[i].slice(0, 5)}`); + console.log(`Similarity: ${cosineSimilarity(textEmbeddings[i], imageEmbeddings[i])}`) + } + + for (let i = 0; i < texts.length; i++) { + const pairSimilarity = cosineSimilarity(textEmbeddings[i], imageEmbeddings[i]); + const otherTextSimilarities = textEmbeddings.map((te, idx) => idx === i ? -Infinity : cosineSimilarity(te, imageEmbeddings[i])); + const otherImageSimilarities = imageEmbeddings.map((ie, idx) => idx === i ? -Infinity : cosineSimilarity(textEmbeddings[i], ie)); + + const maxOtherTextSimilarity = Math.max(...otherTextSimilarities); + const maxOtherImageSimilarity = Math.max(...otherImageSimilarities); + + assert(pairSimilarity > maxOtherTextSimilarity, "Text should be more similar to its corresponding image than to other images."); + assert(pairSimilarity > maxOtherImageSimilarity, "Image should be more similar to its corresponding text than to other texts."); + } + + await textEncoder.dispose(); + await imageEncoder.dispose(); +} + +async function testEncoders() { + console.log("- `testEncoders`: Start"); + + try { + + // Go through the bi-modal models + for (const modelId of [ + 'unum-cloud/uform3-image-text-english-small', + // 'unum-cloud/uform3-image-text-english-base', + // 'unum-cloud/uform3-image-text-english-large', + // 'unum-cloud/uform3-image-text-multilingual-base', + ]) { + await tryTextEncoderForwardPass(modelId, hf_token); + await tryImageEncoderForwardPass(modelId, hf_token); + await tryCrossReferencingImageAndText(modelId, hf_token); + } + + console.log("- `testEncoders`: Success"); + } catch (error) { + console.error("- `testEncoders`: Failed", error); + } +} + +testGetCheckpoint(); +testEncoders(); diff --git a/javascript/hub.mjs b/javascript/hub.mjs new file mode 100644 index 0000000..a59fb73 --- /dev/null +++ b/javascript/hub.mjs @@ -0,0 +1,104 @@ +import { join } from "path" +import { createWriteStream, existsSync, mkdirSync, writeFileSync } from "fs"; + +import { downloadFile, listFiles } from "@huggingface/hub"; + +const Modality = { + TextEncoder: "text_encoder", + ImageEncoder: "image_encoder", + VideoEncoder: "video_encoder", + TextDecoder: "text_decoder", +}; + +function isModality(value) { + return Object.values(Modality).includes(value); +} + +function normalizeModalities(modalities) { + return modalities.map(x => { + if (typeof x === "string") { + if (isModality(x)) { + return x; + } else { + throw new Error(`Invalid modality: ${x}`); + } + } + return x; + }); +} + +async function ensureDirectoryExists(dirPath) { + if (!existsSync(dirPath)) { + mkdirSync(dirPath, { recursive: true }); + } +} + +async function getModel(modelId, modalities, token = null, format = '.onnx', saveDir = './models') { + modalities = 
normalizeModalities(modalities); + + const configNames = ['config.json']; + const tokenizerNames = ['tokenizer.json']; + const modelFileNames = modalities.map(modality => `${modality}${format}`); + const allowedPatterns = [...modelFileNames, ...configNames, ...tokenizerNames]; + + const repo = { type: "model", name: modelId }; + const credentials = token ? { accessToken: token } : undefined; + + let configPath = null; + let tokenizerPath = null; + const modalityPaths = {}; + const modelSaveDir = join(saveDir, modelId); + + await ensureDirectoryExists(modelSaveDir); + + const fileIterator = listFiles({ repo, recursive: true, credentials }); + for await (const file of fileIterator) { + const fileName = file.path.split('/').pop(); + if (fileName && allowedPatterns.includes(fileName)) { + const filePath = file.path; + const savePath = join(modelSaveDir, fileName); + + if (configNames.includes(fileName)) { + configPath = savePath; + } else if (tokenizerNames.includes(fileName)) { + tokenizerPath = savePath; + } else { + const modalityName = fileName.split('.')[0]; + modalityPaths[modalityName] = savePath; + } + + const response = await downloadFile({ repo, path: filePath, credentials }); + if (response) { + // HuggingFace might be defining the `env.localModelPath` variable + // to store the downloaded files in a local directory. + // Let's check if the file is there. + // const localPath = join(env.localModelPath, repo, filePath); + // if (existsSync(localPath)) { + // console.log(`File already exists locally at ${localPath}`); + // } + + if (response.body && response.body.pipe) { + const fileStream = createWriteStream(savePath); + response.body.pipe(fileStream); + await new Promise((resolve, reject) => { + fileStream.on('finish', resolve); + fileStream.on('error', reject); + }); + } else if (response.arrayBuffer) { + // Handle non-streamable response for environments like Node.js + const buffer = await response.arrayBuffer(); + writeFileSync(savePath, Buffer.from(buffer)); + } else { + console.error('Unexpected response type'); + } + console.log(`Downloaded ${fileName} successfully to ${savePath}`); + } else { + console.log('No response received for the file download request.'); + } + } + } + + return { configPath, modalityPaths, tokenizerPath }; +} + +export { getModel, Modality }; diff --git a/package.json b/package.json new file mode 100644 index 0000000..948550b --- /dev/null +++ b/package.json @@ -0,0 +1,33 @@ +{ + "name": "uform", + "type": "module", + "private": true, + "version": "2.0.2", + "description": "Pocket-Sized Multimodal AI for Content Understanding and Generation", + "dependencies": { + "@huggingface/hub": "^0.14.8", + "@xenova/transformers": "^2.17.0", + "node-fetch": "^3.3.2", + "onnxruntime-node": "^1.17.0", + "onnxruntime-web": "^1.17.3" + }, + "devDependencies": { + "nodemon": "^2.0.15" + }, + "scripts": { + "start": "node javascript/encoders.mjs", + "test": "node javascript/encoders_test.js" + }, + "main": "javascript/encoders.mjs", + "directories": { + "doc": "docs" + }, + "keywords": [ + "AI", + "multimodal", + "content generation", + "huggingface" + ], + "author": "Ash Vardanian, Unum Cloud", + "license": "Apache-2.0" +} diff --git a/pyproject.toml b/pyproject.toml index 10f7a9b..fef02d3 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,7 +31,8 @@ classifiers = [ dependencies = [ "huggingface_hub>=0.16.4", "tokenizers>=0.13.3", - "pillow" + "pillow", + "simsimd", ] description = "Pocket-Sized Multimodal AI for Content Understanding and Generation" maintainers = 
[
@@ -49,6 +50,7 @@ uform-chat = "uform.chat:main"
 torch = ["torch>=1.13.1", "torchvision", "transformers>=4.36.2"]
 onnx = ["onnx>=1.15.0", "onnxruntime>=1.17.1", "numpy"]
 onnx-gpu = ["onnx>=1.15.0", "onnxruntime-gpu>=1.17.1", "numpy"]
+dev = ["pytest", "pandas"]
 
 [project.urls]
 "Homepage" = "https://github.com/unum-cloud/uform"
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..dd7611d
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,148 @@
+# UForm Python SDK
+
+The UForm multimodal AI SDK offers a simple way to integrate multimodal AI capabilities into your Python applications.
+The SDK doesn't require any deep learning knowledge, PyTorch, or CUDA installation, and can run on almost any hardware.
+
+## Installation
+
+There are several ways to install the UForm Python SDK, depending on the backend you want to use.
+PyTorch is by far the heaviest, but also the most capable.
+ONNX is a lightweight alternative that can run on any CPU and on some GPUs.
+
+```bash
+pip install "uform[torch]"      # For PyTorch
+pip install "uform[onnx]"       # For ONNX on CPU
+pip install "uform[onnx-gpu]"   # For ONNX on GPU, available for some platforms
+pip install "uform[torch,onnx]" # For PyTorch and ONNX Python tests
+```
+
+## Quick Start
+
+### Embeddings
+
+Load the model:
+
+```py
+from uform import get_model, Modality
+
+model_name = 'unum-cloud/uform3-image-text-english-small'
+modalities = [Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER]
+processors, models = get_model(model_name, modalities=modalities)
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+```
+
+Embed images:
+
+```py
+import requests
+from io import BytesIO
+from PIL import Image
+
+image_url = 'https://media-cdn.tripadvisor.com/media/photo-s/1b/28/6b/53/lovely-armenia.jpg'
+image = Image.open(BytesIO(requests.get(image_url).content))
+image_data = processor_image(image)
+image_features, image_embedding = model_image.encode(image_data, return_features=True)
+```
+
+Embed queries:
+
+```py
+text = 'a cityscape bathed in the warm glow of the sun, with varied architecture and a towering, snow-capped mountain rising majestically in the background'
+text_data = processor_text(text)
+text_features, text_embedding = model_text.encode(text_data, return_features=True)
+```
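+
+To compare the embeddings, use cosine similarity.
+Below is a minimal sketch using NumPy, assuming the `image_embedding` and `text_embedding` variables produced by the snippets above; the newly added `simsimd` dependency can compute the same metric faster.
+
+```py
+import numpy as np
+
+
+def to_vector(embedding) -> np.ndarray:
+    # Torch tensors expose `.detach().cpu().numpy()`; other backends are
+    # assumed to already return NumPy-compatible arrays.
+    if hasattr(embedding, "detach"):
+        embedding = embedding.detach().cpu().numpy()
+    return np.asarray(embedding, dtype=np.float32).ravel()
+
+
+image_vector = to_vector(image_embedding)
+text_vector = to_vector(text_embedding)
+
+# L2-normalize both vectors; their dot product is then the cosine similarity.
+image_vector /= np.linalg.norm(image_vector)
+text_vector /= np.linalg.norm(text_vector)
+print(f"Image-text similarity: {float(image_vector @ text_vector):.3f}")
+```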
+
+### Generative Models
+
+UForm generative models are fully compatible with the Hugging Face Transformers library and can be used without installing the UForm library.
+Those models can be used to caption images or power multimodal chat experiences.
+
+```python
+import torch
+from PIL import Image
+from transformers import AutoModel, AutoProcessor
+
+model = AutoModel.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+processor = AutoProcessor.from_pretrained('unum-cloud/uform-gen2-dpo', trust_remote_code=True)
+
+prompt = 'Question or Instruction'
+image = Image.open('image.jpg')
+
+inputs = processor(text=[prompt], images=[image], return_tensors='pt')
+
+with torch.inference_mode():
+    output = model.generate(
+        **inputs,
+        do_sample=False,
+        use_cache=True,
+        max_new_tokens=256,
+        eos_token_id=151645,
+        pad_token_id=processor.tokenizer.pad_token_id
+    )
+prompt_len = inputs['input_ids'].shape[1]
+decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
+```
+
+You can check examples of different prompts in our demo Gradio spaces on HuggingFace:
+
+- for [`uform-gen2-qwen-500m`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-demo)
+- for [`uform-gen2-dpo`](https://huggingface.co/spaces/unum-cloud/uform-gen2-qwen-500m-dpo-demo)
+
+## Technical Details
+
+### Multi-GPU Parallelism
+
+To achieve higher throughput, you can launch UForm on multiple GPUs.
+For that, pick the encoders of the model you want to run in parallel and wrap them in `nn.DataParallel` (or `nn.DistributedDataParallel`).
+
+```python
+from uform import get_model, Modality
+import torch.nn as nn
+
+processors, models = get_model('unum-cloud/uform3-image-text-english-small', backend='torch')
+
+model_text = models[Modality.TEXT_ENCODER]
+model_image = models[Modality.IMAGE_ENCODER]
+processor_text = processors[Modality.TEXT_ENCODER]
+processor_image = processors[Modality.IMAGE_ENCODER]
+
+model_text.return_features = False
+model_image.return_features = False
+model_text_parallel = nn.DataParallel(model_text)
+model_image_parallel = nn.DataParallel(model_image)
+```
+
+Since we are now dealing with the PyTorch wrappers, make sure to use the `forward` method (instead of `encode`) to get the embeddings, and the `.detach().cpu().numpy()` sequence to bring the data back to more Pythonic NumPy arrays.
+
+```python
+from typing import List
+
+from PIL import Image
+
+def get_image_embedding(images: List[Image.Image]):
+    preprocessed = processor_image(images)
+    embedding = model_image_parallel.forward(preprocessed)
+    return embedding.detach().cpu().numpy()
+
+def get_text_embedding(texts: List[str]):
+    preprocessed = processor_text(texts)
+    embedding = model_text_parallel.forward(preprocessed)
+    return embedding.detach().cpu().numpy()
+```
+
+### ONNX and CUDA
+
+The configuration process may include a few additional steps, depending on the environment.
+When using the CUDA and TensorRT backends with CUDA 12 or newer, make sure to [install the Nvidia toolkit][install-nvidia-toolkit] and the `onnxruntime-gpu` package from the custom repository.
+ +```sh +wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb +sudo dpkg -i cuda-keyring_1.1-1_all.deb +sudo apt-get update +sudo apt-get -y install cuda-toolkit-12 +pip install onnxruntime-gpu --extra-index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/onnxruntime-cuda-12/pypi/simple/ +export CUDA_PATH="/usr/local/cuda-12/bin" +export PATH="/usr/local/cuda-12/bin${PATH:+:${PATH}}" +export LD_LIBRARY_PATH="/usr/local/cuda-12/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" +pytest python/scripts/ -s -x -Wd -v -k onnx +``` + +[install-nvidia-toolkit]: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/#network-repo-installation-for-ubuntu diff --git a/python/scripts/bench.py b/python/scripts/bench_decoders.py similarity index 60% rename from python/scripts/bench.py rename to python/scripts/bench_decoders.py index 49c7004..0842ba9 100644 --- a/python/scripts/bench.py +++ b/python/scripts/bench_decoders.py @@ -1,6 +1,8 @@ from functools import partial from time import perf_counter +from dataclasses import dataclass from typing import List +import argparse import requests import torch @@ -10,18 +12,38 @@ InstructBlipForConditionalGeneration, InstructBlipProcessor, LlavaForConditionalGeneration, + AutoModel, + AutoProcessor, ) -from uform import get_model -from uform.gen_model import VLMForCausalLM, VLMProcessor +from uform.torch_decoders import VLMForCausalLM, VLMProcessor dtype = torch.bfloat16 low_cpu_mem_usage = False device = "cuda:0" -def caption(model, processor, prompt: str, image: Image.Image) -> str: - inputs = processor(prompt, image, return_tensors="pt") +@dataclass +class BenchmarkResult: + model_name: str + device_name: str + backend_name: str + duration_image_preprocessing: float + duration_image_embedding: float + duration_text_preprocessing: float + duration_text_embedding: float + + +def caption(model, processor, prompt: str, image: Image.Image, max_length: int, batch_size: int) -> List[str]: + # BLIP models require the prompt to be the first argument + prompt = [prompt] * batch_size + image = [image] * batch_size + try: + inputs = processor(prompt, image, return_tensors="pt") + except ValueError: + inputs = processor(image, prompt, return_tensors="pt") + + # Downcast and move to device for possible_key in ["images", "pixel_values"]: if possible_key not in inputs: continue @@ -33,19 +55,20 @@ def caption(model, processor, prompt: str, image: Image.Image) -> str: **inputs, do_sample=False, # use_cache=True, - max_new_tokens=128, + max_new_tokens=max_length, eos_token_id=32001, pad_token_id=processor.tokenizer.pad_token_id, ) prompt_len = inputs["input_ids"].shape[1] - decoded_text = processor.batch_decode( + decoded_texts = processor.batch_decode( output[:, prompt_len:], skip_special_tokens=True, - )[0].strip() - return decoded_text + ) + return decoded_texts def duration(callable): + """Profile the duration of a callable and return the duration and the result.""" start = perf_counter() result = callable() stop = perf_counter() @@ -57,49 +80,35 @@ def bench_captions( processor, prompt: str, images: List[Image.Image], + max_length: int = 256, + batch_size: int = 10, ) -> List[str]: total_duration = 0 total_length = 0 model = torch.compile(model) - def caption_image(image, model=model, processor=processor, prompt=prompt): - return caption(model=model, processor=processor, prompt=prompt, image=image) + def caption_image(image): + return caption( + model=model, + processor=processor, + prompt=prompt, 
+ image=image, + max_length=max_length, + batch_size=batch_size, + ) for image in images: - seconds, text = duration(partial(caption_image, image=image)) + seconds, captions = duration(partial(caption_image, image=image)) total_duration += seconds - total_length += len(text) + total_length += len(captions.strip()) if isinstance(captions, str) else sum(len(t.strip()) for t in captions) del model del processor print(f"Throughput: {total_length/total_duration:.2f} tokens/s") -def bench_image_embeddings(model, images): - total_duration = 0 - total_embeddings = 0 - images *= 10 - while total_duration < 10: - seconds, embeddings = duration(lambda: model.encode_image(processor.preprocess_image(images))) - total_duration += seconds - total_embeddings += len(embeddings) - - print(f"Throughput: {total_embeddings/total_duration:.2f} images/s") - - -def bench_text_embeddings(model, texts): - total_duration = 0 - total_embeddings = 0 - texts *= 10 - while total_duration < 10: - seconds, embeddings = duration(lambda: model.encode_text(processor.preprocess_text(texts))) - total_duration += seconds - total_embeddings += len(embeddings) - - print(f"Throughput: {total_embeddings/total_duration:.2f} queries/s") - +def main(batch_size: int = 10, max_length: int = 256): -if __name__ == "__main__": image_urls = [ "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", "https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", @@ -116,18 +125,40 @@ def bench_text_embeddings(model, texts): "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", ] + print("UForm-Gen2") + bench_captions( + model=AutoModel.from_pretrained( + "unum-cloud/uform-gen2-dpo", + trust_remote_code=True, + torch_dtype=dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ignore_mismatched_sizes=True, + ).to(device), + processor=AutoProcessor.from_pretrained( + "unum-cloud/uform-gen2-dpo", + trust_remote_code=True, + ), + prompt="Describe the picture in great detail", + images=images, + batch_size=batch_size, + max_length=max_length, + ) + print("UForm-Gen") bench_captions( model=VLMForCausalLM.from_pretrained( "unum-cloud/uform-gen", torch_dtype=dtype, low_cpu_mem_usage=low_cpu_mem_usage, + ignore_mismatched_sizes=True, ).to(device), processor=VLMProcessor.from_pretrained( "unum-cloud/uform-gen", ), prompt="[cap] Summarize the visual content of the image.", images=images, + batch_size=batch_size, + max_length=max_length, ) print("LLaVA") @@ -142,6 +173,8 @@ def bench_text_embeddings(model, texts): ), prompt="USER: \nWhat are these?\nASSISTANT:", images=images, + batch_size=batch_size, + max_length=max_length, ) print("InstructBLIP") @@ -156,12 +189,26 @@ def bench_text_embeddings(model, texts): ), prompt="Summarize the visual content of the image.", images=images, + batch_size=batch_size, + max_length=max_length, ) - print("UForm-English") - bench_image_embeddings(get_model("unum-cloud/uform-vl-english"), images) - bench_text_embeddings(get_model("unum-cloud/uform-vl-english"), captions) - print("UForm-Multilingual") - bench_image_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), images) - bench_text_embeddings(get_model("unum-cloud/uform-vl-multilingual-v2"), captions) +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + 
parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=10,
+        help="Batch size for the benchmark. Batch size 1 measures latency. Large batch sizes may not fit on every GPU.",
+    )
+    parser.add_argument(
+        "--max-length",
+        type=int,
+        default=256,
+        help="Maximum length of the generated text in tokens.",
+    )
+    args = parser.parse_args()
+
+    main(batch_size=args.batch_size, max_length=args.max_length)
diff --git a/python/scripts/bench_encoders.py b/python/scripts/bench_encoders.py
new file mode 100644
index 0000000..b237126
--- /dev/null
+++ b/python/scripts/bench_encoders.py
@@ -0,0 +1,274 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+This script measures the throughput of UForm multimodal embedding models.
+
+The output of the script covers:
+    - Time to preprocess an image, and throughput in images/s.
+    - Time to tokenize the text, and throughput in queries/s.
+    - Time to encode the image, and throughput in images/s.
+    - Time to encode the text, and throughput in queries/s.
+    - Share of time spent on each part of the pipeline.
+
+Those numbers are reported for every model, device (cpu or gpu), backend (torch or onnx),
+and precision (float32 or bfloat16), producing a fairly comprehensive benchmark.
+
+Before running the script, install all available packages via `pip install -e ".[torch,onnx,onnx-gpu]"`.
+Before printing the numbers, a warm-up is performed to ensure the model is loaded and the cache is filled.
+"""
+
+from functools import partial
+from time import perf_counter
+from dataclasses import dataclass
+from typing import List, Tuple, Literal, Callable, Generator
+import re
+import argparse
+
+import requests
+from PIL import Image
+import pandas as pd
+
+from uform import get_model, Modality, ExecutionProviderError
+
+# Define global constants for the hardware availability
+torch_available = False
+try:
+    import torch
+
+    torch_available = True
+except ImportError:
+    pass
+onnx_available = False
+try:
+    import onnx
+
+    onnx_available = True
+except ImportError:
+    pass
+cuda_available = False
+try:
+    if torch_available:
+        cuda_available = torch.cuda.is_available()
+    elif onnx_available:
+        import onnxruntime
+
+        cuda_available = onnxruntime.get_device() == "GPU"
+except ImportError:
+    pass
+
+
+@dataclass
+class BenchmarkResult:
+    model_name: str
+    device_name: Literal["cpu", "cuda"] = "cpu"
+    backend_name: Literal["torch", "onnx"] = "torch"
+    duration_image_preprocessing: float = 0
+    duration_image_embedding: float = 0
+    duration_text_preprocessing: float = 0
+    duration_text_embedding: float = 0
+
+
+def duration(callable, synchronize=False):
+    """Profile the duration of a callable and return the duration and the result."""
+    if synchronize and torch_available and cuda_available:
+        torch.cuda.synchronize()  # Wait for CUDA operations to complete
+    start = perf_counter()
+    result = callable()
+    if synchronize and torch_available and cuda_available:
+        torch.cuda.synchronize()  # Ensure all CUDA kernels have finished
+    stop = perf_counter()
+    return stop - start, result
+
+
+def get_captioned_images() -> List[Tuple[Image.Image, str]]:
+    """Get a list of pre-downloaded and decoded images and their captions."""
+    image_urls = [
+        "https://images.unsplash.com/photo-1697665666330-7acf230fa830?q=80&w=2787&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D",
+        
"https://images.unsplash.com/photo-1695653422543-7da6d6744364?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDF8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://images.unsplash.com/photo-1703244551371-ecffad9cc3b6?q=80&w=2859&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://plus.unsplash.com/premium_photo-1702910931866-2642eee270b1?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + "https://plus.unsplash.com/premium_photo-1700583712241-893aded49e69?q=80&w=2942&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D", + ] + images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] + captions = [ + "lonely house in a beautiful valley. house is made of white wood and black bricks. its surrounded by a green field", + "grab last-mile delivery driver on a scooter grabbing a delivery in Jakarta", + "monochrome picture of new york in the late 2th century on a sunny day, showing a few canonical brick buildings and the citizens bank", + "asian girl sleeping in a bed. top down view", + "a few food containers, with past, corn, olives, and sliced red & green peppers, with a man pouring sous on top of it", + ] + return list(zip(images, captions)) + + +def yield_benchmarks(batch_size: int) -> Generator[Tuple[BenchmarkResult, Callable], None, None]: + """Yields callable benchmarks for all supported backends of the given model.""" + + # Pull the content and artificially grow the batch size + images, captions = zip(*get_captioned_images()) + + if len(images) < batch_size: + import math + + multiplier = int(math.ceil(batch_size / len(images))) + images *= multiplier + captions *= multiplier + images = images[:batch_size] + captions = captions[:batch_size] + + def run(model_name: str, device: str, backend_name: str): + result = BenchmarkResult( + model_name=model_name, + backend_name=backend_name, + device_name=device, + duration_image_preprocessing=0, + duration_image_embedding=0, + duration_text_preprocessing=0, + duration_text_embedding=0, + ) + + sync = backend_name == "torch" + processors, models = get_model( + model_name, + device=device, + modalities=[Modality.IMAGE_ENCODER, Modality.TEXT_ENCODER], + backend=backend_name, + ) + + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + + # Image preprocessing + total_duration = 0 + total_iterations = 0 + while total_duration < 10 and total_iterations < 100: + seconds, _ = duration(lambda: processor_image(images)) + total_duration += seconds + total_iterations += len(images) + duration_per_iteration = total_duration / total_iterations + result.duration_image_preprocessing = duration_per_iteration + + # Image embedding + total_duration = 0 + total_iterations = 0 + while total_duration < 10 and total_iterations < 100: + images_data = processor_image(images) + seconds, _ = duration(lambda: model_image.encode(images_data), synchronize=sync) + total_duration += seconds + total_iterations += len(images) + duration_per_iteration = total_duration / total_iterations + result.duration_image_embedding = duration_per_iteration + + # Text preprocessing + total_duration = 0 + total_iterations = 0 + while total_duration < 10 and total_iterations < 100: + seconds, _ = duration(lambda: processor_text(captions)) + total_duration += 
seconds + total_iterations += len(captions) + duration_per_iteration = total_duration / total_iterations + result.duration_text_preprocessing = duration_per_iteration + + # Text embedding + total_duration = 0 + total_iterations = 0 + while total_duration < 10 and total_iterations < 100: + texts_data = processor_text(captions) + seconds, _ = duration(lambda: model_text.encode(texts_data), synchronize=sync) + total_duration += seconds + total_iterations += len(captions) + duration_per_iteration = total_duration / total_iterations + result.duration_text_embedding = duration_per_iteration + + return result + + devices = ["cpu"] + if cuda_available: + devices.append("cuda") + backends = [] + if torch_available: + backends.append("torch") + if onnx_available: + backends.append("onnx") + + for device in devices: + for backend_name in backends: + for model_name in [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", + ]: + yield BenchmarkResult( + model_name=model_name, + device_name=device, + backend_name=backend_name, + ), partial(run, model_name, device, backend_name) + + +def main(filter_out: str = None, batch_size: int = 10): + results = [] + filter_pattern = re.compile(filter_out) if filter_out else None + for specs, func in yield_benchmarks(batch_size=batch_size): + if filter_pattern and ( + filter_pattern.search(specs.model_name) + or filter_pattern.search(specs.backend_name) + or filter_pattern.search(specs.device_name) + ): + continue + + try: + print(f"Running `{specs.model_name}` on `{specs.device_name}` using `{specs.backend_name}` backend") + result = func() + results.append(result) + except ExecutionProviderError as e: + print(f"- skipping missing backend") + print(e) + + results = sorted(results, key=lambda x: x.model_name) + results = [x.__dict__ for x in results] + + df = pd.DataFrame(results) + df.columns = [ + "Model Name", + "Device", + "Backend", + "Images Preprocessed/s", + "Images Encoded/s", + "Texts Preprocessed/s", + "Texts Encoded/s", + ] + + def inverse(x): + return 1 / x if x != 0 else 0 + + # Apply number formatting directly in the DataFrame + formatted_df = df.copy() + formatted_df["Images Preprocessed/s"] = df["Images Preprocessed/s"].map(inverse).map("{:,.2f}".format) + formatted_df["Images Encoded/s"] = df["Images Encoded/s"].map(inverse).map("{:,.2f}".format) + formatted_df["Texts Preprocessed/s"] = df["Texts Preprocessed/s"].map(inverse).map("{:,.2f}".format) + formatted_df["Texts Encoded/s"] = df["Texts Encoded/s"].map(inverse).map("{:,.2f}".format) + + # Convert formatted DataFrame to Markdown + print(formatted_df.to_markdown()) + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument( + "--filter-out", + type=str, + default=None, + help="Filter out models, backends, or devices with a Regular Expression.", + ) + parser.add_argument( + "--batch-size", + type=int, + default=10, + help="Batch size for the benchmark. Batch size 1 measures latency. 
Large batch sizes may not fit on every GPU.", + ) + args = parser.parse_args() + + main(filter_out=args.filter_out, batch_size=args.batch_size) diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb deleted file mode 100644 index ce8cf10..0000000 --- a/python/scripts/export.ipynb +++ /dev/null @@ -1,666 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Scripts for Exporting PyTorch Models to ONNX and CoreML" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "!pip install --upgrade \"uform[torch]\" coremltools" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", - " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", - " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", - " warn(f\"Failed to load image Python extension: {e}\")\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fadffc0299c04e249fd4f7a5b40ba0af", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Fetching 5 files: 0%| | 0/5 [00:00 MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 passes/s]\n" - ] - } - ], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[image_input], outputs=[image_features, image_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", - "\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "TextEncoder(\n", - " original_name=TextEncoder\n", - " (word_embeddings): Embedding(original_name=Embedding)\n", - " (position_embeddings): Embedding(original_name=Embedding)\n", - " (layer_norm): LayerNorm(original_name=LayerNorm)\n", - " (dropout): Dropout(original_name=Dropout)\n", - " (blocks): ModuleList(\n", - " original_name=ModuleList\n", - " (0): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): 
Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (1): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (2): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " (3): TextEncoderBlock(\n", - " original_name=TextEncoderBlock\n", - " (norm_attn): LayerNorm(original_name=LayerNorm)\n", - " (attention): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", - " (crossattn): Attention(\n", - " original_name=Attention\n", - " (query): Linear(original_name=Linear)\n", - " (key): Linear(original_name=Linear)\n", - " (value): Linear(original_name=Linear)\n", - " (out): Linear(original_name=Linear)\n", - " )\n", - " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", - " (mlp): MLP(\n", - " original_name=MLP\n", - " (hidden_layer): Linear(original_name=Linear)\n", - " (output_layer): Linear(original_name=Linear)\n", - " )\n", - " (dropout): Dropout(original_name=Dropout)\n", - " )\n", - " )\n", - " (embedding_projection): Linear(original_name=Linear)\n", - " (matching_head): Linear(original_name=Linear)\n", - " (context_projection): Linear(original_name=Linear)\n", - ")" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "module = model.text_encoder\n", - "module.eval()\n", - "module.return_features = True\n", - "\n", - "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", - "traced_script_module" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Tuple detected at graph output. 
This will be flattened in the converted model.\n", - "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00 MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n", - "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n", - "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n", - "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n" - ] - } - ], - "source": [ - "coreml_model = ct.convert(\n", - " traced_script_module, source=\"pytorch\",\n", - " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", - " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", - "\n", - "coreml_model.author = 'Unum Cloud'\n", - "coreml_model.license = 'Apache 2.0'\n", - "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", - "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/python/scripts/export_decoders.ipynb b/python/scripts/export_decoders.ipynb new file mode 100644 index 0000000..26e463b --- /dev/null +++ b/python/scripts/export_decoders.ipynb @@ -0,0 +1,91 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", + "\n", + "Depending on the backend, we prefer different qunatization schemes.\n", + "\n", + "- For ONNX we use `uint8` quantization.\n", + "- For PyTorch we use `bfloat16` quantization.\n", + "- For CoreML we use `float32` representation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade \"uform[torch]\" coremltools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "model_name = \"unum-cloud/uform-gen2-dpo\"\n", + "output_directory = \"../../\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import uform\n", + "from PIL import Image\n", + "from transformers import AutoModel, AutoProcessor\n", + "\n", + "model = AutoModel.from_pretrained(model_name, trust_remote_code=True)\n", + "processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)\n", + "\n", + "prompt = 'Describe the picture'\n", + "image = Image.open('../../assets/unum.png')\n", + "inputs = processor(text=[prompt], images=[image], return_tensors='pt')\n", + "\n", + "with torch.inference_mode():\n", + " output = model.generate(\n", + " **inputs,\n", + " do_sample=False,\n", + " use_cache=True,\n", + " max_new_tokens=256,\n", + " eos_token_id=151645,\n", + " pad_token_id=processor.tokenizer.pad_token_id\n", + " )\n", + "prompt_len = inputs['input_ids'].shape[1]\n", + "decoded_text = processor.batch_decode(output[:, prompt_len:])[0]\n", + "\n", + "print(decoded_text)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/scripts/export_encoders.ipynb b/python/scripts/export_encoders.ipynb new file mode 100644 index 0000000..a8b868d --- /dev/null +++ b/python/scripts/export_encoders.ipynb @@ -0,0 +1,681 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scripts for Exporting PyTorch Models to ONNX and CoreML\n", + "\n", + "Depending on the backend, we prefer different qunatization schemes.\n", + "\n", + "- For ONNX we use `uint8` quantization.\n", + "- For PyTorch we use `bfloat16` quantization.\n", + "- For CoreML we use `float32` representation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade \"uform[torch]\" coremltools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "working_directory = \"../..\"\n", + "model_name = \"uform3-image-text-english-small\"\n", + "model_directory = os.path.join(working_directory, \"models\", model_name)\n", + "model_weights_path = os.path.join(model_directory, \"torch_weight.pt\")\n", + "config_path = os.path.join(model_directory, \"config.json\")\n", + "tokenizer_path = os.path.join(model_directory, \"tokenizer.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "state_dict = torch.load(model_weights_path)\n", + "list(state_dict.keys())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from uform.torch_encoders import ImageEncoder, TextEncoder\n", + "from uform.torch_processors import ImageProcessor, TextProcessor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_encoder = ImageEncoder.from_pretrained(config_path, state_dict)\n", + "text_encoder = TextEncoder.from_pretrained(config_path, state_dict)\n", + "image_encoder, text_encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_processor = TextProcessor(config_path, tokenizer_path)\n", + "image_processor = ImageProcessor(config_path)\n", + "text_processor, image_processor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uform\n", + "from PIL import Image\n", + "\n", + "text = 'a small red panda in a zoo'\n", + "image = Image.open('../../assets/unum.png')\n", + "\n", + "text_data = text_processor(text)\n", + "image_data = image_processor(image)\n", + "\n", + "image_features, image_embedding = image_encoder.forward(image_data, return_features=True)\n", + "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import coremltools as ct\n", + "import torch" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "precision = ct.precision.FLOAT32" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "CoreML Tools provides a way to convert ONNX models to CoreML models. This script demonstrates how to convert an ONNX model to a CoreML model. For that, we need to provide an example input, and the tensor shapes will be inferred from that.\n", + "\n", + "```python\n", + " image_input = ct.TensorType(name=\"images\", shape=image_data.shape)\n", + " text_input = ct.TensorType(name=\"input_ids\", shape=text_data[\"input_ids\"].shape)\n", + " text_attention_input = ct.TensorType(name=\"attention_mask\", shape=text_data[\"attention_mask\"].shape)\n", + "```\n", + "\n", + "That, however, will only work for batch-size one. 
To support larger batches, we need to override the input shapes.\n", + "\n", + "```python\n", + " ct.RangeDim(lower_bound=25, upper_bound=100, default=45)\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def generalize_first_dimensions(input_shape, upper_bound=64):\n", + " if upper_bound == 1:\n", + " return input_shape\n", + " input_shape = (ct.RangeDim(lower_bound=1, upper_bound=upper_bound, default=1),) + input_shape[1:]\n", + " return input_shape\n", + "\n", + "generalize_first_dimensions(image_data[\"images\"].shape), generalize_first_dimensions(text_data[\"input_ids\"].shape), generalize_first_dimensions(text_data[\"attention_mask\"].shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_input = ct.TensorType(name=\"images\", shape=generalize_first_dimensions(image_data[\"images\"].shape, 1))\n", + "text_input = ct.TensorType(name=\"input_ids\", shape=generalize_first_dimensions(text_data[\"input_ids\"].shape, 1))\n", + "text_attention_input = ct.TensorType(name=\"attention_mask\", shape=generalize_first_dimensions(text_data[\"attention_mask\"].shape, 1))\n", + "text_features = ct.TensorType(name=\"features\")\n", + "text_embeddings = ct.TensorType(name=\"embeddings\")\n", + "image_features = ct.TensorType(name=\"features\")\n", + "image_embeddings = ct.TensorType(name=\"embeddings\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=image_data[\"images\"])\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[image_input], outputs=[image_features, image_embeddings],\n", + " convert_to='mlprogram', compute_precision=precision)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(os.path.join(model_directory, \"image_encoder.mlpackage\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", + " convert_to='mlprogram', compute_precision=precision)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(os.path.join(model_directory, \"text_encoder.mlpackage\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PyTorch\n", + "\n", + "Let's ensure:\n", + "\n", + "- the `model.text_encoder` inputs are called `input_ids` 
and `attention_mask`, and outputs are `embeddings` and `features`.\n", + "- the `model.image_encoder` input is called `input`, and outputs are `embeddings` and `features`.\n", + "- the model itself works fine in `f16` half-precision, so that the model is lighter and easier to download." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from safetensors import safe_open\n", + "from safetensors.torch import save_file" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_encoder.eval()\n", + "image_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.pt\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(image_encoder.state_dict(), os.path.join(model_directory, \"image_encoder.safetensors\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text_encoder.eval()\n", + "text_encoder.to(dtype=torch.bfloat16)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "torch.save(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.pt\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "save_file(text_encoder.state_dict(), os.path.join(model_directory, \"text_encoder.safetensors\"))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "image_features, image_embedding = image_encoder.forward(image_data[\"images\"].to(dtype=torch.bfloat16), return_features=True)\n", + "text_features, text_embedding = text_encoder.forward(text_data, return_features=True)\n", + "\n", + "image_features.shape, text_features.shape, image_embedding.shape, text_embedding.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## ONNX" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install onnx onnxconverter-common" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from torch.onnx import export as onnx_export\n", + "import torch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can't immediately export to `bfloat16` as it's not supported by ONNX, but we also can't export to `float16`, as the forward pass (that will be traced) is gonna fail. So let's export to `float32` ONNX file first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "onnx_export(\n", + " module,\n", + " (text_data[\"input_ids\"], text_data[\"attention_mask\"]), \n", + " os.path.join(model_directory, \"text_encoder.onnx\"), \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['input_ids', 'attention_mask'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'input_ids' : {0 : 'batch_size'}, \n", + " 'attention_mask' : {0 : 'batch_size'}, \n", + " 'features' : {0 : 'batch_size'}, \n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now repeat the same for images." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module = image_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "module.to(dtype=torch.float32)\n", + "\n", + "torch.onnx.export(\n", + " module,\n", + " image_data[\"images\"], \n", + " os.path.join(model_directory, \"image_encoder.onnx\"), \n", + " export_params=True,\n", + " opset_version=15,\n", + " do_constant_folding=True,\n", + " input_names = ['images'], \n", + " output_names = ['features', 'embeddings'],\n", + " dynamic_axes={\n", + " 'images' : {0 : 'batch_size'},\n", + " 'features' : {0 : 'batch_size'},\n", + " 'embeddings' : {0 : 'batch_size'}})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantizing to `float16`\n", + "\n", + "Let's use [additional ONNX tooling](https://onnxruntime.ai/docs/performance/model-optimizations/float16.html#mixed-precision) to convert to half-precision." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "from onnxconverter_common import float16" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, module_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "module_fp16 = float16.convert_float_to_float16(module)\n", + "onnx.save(module_fp16, module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Quantizing to `uint8`\n", + "\n", + "We can further quantize the model into `uint8` using ONNX quantization tools.\n", + "The `int8` is default variant, but [some of the operators don't support it](https://github.com/microsoft/onnxruntime/issues/15888)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from onnxruntime.quantization import quantize_dynamic, QuantType" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", + "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", + "quantize_dynamic(module_path, module_path, weight_type=QuantType.QUInt8)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's make sure that all the text inputs are integers of identical type - `int32`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "import os\n", + "from onnx import helper\n", + "\n", + "# Load the ONNX model\n", + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", + "module = onnx.load(module_path)\n", + "\n", + "# Get the module's graph\n", + "graph = module.graph\n", + "\n", + "# Iterate through the inputs and update the data type of `input_ids`\n", + "for input_tensor in graph.input:\n", + " # Check if this is the tensor we want to change\n", + " if input_tensor.name == 'input_ids' or input_tensor.name == 'attention_mask':\n", + " # Get the tensor type information\n", + " tensor_type = input_tensor.type.tensor_type\n", + " # Set the element type to INT32 (int32's enum value in onnx is 6)\n", + " tensor_type.elem_type = onnx.TensorProto.INT32\n", + "\n", + "# Optionally, check that the module is still valid\n", + "onnx.checker.check_model(module)\n", + "\n", + "# Save the modified module\n", + "onnx.save(module, module_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the following function to print and validate the input and output types of the ONNX model files." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_model_inputs_and_outputs(onnx_model_path):\n", + " model = onnx.load(onnx_model_path)\n", + "\n", + " # Get the model's graph\n", + " graph = model.graph\n", + "\n", + " # Print input information\n", + " print(\"Model Inputs:\")\n", + " for input_tensor in graph.input:\n", + " tensor_type = input_tensor.type.tensor_type\n", + " # Get the element type (data type)\n", + " elem_type = tensor_type.elem_type\n", + " # Convert numeric type to readable format\n", + " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", + " # Get tensor shape\n", + " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", + " print(f\"Name: {input_tensor.name}, Type: {readable_type}, Shape: {shape}\")\n", + "\n", + " # Print output information similarly if needed\n", + " print(\"\\nModel Outputs:\")\n", + " for output_tensor in graph.output:\n", + " tensor_type = output_tensor.type.tensor_type\n", + " elem_type = tensor_type.elem_type\n", + " readable_type = onnx.TensorProto.DataType.Name(elem_type)\n", + " shape = [dim.dim_value for dim in tensor_type.shape.dim]\n", + " print(f\"Name: {output_tensor.name}, Type: {readable_type}, Shape: {shape}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's check that the runtime can actually load those models." 
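The inspection helper above is defined but never called in the remaining cells. A typical invocation, reusing `model_directory` from earlier, looks like this; after the patch above, the text encoder should report `INT32` for both `input_ids` and `attention_mask`.

```python
# Example invocation of the helper defined above (not present in the original cells)
print_model_inputs_and_outputs(os.path.join(model_directory, "text_encoder.onnx"))
print_model_inputs_and_outputs(os.path.join(model_directory, "image_encoder.onnx"))
```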
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnxruntime as ort\n", + "session_options = ort.SessionOptions()\n", + "session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(model_directory, \"text_encoder.onnx\")\n", + "session = ort.InferenceSession(module_path, sess_options=session_options)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "module_path = os.path.join(model_directory, \"image_encoder.onnx\")\n", + "session = ort.InferenceSession(module_path, sess_options=session_options)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Upload to Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../models/uform3-image-text-english-small/ . --exclude=\"torch_weight.pt\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.onnx image_encoder.onnx\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.onnx text_encoder.onnx\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.safetensors image_encoder.safetensors\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.safetensors text_encoder.safetensors\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../image_encoder.pt image_encoder.pt\n", + "!huggingface-cli upload unum-cloud/uform3-image-text-english-small ../../text_encoder.pt text_encoder.pt" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/scripts/test_generative.py b/python/scripts/test_decoders.py similarity index 100% rename from python/scripts/test_generative.py rename to python/scripts/test_decoders.py diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_embeddings.py deleted file mode 100644 index d71bf0b..0000000 --- a/python/scripts/test_embeddings.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import Tuple - -import pytest -from PIL import Image -import uform - -# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed -try: - import torch - - torch_available = True -except: - torch_available = False - -# ONNX is not a very light dependency either -try: - import onnx - - onnx_available = True -except: - onnx_available = False - -torch_models = [ - "unum-cloud/uform-vl-english", - "unum-cloud/uform-vl-multilingual-v2", -] - -onnx_models_and_providers = [ - ("unum-cloud/uform-vl-english-small", "cpu", "fp32"), - ("unum-cloud/uform-vl-english-large", "cpu", "fp32"), - ("unum-cloud/uform-vl-english-small", "gpu", "fp32"), - ("unum-cloud/uform-vl-english-large", "gpu", 
"fp32"), - ("unum-cloud/uform-vl-english-small", "gpu", "fp16"), - ("unum-cloud/uform-vl-english-large", "gpu", "fp16"), -] - - -@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") -@pytest.mark.parametrize("model_name", torch_models) -def test_torch_one_embedding(model_name: str): - model, processor = uform.get_model(model_name) - text = "a small red panda in a zoo" - image_path = "assets/unum.png" - - image = Image.open(image_path) - image_data = processor.preprocess_image(image) - text_data = processor.preprocess_text(text) - - image_features, image_embedding = model.encode_image(image_data, return_features=True) - text_features, text_embedding = model.encode_text(text_data, return_features=True) - - assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" - assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - - # Test reranking - score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data["attention_mask"], - return_scores=True, - ) - assert score.shape[0] == 1, "Matching score batch size is not 1" - assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" - - -@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") -@pytest.mark.parametrize("model_name", torch_models) -@pytest.mark.parametrize("batch_size", [1, 2]) -def test_torch_many_embeddings(model_name: str, batch_size: int): - model, processor = uform.get_model(model_name) - texts = ["a small red panda in a zoo"] * batch_size - image_paths = ["assets/unum.png"] * batch_size - - images = [Image.open(path) for path in image_paths] - image_data = processor.preprocess_image(images) - text_data = processor.preprocess_text(texts) - - image_embeddings = model.encode_image(image_data, return_features=False) - text_embeddings = model.encode_text(text_data, return_features=False) - - assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" - assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" - - -@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) -def test_onnx_one_embedding(model_specs: Tuple[str, str, str]): - - from uform.onnx_models import ExecutionProviderError - - try: - - model, processor = uform.get_model_onnx(*model_specs) - text = "a small red panda in a zoo" - image_path = "assets/unum.png" - - image = Image.open(image_path) - image_data = processor.preprocess_image(image) - text_data = processor.preprocess_text(text) - - image_features, image_embedding = model.encode_image(image_data, return_features=True) - text_features, text_embedding = model.encode_text(text_data, return_features=True) - - assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" - assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" - - score, joint_embedding = model.encode_multimodal( - image_features=image_features, - text_features=text_features, - attention_mask=text_data["attention_mask"], - return_scores=True, - ) - assert score.shape[0] == 1, "Matching score batch size is not 1" - assert joint_embedding.shape[0] == 1, "Joint embedding batch size is not 1" - - except ExecutionProviderError as e: - pytest.skip(f"Execution provider error: {e}") - - -@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") -@pytest.mark.parametrize("model_specs", onnx_models_and_providers) 
-@pytest.mark.parametrize("batch_size", [1, 2]) -def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int): - - from uform.onnx_models import ExecutionProviderError - - try: - - model, processor = uform.get_model_onnx(*model_specs) - texts = ["a small red panda in a zoo"] * batch_size - image_paths = ["assets/unum.png"] * batch_size - - images = [Image.open(path) for path in image_paths] - image_data = processor.preprocess_image(images) - text_data = processor.preprocess_text(texts) - - image_embeddings = model.encode_image(image_data, return_features=False) - text_embeddings = model.encode_text(text_data, return_features=False) - - assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" - assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" - - except ExecutionProviderError as e: - pytest.skip(f"Execution provider error: {e}") - - -if __name__ == "__main__": - pytest.main(["-s", "-x", __file__]) diff --git a/python/scripts/test_encoders.py b/python/scripts/test_encoders.py new file mode 100644 index 0000000..20caed2 --- /dev/null +++ b/python/scripts/test_encoders.py @@ -0,0 +1,292 @@ +from functools import wraps +from typing import Tuple +import requests +from io import BytesIO +import os + +import pytest +import numpy as np +from PIL import Image + +from uform import Modality, get_model, ExecutionProviderError + +# PyTorch is a very heavy dependency, so we may want to skip these tests if it's not installed +try: + import torch + + torch_available = True +except: + torch_available = False + +# ONNX is not a very light dependency either +try: + import onnx + + onnx_available = True +except: + onnx_available = False + +torch_models = [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", +] + +onnx_models = [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", +] + +# Let's check if the HuggingFace Hub API token is set in the environment variable. +# If it's not there, check if the `.hf_token` file is present in the current working directory. 
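+# A token is typically only needed for gated or private checkpoints; public UForm models download without one.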
+token = os.getenv("HUGGINGFACE_HUB_TOKEN", None) +if token is None: + token_path = "./.hf_token" + if os.path.exists(token_path): + with open(token_path, "r") as file: + token = file.read().strip() + + +def skip_on(exception, reason="No good reason :)"): + def decorator_func(f): + @wraps(f) + def wrapper(*args, **kwargs): + try: + # Try to run the test + return f(*args, **kwargs) + except exception: + pytest.skip(reason) + + return wrapper + + return decorator_func + + +def cosine_similarity(x, y) -> float: + if not isinstance(x, np.ndarray): + x = x.detach().numpy() + if not isinstance(y, np.ndarray): + y = y.detach().numpy() + + # Unlike NumPy, SimSIMD can properly deal with integer types + x = x.astype(np.float32).flatten() + y = y.astype(np.float32).flatten() + return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)) + + +def cross_references_image_and_text_embeddings(text_to_embedding, image_to_embedding, batch_size_multiple: int = 1): + """Test if the embeddings of text and image are semantically similar + using a small set of example text-image pairs.""" + + texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ] + + image_urls = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ] + assert len(texts) == len(image_urls), "Number of texts and images should be the same." 
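+    # Download each reference image once; a network failure here surfaces as a test error rather than a skip.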
+ + images = [Image.open(BytesIO(requests.get(image_url).content)) for image_url in image_urls] + count_pairs = len(texts) + + # Ensure we have a sufficiently large batch + texts = texts * batch_size_multiple + images = images * batch_size_multiple + + # Compute the embedding in a batch fashion + text_embeddings = text_to_embedding(texts) + image_embeddings = image_to_embedding(images) + + # Evaluate cosine similarity + for i in range(count_pairs): + pair_similarity = cosine_similarity(text_embeddings[i], image_embeddings[i]) + other_text_similarities = [ + cosine_similarity(text_embeddings[j], image_embeddings[i]) for j in range(count_pairs) if j != i + ] + other_image_similarities = [ + cosine_similarity(text_embeddings[i], image_embeddings[j]) for j in range(count_pairs) if j != i + ] + + assert pair_similarity > max( + other_text_similarities + ), "Text should be more similar to its corresponding image than to other images." + assert pair_similarity > max( + other_image_similarities + ), "Image should be more similar to its corresponding text than to other texts." + + +@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") +@pytest.mark.parametrize("model_name", torch_models) +def test_torch_one_embedding(model_name: str): + processors, models = get_model(model_name, token=token, backend="torch") + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + + text = "a small red panda in a zoo" + image_path = "assets/unum.png" + + image = Image.open(image_path) + image_data = processor_image(image) + text_data = processor_text(text) + + image_features, image_embedding = model_image.encode(image_data, return_features=True) + text_features, text_embedding = model_text.encode(text_data, return_features=True) + + assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1" + assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1" + + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + lambda text: model_text(processor_text(text)), + lambda image: model_image(processor_image(image)), + ) + + +@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") +@pytest.mark.parametrize("model_name", torch_models) +@pytest.mark.parametrize("batch_size", [1, 2]) +def test_torch_many_embeddings(model_name: str, batch_size: int): + + processors, models = get_model(model_name, token=token, backend="torch") + model_text = models[Modality.TEXT_ENCODER] + model_image = models[Modality.IMAGE_ENCODER] + processor_text = processors[Modality.TEXT_ENCODER] + processor_image = processors[Modality.IMAGE_ENCODER] + + texts = ["a small red panda in a zoo"] * batch_size + image_paths = ["assets/unum.png"] * batch_size + + images = [Image.open(path) for path in image_paths] + image_data = processor_image(images) + text_data = processor_text(texts) + + image_embeddings = model_image.encode(image_data, return_features=False) + text_embeddings = model_text.encode(text_data, return_features=False) + + assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected" + assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected" + + +@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed") +@pytest.mark.parametrize("model_name", onnx_models) +@pytest.mark.parametrize("device", ["CPUExecutionProvider"]) 
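+# Only the CPU execution provider is exercised here; adding "CUDAExecutionProvider" would cover GPU runs where available.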
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
+def test_onnx_one_embedding(model_name: str, device: str):
+
+    processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    text = "a small red panda in a zoo"
+    image_path = "assets/unum.png"
+
+    image = Image.open(image_path)
+    image_data = processor_image(image)
+    text_data = processor_text(text)
+
+    image_features, image_embedding = model_image.encode(image_data)
+    text_features, text_embedding = model_text.encode(text_data)
+
+    assert image_embedding.shape[0] == 1, "Image embedding batch size is not 1"
+    assert text_embedding.shape[0] == 1, "Text embedding batch size is not 1"
+
+    # Nested functions are easier to debug than lambdas
+    def get_image_embedding(image_data):
+        features, embedding = model_image.encode(processor_image(image_data))
+        return embedding
+
+    def get_text_embedding(text_data):
+        features, embedding = model_text.encode(processor_text(text_data))
+        return embedding
+
+    # Test if the model outputs actually make sense
+    cross_references_image_and_text_embeddings(get_text_embedding, get_image_embedding)
+
+
+@pytest.mark.skipif(not onnx_available, reason="ONNX is not installed")
+@pytest.mark.parametrize("model_name", onnx_models)
+@pytest.mark.parametrize("batch_size", [1, 2])
+@pytest.mark.parametrize("device", ["CPUExecutionProvider"])
+@skip_on(ExecutionProviderError, reason="Missing execution provider")
+def test_onnx_many_embeddings(model_name: str, batch_size: int, device: str):
+
+    processors, models = get_model(model_name, token=token, device=device, backend="onnx")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    texts = ["a small red panda in a zoo"] * batch_size
+    image_paths = ["assets/unum.png"] * batch_size
+
+    images = [Image.open(path) for path in image_paths]
+    image_data = processor_image(images)
+    text_data = processor_text(texts)
+
+    image_embeddings = model_image.encode(image_data, return_features=False)
+    text_embeddings = model_text.encode(text_data, return_features=False)
+
+    assert image_embeddings.shape[0] == batch_size, "Image embedding is unexpected"
+    assert text_embeddings.shape[0] == batch_size, "Text embedding is unexpected"
+
+
+@pytest.mark.skipif(not torch_available, reason="PyTorch is not installed")
+@pytest.mark.parametrize("model_name", torch_models[:1])
+def test_torch_multi_gpu(model_name: str):
+
+    count_cuda_devices = torch.cuda.device_count()
+    if count_cuda_devices < 2:
+        pytest.skip("Not enough CUDA devices to run multi-GPU test")
+
+    processors, models = get_model(model_name, token=token, backend="torch", device="cuda")
+    model_text = models[Modality.TEXT_ENCODER]
+    model_image = models[Modality.IMAGE_ENCODER]
+    processor_text = processors[Modality.TEXT_ENCODER]
+    processor_image = processors[Modality.IMAGE_ENCODER]
+
+    import torch.nn as nn
+
+    model_text.return_features = False
+    model_image.return_features = False
+    model_text_parallel = nn.DataParallel(model_text)
+    model_image_parallel = nn.DataParallel(model_image)
+
+    # Nested functions are easier to debug than lambdas
+    def get_image_embedding(image_data):
+        preprocessed = processor_image(image_data)
+        embedding = 
model_image_parallel.forward(preprocessed) + return embedding.detach().cpu().numpy() + + def get_text_embedding(text_data): + preprocessed = processor_text(text_data) + embedding = model_text_parallel.forward(preprocessed) + return embedding.detach().cpu().numpy() + + # Test if the model outputs actually make sense + cross_references_image_and_text_embeddings( + get_text_embedding, + get_image_embedding, + batch_size_multiple=count_cuda_devices, + ) + + +if __name__ == "__main__": + # If you want to run this test file individually, you can do so by running: + # pytest.main(["-s", "-x", __file__]) + pass diff --git a/python/uform/__init__.py b/python/uform/__init__.py index 1ecb242..7af8b75 100755 --- a/python/uform/__init__.py +++ b/python/uform/__init__.py @@ -1,59 +1,191 @@ -from json import load -from os.path import join -from typing import Mapping, Optional, Tuple - -from huggingface_hub import snapshot_download - - -def get_checkpoint(model_name: str, token: str) -> Tuple[str, Mapping, str]: - import torch - - model_path = snapshot_download(repo_id=model_name, token=token) - config_path = join(model_path, "torch_config.json") - - state = torch.load(join(model_path, "torch_weight.pt")) - return config_path, state, join(model_path, "tokenizer.json") - - -def get_model(model_name: str, token: Optional[str] = None): - from uform.torch_models import VLM - from uform.torch_preprocessor import TorchProcessor - - config_path, state, tokenizer_path = get_checkpoint(model_name, token) - - with open(config_path) as f: - config = load(f) - - model = VLM(config, tokenizer_path) - model.image_encoder.load_state_dict(state["image_encoder"]) - model.text_encoder.load_state_dict(state["text_encoder"]) - processor = TorchProcessor(config, tokenizer_path) - - return model.eval(), processor - - -def get_model_onnx(model_name: str, device: str, dtype: str, token: Optional[str] = None): - from uform.onnx_models import VLM_ONNX - from uform.numpy_preprocessor import NumPyProcessor - - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - - model_path = snapshot_download(repo_id=f"{model_name}-{device}-{dtype}", token=token) - - with open(join(model_path, "config.json")) as f: - config = load(f) - - model = VLM_ONNX(model_path, config, device, dtype) - processor = NumPyProcessor(config, join(model_path, "tokenizer.json")) - - return model, processor +from os.path import join, exists +from typing import Dict, Optional, Tuple, Literal, Union, Callable + +from huggingface_hub import snapshot_download, utils + +from uform.shared import ExecutionProviderError, Modality + + +def _normalize_modalities(modalities: Tuple[str, Modality]) -> Tuple[Modality]: + if modalities is None: + return (Modality.TEXT_ENCODER, Modality.IMAGE_ENCODER, Modality.TEXT_DECODER, Modality.VIDEO_ENCODER) + + return tuple(x if isinstance(x, Modality) else Modality(x) for x in modalities) + + +def get_checkpoint( + model_name: str, + modalities: Tuple[str, Modality], + token: Optional[str] = None, + format: Literal[".pt", ".onnx"] = ".pt", +) -> Tuple[str, Dict[Modality, str], Optional[str]]: + """Downloads a model checkpoint from the Hugging Face Hub. 
+ + :param model_name: The name of the model to download, like `unum-cloud/uform3-image-text-english-small` + :param token: The Hugging Face API token, if required + :param modalities: The modalities to download, like `("text_encoder", "image_encoder")` + :param format: The format of the model checkpoint, either `.pt` or `.onnx` + :return: A tuple of the config path, dictionary of paths to different modalities, and tokenizer path + """ + + modalities = _normalize_modalities(modalities) + + # It is not recommended to use `.pth` extension when checkpointing models + # because it collides with Python path (`.pth`) configuration files. + merged_model_names = [x + format for x in ["torch_weight", "weight", "model"]] + separate_modality_names = [(x.value if isinstance(x, Modality) else x) + format for x in modalities] + config_names = ["torch_config.json", "config.json"] + tokenizer_names = ["tokenizer.json"] + + old_progress_behavior = utils.are_progress_bars_disabled() + utils.disable_progress_bars() + + # The download stats depend on the number of times the `config.json` is pulled + # https://huggingface.co/docs/hub/models-download-stats + model_path = snapshot_download( + repo_id=model_name, + token=token, + allow_patterns=merged_model_names + separate_modality_names + config_names + tokenizer_names, + ) + + if old_progress_behavior: + utils.enable_progress_bars() + + # Find the first name in `config_names` that is present + config_path = None + for config_name in config_names: + if exists(join(model_path, config_name)): + config_path = join(model_path, config_name) + break + + # Same for the tokenizer + tokenizer_path = None + for tokenizer_name in tokenizer_names: + if exists(join(model_path, tokenizer_name)): + tokenizer_path = join(model_path, tokenizer_name) + break + + # Ideally, we want to separately fetch all the models. + # If those aren't available, aggregate separate modalities and merge them. + modality_paths = None + for file_name in merged_model_names: + if exists(join(model_path, file_name)): + modality_paths = join(model_path, file_name) + break + + if modality_paths is None: + modality_paths = {} + for separate_modality_name in separate_modality_names: + if exists(join(model_path, separate_modality_name)): + modality_name, _, _ = separate_modality_name.partition(".") + modality_paths[Modality(modality_name)] = join(model_path, separate_modality_name) + + return config_path, modality_paths, tokenizer_path + + +def get_model_torch( + model_name: str, + *, + token: Optional[str] = None, + device: Literal["cpu", "cuda"] = "cpu", + modalities: Optional[Tuple[Union[str, Modality]]] = None, +) -> Tuple[Dict[Modality, Callable], Dict]: + """ + Fetches and constructs a PyTorch model with its processors based on provided modalities. + + :param model_name: The identifier of the model on the Hugging Face Hub. + :param token: Optional API token for authenticated access to the model. + :param device: The device to load the model onto ('cpu' or 'cuda'). + :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). + :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. 
+ """ + from uform.torch_encoders import TextEncoder, ImageEncoder + from uform.torch_processors import TextProcessor, ImageProcessor + + modalities = _normalize_modalities(modalities) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".pt") + + result_processors = {} + result_models = {} + + if Modality.TEXT_ENCODER in modalities: + processor = TextProcessor(config_path, tokenizer_path) + encoder = TextEncoder.from_pretrained(config_path, modality_paths.get(Modality.TEXT_ENCODER)) + encoder = encoder.eval().to(device) + result_processors[Modality.TEXT_ENCODER] = processor + result_models[Modality.TEXT_ENCODER] = encoder + + if Modality.IMAGE_ENCODER in modalities: + processor = ImageProcessor(config_path) + encoder = ImageEncoder.from_pretrained(config_path, modality_paths.get(Modality.IMAGE_ENCODER)) + encoder = encoder.eval().to(device) + result_processors[Modality.IMAGE_ENCODER] = processor + result_models[Modality.IMAGE_ENCODER] = encoder + + return result_processors, result_models + + +def get_model_onnx( + model_name: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + token: Optional[str] = None, + modalities: Optional[Tuple[str]] = None, +): + """ + Fetches and constructs an ONNX model with its processors based on provided modalities. + + :param model_name: The identifier of the model on the Hugging Face Hub. + :param device: The device on which the model will operate ('cpu' or 'cuda'). + :param token: Optional API token for authenticated access to the model. + :param modalities: A tuple specifying the types of model components to fetch (e.g., text encoder). + :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. + """ + from uform.onnx_encoders import TextEncoder, ImageEncoder + from uform.numpy_processors import TextProcessor, ImageProcessor + + modalities = _normalize_modalities(modalities) + config_path, modality_paths, tokenizer_path = get_checkpoint(model_name, modalities, token=token, format=".onnx") + + result_processors = {} + result_models = {} + + if Modality.TEXT_ENCODER in modalities: + processor = TextProcessor(config_path, tokenizer_path) + encoder = TextEncoder(modality_paths.get(Modality.TEXT_ENCODER), device=device) + result_processors[Modality.TEXT_ENCODER] = processor + result_models[Modality.TEXT_ENCODER] = encoder + + if Modality.IMAGE_ENCODER in modalities: + processor = ImageProcessor(config_path) + encoder = ImageEncoder(modality_paths.get(Modality.IMAGE_ENCODER), device=device) + result_processors[Modality.IMAGE_ENCODER] = processor + result_models[Modality.IMAGE_ENCODER] = encoder + + return result_processors, result_models + + +def get_model( + model_name: str, + *, + device: Literal["cpu", "cuda"] = "cpu", # change this if you have a GPU + backend: Literal["onnx", "torch"] = "onnx", # lighter = better + modalities: Optional[Tuple[str, Modality]] = None, # all by default + token: Optional[str] = None, # optional HuggingFace Hub token for private models +) -> Tuple[Dict[Modality, Callable], Dict]: + """ + Fetches a model and its processors from the Hugging Face Hub, using either the ONNX or Torch backend. + + :param model_name: The identifier of the model on the Hugging Face Hub. + :param device: The device to load the model onto ('cpu' or 'cuda'). + :param backend: The backend framework to use ('onnx' or 'torch'). + :param modalities: A tuple specifying the types of model components to fetch. 
+ :param token: Optional API token for authenticated access to the model. + :return: A tuple containing dictionaries for processors and models keyed by their respective modalities. + """ + if backend == "onnx": + return get_model_onnx(model_name, device=device, token=token, modalities=modalities) + elif backend == "torch": + return get_model_torch(model_name, device=device, token=token, modalities=modalities) + else: + raise ValueError(f"Unknown backend: {backend}") diff --git a/python/uform/chat.py b/python/uform/chat.py index 5ef44b7..b9e4423 100644 --- a/python/uform/chat.py +++ b/python/uform/chat.py @@ -3,20 +3,16 @@ import requests import torch from PIL import Image -from transformers import TextStreamer - -from uform.gen_model import VLMForCausalLM, VLMProcessor - -EOS_TOKEN = 32001 +from transformers import TextStreamer, AutoModel, AutoProcessor def parse_args(): parser = ArgumentParser(description="Chat with UForm generative model") - parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat") - parser.add_argument("--image", type=str, help="", required=True) - parser.add_argument("--device", type=str, required=True) - parser.add_argument("--fp16", action="store_true") + parser.add_argument("--model", type=str, default="unum-cloud/uform-gen-chat", help="Model name or path") + parser.add_argument("--image", type=str, required=True, help="Path to image or URL") + parser.add_argument("--device", type=str, required=True, help="Device to run on, like `cpu` or `cuda:0`") + parser.add_argument("--fp16", action="store_true", help="Use half-precision math for faster inference") return parser.parse_args() @@ -30,22 +26,18 @@ def run_chat(opts, model, processor): messages = [{"role": "system", "content": "You are a helpful assistant."}] is_first_message = True + if opts.image.startswith("http"): - image = ( - processor.image_processor( - Image.open(requests.get(opts.image, stream=True).raw), - ) - .unsqueeze(0) - .to(torch.bfloat16 if opts.fp16 else torch.float32) - .to(opts.device) - ) + image = Image.open(requests.get(opts.image, stream=True).raw) else: - image = ( - processor.image_processor(Image.open(opts.image)) - .unsqueeze(0) - .to(torch.bfloat16 if opts.fp16 else torch.float32) - .to(opts.device) - ) + image = Image.open(opts.image) + + image = ( + processor.feature_extractor(image) # + .unsqueeze(0) + .to(torch.bfloat16 if opts.fp16 else torch.float32) + .to(opts.device) + ) while True: if messages[-1]["role"] in ("system", "assistant"): @@ -68,7 +60,7 @@ def run_chat(opts, model, processor): 1, input_ids.shape[1] + processor.num_image_latents - 1, ).to(opts.device) - x = { + inputs = { "input_ids": input_ids, "attention_mask": attention_mask, "images": image, @@ -76,18 +68,19 @@ def run_chat(opts, model, processor): print("Assistant: ", end="") with torch.inference_mode(): - y = model.generate( - **x, + output = model.generate( + **inputs, do_sample=False, use_cache=True, max_new_tokens=1024, - eos_token_id=EOS_TOKEN, + eos_token_id=151645, pad_token_id=processor.tokenizer.pad_token_id, streamer=streamer, ) print() - message = processor.batch_decode(y[:, x["input_ids"].shape[1] : -1])[0] + prompt_len = inputs["input_ids"].shape[1] + message = processor.batch_decode(output[:, prompt_len:-1])[0] messages.append({"role": "assistant", "content": message}) @@ -95,16 +88,17 @@ def run_chat(opts, model, processor): def main(): try: opts = parse_args() - + processor = AutoProcessor.from_pretrained(opts.model, trust_remote_code=True) model = ( - 
VLMForCausalLM.from_pretrained( + AutoModel.from_pretrained( opts.model, torch_dtype=torch.bfloat16 if opts.fp16 else torch.float32, + ignore_mismatched_sizes=True, + trust_remote_code=True, ) .eval() .to(opts.device) ) - processor = VLMProcessor.from_pretrained(opts.model) run_chat(opts, model, processor) diff --git a/python/uform/gen_model.py b/python/uform/gen_model.py index c03b6eb..6792120 100644 --- a/python/uform/gen_model.py +++ b/python/uform/gen_model.py @@ -1,464 +1 @@ -from typing import List, Optional, Tuple, Union - -import torch -import torch.nn.functional as F -from torch import nn -from torchvision.transforms import (CenterCrop, Compose, InterpolationMode, - Normalize, RandomResizedCrop, Resize, - ToTensor) -from transformers import AutoConfig, AutoTokenizer -from transformers.configuration_utils import PretrainedConfig -from transformers.modeling_outputs import CausalLMOutputWithPast -from transformers.modeling_utils import PreTrainedModel -from transformers.models.auto.modeling_auto import (AutoModel, - AutoModelForCausalLM) -from transformers.processing_utils import ProcessorMixin -from transformers.tokenization_utils_base import BatchEncoding - -from uform.torch_models import VisualEncoder - -IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) -IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) - - -def convert_to_rgb(image): - return image.convert("RGB") - - -class LayerScale(nn.Module): - def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False): - super().__init__() - self.weight = nn.Parameter(init_values * torch.ones(dim)) - self.inplace = inplace - - def forward(self, x): - return x.mul_(self.weight) if self.inplace else x * self.weight - - -class ImageFeaturesPooler(nn.Module): - def __init__( - self, - input_size, - hidden_size, - num_attn_heads, - intermediate_size, - num_latents, - initializer_range, - ): - super().__init__() - self.projection = nn.Linear(input_size, hidden_size) - - self.pooler = nn.TransformerDecoderLayer( - hidden_size, - num_attn_heads, - intermediate_size, - activation=nn.functional.silu, - batch_first=True, - norm_first=True, - ) - self.image_latents = nn.Parameter( - torch.randn(1, num_latents, hidden_size) * initializer_range**0.5, - ) - - def forward(self, features): - features = self.projection(features) - return self.pooler( - self.image_latents.expand(features.shape[0], -1, -1), - features, - ) - - -class VLMConfig(PretrainedConfig): - model_type = "vlm" - - def __init__( - self, - text_decoder_name_or_path: str = "", - tokenizer_name_or_path: str = "", - image_size: int = 224, - image_encoder_hidden_size: int = 768, - image_encoder_patch_size: int = 16, - image_encoder_num_layers: int = 12, - image_encoder_num_heads: int = 12, - image_encoder_embedding_dim: int = 256, - image_encoder_pooling: str = "cls", - image_pooler_num_attn_heads: int = 16, - image_pooler_intermediate_size: int = 5504, - image_pooler_num_latents: int = 196, - image_token_id: int = 32002, - initializer_range: float = 0.02, - use_cache: bool = True, - center_crop: bool = True, - **kwargs, - ): - self.text_decoder_name_or_path = text_decoder_name_or_path - self.tokenizer_name_or_path = tokenizer_name_or_path - - self.image_size = image_size - self.image_encoder_hidden_size = image_encoder_hidden_size - self.image_encoder_patch_size = image_encoder_patch_size - self.image_encoder_num_layers = image_encoder_num_layers - self.image_encoder_num_heads = image_encoder_num_heads - self.image_encoder_embedding_dim = image_encoder_embedding_dim - 
self.image_encoder_pooling = image_encoder_pooling - - self.image_pooler_num_attn_heads = image_pooler_num_attn_heads - self.image_pooler_intermediate_size = image_pooler_intermediate_size - self.image_pooler_num_latents = image_pooler_num_latents - - self.image_token_id = image_token_id - - self.initializer_range = initializer_range - self.use_cache = use_cache - self.center_crop = center_crop - - super().__init__(**kwargs) - - -class VLMPreTrainedModel(PreTrainedModel): - config_class = VLMConfig - base_model_prefix = "vlm" - supports_gradient_checkpointing = True - _no_split_modules = [] - _skip_keys_device_placement = "past_key_values" - - def _init_weights(self, module): - pass - - def _initialize_weights(self, module): - pass - - -class VLMForCausalLM(VLMPreTrainedModel): - def __init__(self, config: VLMConfig): - super().__init__(config) - - self.config = config - self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path) - self.text_config.vocab_size += 3 - self.text_decoder = AutoModelForCausalLM.from_config(self.text_config) - - self.image_encoder = VisualEncoder( - self.config.image_encoder_hidden_size, - self.config.image_encoder_patch_size, - self.config.image_size, - self.config.image_encoder_num_layers, - self.config.image_encoder_num_heads, - self.config.image_encoder_embedding_dim, - self.config.image_encoder_pooling, - ) - - # replace models' layerscales because `transformers` automatically renames keys in state_dict - for i in range(len(self.image_encoder.blocks)): - self.image_encoder.blocks[i].ls1 = LayerScale( - self.image_encoder.blocks[i].ls1.dim, - ) - self.image_encoder.blocks[i].ls2 = LayerScale( - self.image_encoder.blocks[i].ls2.dim, - ) - - self.image_pooler = ImageFeaturesPooler( - self.config.image_encoder_hidden_size, - self.text_config.hidden_size, - self.config.image_pooler_num_attn_heads, - self.config.image_pooler_intermediate_size, - self.config.image_pooler_num_latents, - self.config.initializer_range, - ) - - def get_input_embeddings(self): - return self.text_decoder.get_input_embeddings() - - def set_input_embeddings(self, value): - self.text_decoder.set_input_embeddings(value) - - def get_images_embeddings(self, images): - features = self.image_encoder.forward_features(images) - return self.image_pooler(features) - - def gather_continuous_embeddings( - self, - input_ids: torch.Tensor, - word_embeddings: torch.Tensor, - image_embeddings: torch.Tensor, - ) -> torch.Tensor: - start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1] - embeddings = [] - - for sample_idx, start_idx in enumerate(start_indices.tolist()): - embeddings.append( - torch.cat( - ( - word_embeddings[sample_idx, :start_idx], - image_embeddings[sample_idx], - word_embeddings[sample_idx, start_idx + 1 :], - ), - dim=0, - ), - ) - - return torch.stack(embeddings, dim=0) - - def forward( - self, - input_ids: torch.LongTensor = None, - images: torch.Tensor = None, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_values: Optional[List[torch.FloatTensor]] = None, - inputs_embeds: Optional[torch.FloatTensor] = None, - use_cache: Optional[bool] = None, - labels: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[dict, Tuple, CausalLMOutputWithPast]: - output_attentions = ( - output_attentions - if output_attentions is not None - else self.config.output_attentions - ) - 
output_hidden_states = ( - output_hidden_states - if output_hidden_states is not None - else self.config.output_hidden_states - ) - use_cache = use_cache if use_cache is not None else self.config.use_cache - - return_dict = ( - return_dict if return_dict is not None else self.config.use_return_dict - ) - - if input_ids is not None and inputs_embeds is not None: - raise ValueError( - "You cannot specify both input_ids and inputs_embeds at the same time", - ) - elif input_ids is None and inputs_embeds is None: - raise ValueError("You have to specify either input_is or inputs_embeds") - - if inputs_embeds is None and past_key_values is None: - inputs_embeds = self.get_input_embeddings()(input_ids) - - if images is not None: - image_embeds = self.get_images_embeddings(images) - inputs_embeds = self.gather_continuous_embeddings( - input_ids, - inputs_embeds, - image_embeds, - ) - - if position_ids is None: - seq_length = ( - inputs_embeds.shape[1] - if inputs_embeds is not None - else input_ids.shape[1] - ) - past_key_values_length = 0 - - if past_key_values is not None: - past_key_values_length = past_key_values[0][0].shape[2] - - device = input_ids.device if input_ids is not None else inputs_embeds.device - position_ids = torch.arange( - past_key_values_length, - seq_length + past_key_values_length, - dtype=torch.long, - device=device, - ) - position_ids = position_ids.unsqueeze(0) - - outputs = self.text_decoder( - inputs_embeds=inputs_embeds, - input_ids=input_ids if past_key_values is not None else None, - attention_mask=attention_mask, - labels=labels, - position_ids=position_ids, - past_key_values=past_key_values, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - use_cache=use_cache, - return_dict=return_dict, - ) - - return outputs - - def prepare_inputs_for_generation( - self, - input_ids, - images=None, - past_key_values=None, - attention_mask=None, - inputs_embeds=None, - **kwargs, - ): - if past_key_values: - input_ids = input_ids[:, -1:] - - position_ids = kwargs.get("position_ids", None) - if attention_mask is not None and position_ids is None: - # create position_ids on the fly for batch generation - position_ids = attention_mask.long().cumsum(-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - if past_key_values: - position_ids = position_ids[:, -1].unsqueeze(-1) - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and past_key_values is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - if images is not None: - model_inputs["images"] = images - - model_inputs.update( - { - "position_ids": position_ids, - "past_key_values": past_key_values, - "use_cache": kwargs.get("use_cache"), - "attention_mask": attention_mask, - "images": images if past_key_values is None else None, - }, - ) - return model_inputs - - @classmethod - def from_config(cls, config, **kwargs): - return cls._from_config(config, **kwargs) - - -class VLMProcessor(ProcessorMixin): - def __init__(self, config, **kwargs): - self.feature_extractor = None - self.config = config - - if config.center_crop: - self.image_processor = Compose( - [ - Resize(256, interpolation=InterpolationMode.BICUBIC), - CenterCrop(config.image_size), - convert_to_rgb, - ToTensor(), - Normalize( - mean=IMAGENET_MEAN, - std=IMAGENET_STD, - ), - ], - ) - else: - self.image_processor = Compose( - [ - RandomResizedCrop( - config.image_size, - scale=(0.8, 1), - 
interpolation=InterpolationMode.BICUBIC, - ), - convert_to_rgb, - ToTensor(), - Normalize( - mean=IMAGENET_MEAN, - std=IMAGENET_STD, - ), - ], - ) - - self.tokenizer = AutoTokenizer.from_pretrained( - config.tokenizer_name_or_path, - additional_special_tokens=["<|im_end|>"], - ) - self.num_image_latents = config.image_pooler_num_latents - - def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): - if texts is not None: - if isinstance(texts, str): - texts = [texts] - - tokenized_texts = [] - for text in texts: - messages = [ - {"role": "system", "content": "You are a helpful assistant."}, - {"role": "user", "content": f" {text}"}, - ] - tokenized_prompt = self.tokenizer.apply_chat_template( - messages, - add_generation_prompt=True, - return_tensors=return_tensors, - ) - - tokenized_texts.append(tokenized_prompt) - - max_len = max(len(t[0]) for t in tokenized_texts) - input_ids = torch.full( - (len(tokenized_texts), max_len), - fill_value=self.tokenizer.pad_token_id, - dtype=torch.int64, - ) - attention_mask = torch.full( - (len(tokenized_texts), max_len), - fill_value=0, - dtype=torch.int64, - ) - - for i, tokens in enumerate(tokenized_texts): - input_ids[i, -len(tokens[0]) :] = tokens[0] - attention_mask[i, -len(tokens[0]) :] = 1 - - attention_mask = F.pad( - attention_mask, - pad=(0, self.num_image_latents - 1), - value=1, - ) - - encoding = BatchEncoding( - data={"input_ids": input_ids, "attention_mask": attention_mask}, - ) - - if images is not None: - if isinstance(images, (list, tuple)): - image_features = torch.empty( - (len(images), 3, self.config.image_size, self.config.image_size), - dtype=torch.float32, - ) - - for i, image in enumerate(images): - image_features[i] = self.image_processor(image) - else: - image_features = self.image_processor(images).unsqueeze(0) - - if texts is not None and images is not None: - encoding["images"] = image_features - return encoding - - if texts is not None: - return encoding - - return BatchEncoding( - data={ - "images": image_features, - }, - tensor_type=return_tensors, - ) - - def batch_decode(self, *args, **kwargs): - return self.tokenizer.batch_decode(*args, **kwargs) - - def decode(self, *args, **kwargs): - return self.tokenizer.decode(*args, **kwargs) - - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path, - cache_dir=None, - force_download: bool = False, - local_files_only: bool = False, - token=None, - revision: str = "main", - **kwargs, - ): - config = AutoConfig.from_pretrained(pretrained_model_name_or_path) - return cls(config) - - -AutoConfig.register("vlm", VLMConfig) -AutoModel.register(VLMConfig, VLMForCausalLM) +from uform.torch_decoders import VLMForCausalLM, VLMProcessor # legacy path diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_processors.py similarity index 62% rename from python/uform/numpy_preprocessor.py rename to python/uform/numpy_processors.py index a556db4..166ecf4 100644 --- a/python/uform/numpy_preprocessor.py +++ b/python/uform/numpy_processors.py @@ -1,29 +1,31 @@ from os import PathLike -from typing import Dict, List, Union +from typing import Dict, List, Union, Sequence +import json from PIL.Image import Image, BICUBIC from tokenizers import Tokenizer import numpy as np +from uform.shared import read_config -class NumPyProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): + +class TextProcessor: + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to 
tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ - self._image_size = config["image_encoder"]["image_size"] - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] + config = read_config(config_path) + if "text_encoder" in config: + config = config["text_encoder"] + + self._max_seq_len = config["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] - - self.image_mean = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)[None, None] - self.image_std = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)[None, None] + self._pad_token_idx = config["padding_idx"] - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray]: + def __call__(self, texts: Union[str, Sequence[str]]) -> Dict[str, np.ndarray]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. :param texts: text of list of texts to tokenizer @@ -34,7 +36,7 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray] input_ids = np.full( (len(texts), self._max_seq_len), fill_value=self._pad_token_idx, - dtype=np.int64, + dtype=np.int32, ) attention_mask = np.zeros( @@ -51,13 +53,37 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, np.ndarray] return {"input_ids": input_ids, "attention_mask": attention_mask} - def preprocess_image(self, images: Union[Image, List[Image]]) -> np.ndarray: + +class ImageProcessor: + def __init__(self, config_path: PathLike, tokenizer_path: PathLike = None): + """ + :param config: model config + :param tokenizer_path: path to tokenizer file + :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) + """ + + config = read_config(config_path) + if "image_encoder" in config: + config = config["image_encoder"] + + self._image_size = config["image_size"] + self._normalization_means = config["normalization_means"] + self._normalization_deviations = config["normalization_deviations"] + + assert isinstance(self._image_size, int) and self._image_size > 0 + assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) + assert len(self._normalization_means) == len(self._normalization_deviations) == 3 + + self.image_mean = np.array(self._normalization_means, dtype=np.float32)[None, None] + self.image_std = np.array(self._normalization_deviations, dtype=np.float32)[None, None] + + def __call__(self, images: Union[Image, Sequence[Image]]) -> np.ndarray: """Transforms one or more Pillow images into Torch Tensors. :param images: image or list of images to preprocess """ - if isinstance(images, list): + if isinstance(images, Sequence): batch_images = np.empty( (len(images), 3, self._image_size, self._image_size), dtype=np.float32, diff --git a/python/uform/onnx_encoders.py b/python/uform/onnx_encoders.py new file mode 100644 index 0000000..b9c4cc4 --- /dev/null +++ b/python/uform/onnx_encoders.py @@ -0,0 +1,139 @@ +from os import PathLike +from typing import Dict, Optional, Tuple, Union, Literal +import json + +import onnxruntime as ort +from numpy import ndarray + +from uform.shared import ExecutionProviderError + + +def available_providers(device: Optional[str]) -> Tuple[str, ...]: + """Returns a tuple of available execution providers based on the requested device. 
+ https://onnxruntime.ai/docs/execution-providers/ + + :param device: Device name, either `cpu` or `gpu`, or a specific execution provider name. + :return: Tuple of available execution providers. + :raises ExecutionProviderError: If the requested device is not available. + """ + + gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") + cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") + available = ort.get_available_providers() + + # If no target device is specified, let's sort all the available ones with respect to our preference + if device is None: + preferences = gpu_providers + cpu_providers + filtered_preferences = tuple(provider for provider in preferences if provider in available) + if len(filtered_preferences): + return filtered_preferences + if len(available): + return available + raise ExecutionProviderError("No execution providers are available") + + # If a GPU is requested, but no GPU providers are available, raise an error + if device == "gpu" or device == "cuda": + if all(provider not in available for provider in gpu_providers): + raise ExecutionProviderError( + f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. Currently installed: {available}" + ) + return [x for x in gpu_providers if x in available] + + # If a CPU is requested, but no CPU providers are available, raise an error + if device == "cpu": + if all(provider not in available for provider in cpu_providers): + raise ExecutionProviderError( + f"CPU providers are not available, consider installing `onnxruntime` and make sure the OpenVINO and CoreML are available on your system. Currently installed: {available}" + ) + return [x for x in cpu_providers if x in available] + + if device not in available: + available_providers = ", ".join(available) + raise ExecutionProviderError( + f"Execution provider {device} is not available. 
Currently installed: {available_providers}" + ) + + return (device,) + + +class ImageEncoder: + def __init__( + self, + model_path: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + return_features: bool = True, + ): + """ + :param model_path: Path to onnx model + :param device: Device name, either cpu or gpu + """ + + session_options = ort.SessionOptions() + session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + + self.return_features = return_features + self.session = ort.InferenceSession( + model_path, + sess_options=session_options, + providers=available_providers(device), + ) + + def encode( + self, images: ndarray, return_features: Optional[bool] = None + ) -> Union[ndarray, Tuple[ndarray, ndarray]]: + features, embeddings = self.session.run(None, {"images": images}) + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings + + +class TextEncoder: + def __init__( + self, + model_path: str, + *, + device: Literal["cpu", "cuda"] = "cpu", + return_features: bool = True, + ): + """ + :param text_encoder_path: Path to onnx of text encoder + :param device: Device name, either cpu or gpu + """ + + session_options = ort.SessionOptions() + session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL + + self.return_features = return_features + self.text_encoder_session = ort.InferenceSession( + model_path, + sess_options=session_options, + providers=available_providers(device), + ) + + def encode( + self, + x: Union[ndarray, dict], + attention_mask: Optional[ndarray] = None, + return_features: Optional[bool] = None, + ) -> Union[ndarray, Tuple[ndarray, ndarray]]: + if isinstance(x, dict): + assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" + attention_mask = x["attention_mask"] + input_ids = x["input_ids"] + else: + input_ids = x + + features, embeddings = self.text_encoder_session.run( + None, + { + "input_ids": input_ids, + "attention_mask": attention_mask, + }, + ) + + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings diff --git a/python/uform/onnx_models.py b/python/uform/onnx_models.py deleted file mode 100644 index 8e2a87a..0000000 --- a/python/uform/onnx_models.py +++ /dev/null @@ -1,231 +0,0 @@ -from os.path import join -from typing import Dict, Optional, Tuple, Union - -import onnxruntime as ort -from numpy import ndarray - - -class ExecutionProviderError(Exception): - """Exception raised when a requested execution provider is not available.""" - - -def available_providers(device: str) -> Tuple[str, ...]: - gpu_providers = ("CUDAExecutionProvider", "TensorrtExecutionProvider") - cpu_providers = ("OpenVINOExecutionProvider", "CoreMLExecutionProvider", "CPUExecutionProvider") - available = ort.get_available_providers() - if device == "gpu": - if all(provider not in available for provider in gpu_providers): - raise ExecutionProviderError( - f"GPU providers are not available, consider installing `onnxruntime-gpu` and make sure the CUDA is available on your system. 
Currently installed: {available}" - ) - return gpu_providers - - return cpu_providers - - -class VisualEncoderONNX: - def __init__(self, model_path: str, device: str): - """ - :param model_path: Path to onnx model - :param device: Device name, either cpu or gpu - """ - - session_options = ort.SessionOptions() - session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL - - self.session = ort.InferenceSession( - model_path, - sess_options=session_options, - providers=available_providers(device), - ) - - def __call__(self, images: ndarray) -> Tuple[ndarray, ndarray]: - return self.session.run(None, {"images": images}) - - -class TextEncoderONNX: - def __init__(self, text_encoder_path: str, reranker_path: str, device: str): - """ - :param text_encoder_path: Path to onnx of text encoder - :param reranker_path: Path to onnx of reranker - :param device: Device name, either cpu or gpu - """ - - session_options = ort.SessionOptions() - session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL - - self.text_encoder_session = ort.InferenceSession( - text_encoder_path, - sess_options=session_options, - providers=available_providers(device), - ) - - self.reranker_session = ort.InferenceSession( - reranker_path, - sess_options=session_options, - providers=available_providers(device), - ) - - def __call__(self, input_ids: ndarray, attention_mask: ndarray) -> Tuple[ndarray, ndarray]: - return self.text_encoder_session.run(None, {"input_ids": input_ids, "attention_mask": attention_mask}) - - def forward_multimodal( - self, text_features: ndarray, attention_mask: ndarray, image_features: ndarray - ) -> Tuple[ndarray, ndarray]: - return self.reranker_session.run( - None, - { - "text_features": text_features, - "attention_mask": attention_mask, - "image_features": image_features, - }, - ) - - -class VLM_ONNX: - def __init__(self, checkpoint_path: str, config: Dict, device: str, dtype: str): - assert device in ( - "cpu", - "gpu", - ), f"Invalid `device`: {device}. Must be either `cpu` or `gpu`" - assert dtype in ( - "fp32", - "fp16", - ), f"Invalid `dtype`: {dtype}. Must be either `fp32` or `fp16` (only for gpu)" - assert ( - device == "cpu" and dtype == "fp32" - ) or device == "gpu", "Combination `device`=`cpu` & `dtype=fp16` is not supported" - - self.device = device - self.dtype = dtype - - self._embedding_dim = config["text_encoder"]["embedding_dim"] - self._text_encoder_dim = config["text_encoder"]["dim"] - self._image_encoder_dim = config["image_encoder"]["dim"] - - self.text_encoder = TextEncoderONNX( - join(checkpoint_path, f"text_encoder.onnx"), - join(checkpoint_path, f"reranker.onnx"), - device, - ) - - self.image_encoder = VisualEncoderONNX(join(checkpoint_path, f"image_encoder.onnx"), device) - - def encode_image( - self, - images: ndarray, - return_features: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings. - - :param images: Preprocessed image - :param return_features: Whether to return images features or return only embeddings - """ - - features, embeddings = self.image_encoder(images) - - if return_features: - return features, embeddings - - return embeddings - - def encode_text( - self, - texts: Dict[str, ndarray], - return_features: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings. 
- - :param texts: Dictionary with tokenized texts and attention masks - :param return_features: Whether to return texts features or return only embeddings - """ - - features, embeddings = self.text_encoder(**texts) - - if return_features: - return features, embeddings - - return embeddings - - def encode_multimodal( - self, - image: Optional[ndarray] = None, - text: Dict[str, ndarray] = None, - image_features: Optional[ndarray] = None, - text_features: Optional[ndarray] = None, - attention_mask: Optional[ndarray] = None, - return_scores: bool = False, - ) -> Union[ndarray, Tuple[ndarray, ndarray]]: - """Passes preprocessed texts (or precomputed texts features) and - preprocessed images (or precomputed images features) through multimodal encoded to produce matching scores and optionally multimodal joint embeddings. - - :param image: Preprocessed images - :param text: Preprocessed texts - :param image_features: Precomputed images features - :param text_features: Precomputed text features - :param attention_mask: Attention masks, not required if pass `text` instead of text_features - """ - - assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None" - assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None" - - if text_features is not None: - assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`" - - if image_features is None: - image_features = self.image_encoder(image) - - if text_features is None: - text_features = self.text_encoder( - text["input_ids"], - text["attention_mask"], - ) - - matching_scores, embeddings = self.text_encoder.forward_multimodal( - text_features, - attention_mask if attention_mask is not None else text["attention_mask"], - image_features, - ) - - if return_scores: - return matching_scores, embeddings - - return embeddings - - def forward( - self, - images: ndarray, - texts: Dict[str, ndarray], - ) -> Union[ndarray, ndarray]: - """Inference forward method - - :param images: Preprocessed images - :param texts: Preprocessed texts - :return: embeddings for images and texts - """ - _, image_embeddings = self.image_encoder(images) - _, text_embeddings = self.text_encoder(texts) - return image_embeddings, text_embeddings - - @property - def text_features_dim(self) -> int: - """Dimensionality of the text encoder features.""" - - return self._text_encoder_dim - - @property - def image_features_dim(self) -> int: - """Dimensionality of the image encoder features.""" - - return self._image_encoder_dim - - @property - def embedding_dim(self) -> int: - """Dimensionality of shared space embedding.""" - - return self._embedding_dim - - @property - def multimodal_embedding_dim(self) -> int: - """Dimensionality of multimodal joint embedding.""" - return self._text_encoder_dim diff --git a/python/uform/preprocessing.py b/python/uform/preprocessing.py deleted file mode 100644 index d3d833e..0000000 --- a/python/uform/preprocessing.py +++ /dev/null @@ -1,105 +0,0 @@ -from os import PathLike -from typing import Dict, List, Union - -import torch -from PIL import Image -from tokenizers import Tokenizer -from torch import Tensor -from torchvision.transforms import (CenterCrop, Compose, InterpolationMode, - Normalize, Resize, ToTensor) - - -# lambda is not pickable -def convert_to_rgb(image): - return image.convert("RGB") - - -class Processor: - def __init__(self, config: Dict, tokenizer_path: PathLike, tensor_type: str = "pt"): - 
""" - :param config: model config - :param tokenizer_path: path to tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) - """ - - assert tensor_type in ("pt", "np"), "`tensor_type` must be either `pt` or `np`" - - self._image_size = config["image_encoder"]["image_size"] - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] - self._tokenizer = Tokenizer.from_file(tokenizer_path) - self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] - - self.tensor_type = tensor_type - - self._image_transform = Compose( - [ - Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), - convert_to_rgb, - CenterCrop(self._image_size), - ToTensor(), - Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ], - ) - - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: - """Transforms one or more strings into dictionary with tokenized strings and attention masks. - - :param texts: text of list of texts to tokenizer - """ - if isinstance(texts, str): - texts = [texts] - - input_ids = torch.full( - (len(texts), self._max_seq_len), - fill_value=self._pad_token_idx, - dtype=torch.int64, - ) - - attention_mask = torch.zeros( - len(texts), - self._max_seq_len, - dtype=torch.int32, - ) - encoded = self._tokenizer.encode_batch(texts) - - for i, seq in enumerate(encoded): - seq_len = min(len(seq), self._max_seq_len) - input_ids[i, :seq_len] = torch.LongTensor( - seq.ids[: self._max_seq_len], - ) - attention_mask[i, :seq_len] = 1 - - if self.tensor_type == "np": - return { - "input_ids": input_ids.numpy(), - "attention_mask": attention_mask.numpy(), - } - - return {"input_ids": input_ids, "attention_mask": attention_mask} - - def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor: - """Transforms one or more Pillow images into Torch Tensors. 
- - :param images: image or list of images to preprocess - """ - - if isinstance(images, list): - batch_images = torch.empty( - (len(images), 3, self._image_size, self._image_size), - dtype=torch.float32, - ) - - for i, image in enumerate(images): - batch_images[i] = self._image_transform(image) - - else: - batch_images = self._image_transform(images).unsqueeze(0) - - if self.tensor_type == "np": - return batch_images.numpy() - - return batch_images diff --git a/python/uform/shared.py b/python/uform/shared.py new file mode 100644 index 0000000..37d256b --- /dev/null +++ b/python/uform/shared.py @@ -0,0 +1,26 @@ +from enum import Enum +from typing import Union +from os import PathLike +import json + + +class Modality(Enum): + TEXT_ENCODER = "text_encoder" + IMAGE_ENCODER = "image_encoder" + VIDEO_ENCODER = "video_encoder" + TEXT_DECODER = "text_decoder" + + +class ExecutionProviderError(Exception): + """Exception raised when a requested execution provider is not available.""" + + +ConfigOrPath = Union[PathLike, str, object] + + +def read_config(path_or_object: ConfigOrPath) -> object: + if isinstance(path_or_object, (PathLike, str)): + with open(path_or_object, "r") as f: + return json.load(f) + else: + return path_or_object diff --git a/python/uform/torch_decoders.py b/python/uform/torch_decoders.py new file mode 100644 index 0000000..475f5b0 --- /dev/null +++ b/python/uform/torch_decoders.py @@ -0,0 +1,469 @@ +from typing import List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn +from torchvision.transforms import ( + CenterCrop, + Compose, + InterpolationMode, + Normalize, + RandomResizedCrop, + Resize, + ToTensor, +) +from transformers import AutoConfig, AutoTokenizer +from transformers.configuration_utils import PretrainedConfig +from transformers.modeling_outputs import CausalLMOutputWithPast +from transformers.modeling_utils import PreTrainedModel +from transformers.models.auto.modeling_auto import AutoModel, AutoModelForCausalLM +from transformers.processing_utils import ProcessorMixin +from transformers.tokenization_utils_base import BatchEncoding + +from uform.torch_encoders import ImageEncoder + +IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073) +IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711) + + +def convert_to_rgb(image): + return image.convert("RGB") + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values: float = 1e-5, inplace: bool = False): + super().__init__() + self.weight = nn.Parameter(init_values * torch.ones(dim)) + self.inplace = inplace + + def forward(self, x): + return x.mul_(self.weight) if self.inplace else x * self.weight + + +class ImageFeaturesPooler(nn.Module): + def __init__( + self, + input_size, + hidden_size, + num_attn_heads, + intermediate_size, + num_latents, + initializer_range, + ): + super().__init__() + self.projection = nn.Linear(input_size, hidden_size) + + self.pooler = nn.TransformerDecoderLayer( + hidden_size, + num_attn_heads, + intermediate_size, + activation=nn.functional.silu, + batch_first=True, + norm_first=True, + ) + self.image_latents = nn.Parameter( + torch.randn(1, num_latents, hidden_size) * initializer_range**0.5, + ) + + def forward(self, features): + features = self.projection(features) + return self.pooler( + self.image_latents.expand(features.shape[0], -1, -1), + features, + ) + + +class VLMConfig(PretrainedConfig): + model_type = "vlm" + + def __init__( + self, + text_decoder_name_or_path: str = "", + tokenizer_name_or_path: str = "", + image_size: int = 
224, + image_encoder_hidden_size: int = 768, + image_encoder_patch_size: int = 16, + image_encoder_num_layers: int = 12, + image_encoder_num_heads: int = 12, + image_encoder_embedding_dim: int = 256, + image_encoder_pooling: str = "cls", + image_pooler_num_attn_heads: int = 16, + image_pooler_intermediate_size: int = 5504, + image_pooler_num_latents: int = 196, + image_token_id: int = 32002, + initializer_range: float = 0.02, + use_cache: bool = True, + center_crop: bool = True, + **kwargs, + ): + self.text_decoder_name_or_path = text_decoder_name_or_path + self.tokenizer_name_or_path = tokenizer_name_or_path + + self.image_size = image_size + self.image_encoder_hidden_size = image_encoder_hidden_size + self.image_encoder_patch_size = image_encoder_patch_size + self.image_encoder_num_layers = image_encoder_num_layers + self.image_encoder_num_heads = image_encoder_num_heads + self.image_encoder_embedding_dim = image_encoder_embedding_dim + self.image_encoder_pooling = image_encoder_pooling + + self.image_pooler_num_attn_heads = image_pooler_num_attn_heads + self.image_pooler_intermediate_size = image_pooler_intermediate_size + self.image_pooler_num_latents = image_pooler_num_latents + + self.image_token_id = image_token_id + + self.initializer_range = initializer_range + self.use_cache = use_cache + self.center_crop = center_crop + + super().__init__(**kwargs) + + +class VLMPreTrainedModel(PreTrainedModel): + config_class = VLMConfig + base_model_prefix = "vlm" + supports_gradient_checkpointing = True + _no_split_modules = [] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + pass + + def _initialize_weights(self, module): + pass + + +class VLMForCausalLM(VLMPreTrainedModel): + def __init__(self, config: VLMConfig): + super().__init__(config) + + self.config = config + self.text_config = AutoConfig.from_pretrained(config.text_decoder_name_or_path) + self.text_config.vocab_size += 3 + self.text_decoder = AutoModelForCausalLM.from_config(self.text_config) + + self.image_encoder = ImageEncoder( + self.config.image_encoder_hidden_size, + self.config.image_encoder_patch_size, + self.config.image_size, + self.config.image_encoder_num_layers, + self.config.image_encoder_num_heads, + self.config.image_encoder_embedding_dim, + self.config.image_encoder_pooling, + ) + + # replace models' layerscales because `transformers` automatically renames keys in `state_dict` + for i in range(len(self.image_encoder.blocks)): + self.image_encoder.blocks[i].ls1 = LayerScale( + self.image_encoder.blocks[i].ls1.dim, + ) + self.image_encoder.blocks[i].ls2 = LayerScale( + self.image_encoder.blocks[i].ls2.dim, + ) + + self.image_pooler = ImageFeaturesPooler( + self.config.image_encoder_hidden_size, + self.text_config.hidden_size, + self.config.image_pooler_num_attn_heads, + self.config.image_pooler_intermediate_size, + self.config.image_pooler_num_latents, + self.config.initializer_range, + ) + + def get_input_embeddings(self): + return self.text_decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_decoder.set_input_embeddings(value) + + def get_images_embeddings(self, images): + features = self.image_encoder.forward_features(images) + return self.image_pooler(features) + + def gather_continuous_embeddings( + self, + input_ids: torch.Tensor, + word_embeddings: torch.Tensor, + image_embeddings: torch.Tensor, + ) -> torch.Tensor: + start_indices = (input_ids == self.config.image_token_id).nonzero()[:, 1] + embeddings = [] + + for sample_idx, start_idx 
in enumerate(start_indices.tolist()): + embeddings.append( + torch.cat( + ( + word_embeddings[sample_idx, :start_idx], + image_embeddings[sample_idx], + word_embeddings[sample_idx, start_idx + 1 :], + ), + dim=0, + ), + ) + + return torch.stack(embeddings, dim=0) + + def forward( + self, + input_ids: torch.LongTensor = None, + images: torch.Tensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[dict, Tuple, CausalLMOutputWithPast]: + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time", + ) + elif input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_is or inputs_embeds") + + if inputs_embeds is None and past_key_values is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if images is not None: + image_embeds = self.get_images_embeddings(images) + inputs_embeds = self.gather_continuous_embeddings( + input_ids, + inputs_embeds, + image_embeds, + ) + + if position_ids is None: + seq_length = inputs_embeds.shape[1] if inputs_embeds is not None else input_ids.shape[1] + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, + seq_length + past_key_values_length, + dtype=torch.long, + device=device, + ) + position_ids = position_ids.unsqueeze(0) + + outputs = self.text_decoder( + inputs_embeds=inputs_embeds, + input_ids=input_ids if past_key_values is not None else None, + attention_mask=attention_mask, + labels=labels, + position_ids=position_ids, + past_key_values=past_key_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + use_cache=use_cache, + return_dict=return_dict, + ) + + return outputs + + def prepare_inputs_for_generation( + self, + input_ids, + images=None, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + **kwargs, + ): + if past_key_values: + input_ids = input_ids[:, -1:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -1].unsqueeze(-1) + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + if images is not None: 
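+ # Note: the `update()` call below overrides this entry, so images are only forwarded on the first generation step, before any `past_key_values` are cached.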
+ model_inputs["images"] = images + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "images": images if past_key_values is None else None, + }, + ) + return model_inputs + + @classmethod + def from_config(cls, config, **kwargs): + return cls._from_config(config, **kwargs) + + +class VLMProcessor(ProcessorMixin): + def __init__(self, config, **kwargs): + self.feature_extractor = None + self.config = config + + if config.center_crop: + self.image_processor = Compose( + [ + Resize(256, interpolation=InterpolationMode.BICUBIC), + CenterCrop(config.image_size), + convert_to_rgb, + ToTensor(), + Normalize( + mean=IMAGENET_MEAN, + std=IMAGENET_STD, + ), + ], + ) + else: + self.image_processor = Compose( + [ + RandomResizedCrop( + config.image_size, + scale=(0.8, 1), + interpolation=InterpolationMode.BICUBIC, + ), + convert_to_rgb, + ToTensor(), + Normalize( + mean=IMAGENET_MEAN, + std=IMAGENET_STD, + ), + ], + ) + + self.tokenizer = AutoTokenizer.from_pretrained( + config.tokenizer_name_or_path, + additional_special_tokens=["<|im_end|>"], + ) + self.num_image_latents = config.image_pooler_num_latents + + def __call__(self, texts=None, images=None, return_tensors="pt", **kwargs): + if texts is not None: + if isinstance(texts, str): + texts = [texts] + + tokenized_texts = [] + for text in texts: + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": f" {text}"}, + ] + tokenized_prompt = self.tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + return_tensors=return_tensors, + ) + + tokenized_texts.append(tokenized_prompt) + + max_len = max(len(t[0]) for t in tokenized_texts) + input_ids = torch.full( + (len(tokenized_texts), max_len), + fill_value=self.tokenizer.pad_token_id, + dtype=torch.int64, + ) + attention_mask = torch.full( + (len(tokenized_texts), max_len), + fill_value=0, + dtype=torch.int64, + ) + + for i, tokens in enumerate(tokenized_texts): + input_ids[i, -len(tokens[0]) :] = tokens[0] + attention_mask[i, -len(tokens[0]) :] = 1 + + attention_mask = F.pad( + attention_mask, + pad=(0, self.num_image_latents - 1), + value=1, + ) + + encoding = BatchEncoding( + data={ + "input_ids": input_ids, + "attention_mask": attention_mask, + }, + ) + + if images is not None: + if isinstance(images, (list, tuple)): + image_features = torch.empty( + (len(images), 3, self.config.image_size, self.config.image_size), + dtype=torch.float32, + ) + + for i, image in enumerate(images): + image_features[i] = self.image_processor(image) + else: + image_features = self.image_processor(images).unsqueeze(0) + + if texts is not None and images is not None: + encoding["images"] = image_features + return encoding + + if texts is not None: + return encoding + + return BatchEncoding( + data={ + "images": image_features, + }, + tensor_type=return_tensors, + ) + + def batch_decode(self, *args, **kwargs): + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + return self.tokenizer.decode(*args, **kwargs) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path, + cache_dir=None, + force_download: bool = False, + local_files_only: bool = False, + token=None, + revision: str = "main", + **kwargs, + ): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, 
+ revision=revision, + token=token, + **kwargs, + ) + return cls(config) + + +AutoConfig.register("vlm", VLMConfig) +AutoModel.register(VLMConfig, VLMForCausalLM) diff --git a/python/uform/torch_models.py b/python/uform/torch_encoders.py similarity index 63% rename from python/uform/torch_models.py rename to python/uform/torch_encoders.py index ab86622..89f6631 100644 --- a/python/uform/torch_models.py +++ b/python/uform/torch_encoders.py @@ -1,11 +1,23 @@ +from __future__ import annotations + from dataclasses import dataclass from os import PathLike -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union, Mapping, Any, Tuple import torch import torch.nn as nn import torch.nn.functional as F from torch import Tensor +from PIL.Image import Image + +from uform.shared import read_config + + +def _is_on_gpu(model: nn.Module) -> bool: + try: + return next(model.parameters()).device.type == "cuda" + except StopIteration: + return False @dataclass(eq=False) @@ -132,7 +144,7 @@ def forward( @dataclass(eq=False) -class VisualEncoderBlock(nn.Module): +class ImageEncoderBlock(nn.Module): dim: int num_heads: int @@ -219,36 +231,14 @@ def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: return x - def forward_multimodal( - self, - x: Tensor, - attn_mask: Tensor, - context: Tensor, - ) -> Tensor: - context = self.context_projection(context) - expanded_attn_mask = self.get_attention_mask(attn_mask, x.dtype) - for block in self.blocks: - if block.cross_attention: - x = block(x, expanded_attn_mask, context) - - return self.pool_features(x, attn_mask) - def forward_embedding(self, x: Tensor, attn_mask: Tensor) -> Tensor: return self.embedding_projection(self.pool_features(x, attn_mask)) - def forward_matching(self, x: Tensor) -> Tensor: - logits = self.matching_head(x) - if self.head_one_neuron: - return torch.sigmoid(logits)[:, 0] - - return F.softmax(logits, dim=1)[:, 1] - def pool_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: if self.pooling == "cls": return x[:, 0] attn_mask = attn_mask.unsqueeze(2).type_as(x) - return (x * attn_mask).sum(dim=1) / attn_mask.sum(dim=1) def get_attention_mask(self, attn_mask: Tensor, dtype: torch.dtype) -> Tensor: @@ -273,7 +263,8 @@ def forward( x: Union[Tensor, dict], attention_mask: Optional[Tensor] = None, return_features: Optional[bool] = None, - ) -> Tensor: + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + if isinstance(x, dict): assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" attention_mask = x["attention_mask"] @@ -282,6 +273,11 @@ def forward( # If no attention mask is provided - create one with all ones attention_mask = torch.ones_like(x) + # If the model is on the GPU and the input matrices are not, shift them there + if _is_on_gpu(self) and not x.is_cuda: + x = x.cuda() + attention_mask = attention_mask.cuda() + features = self.forward_features(x, attention_mask) embeddings = self.forward_embedding(features, attention_mask) @@ -290,9 +286,48 @@ def forward( return features, embeddings return embeddings + def encode( + self, + x: Union[Tensor, dict], + attention_mask: Optional[Tensor] = None, + return_features: Optional[bool] = None, + ) -> Union[Tensor, Tuple[Tensor, Tensor]]: + + result = self.forward(x, attention_mask, return_features) + if isinstance(result, tuple): + return result[0].detach(), result[1].detach() + else: + return result.detach() + + @staticmethod + def from_pretrained(config: Union[PathLike, str, object], model: Union[PathLike, str]) -> 
TextEncoder: + """Load the image encoder from the given configuration and model path. + + :param config: the configuration dictionary or path to the JSON configuration file + :param model: the model state dictionary or path to the `.pt` model file + """ + config = read_config(config) + if "text_encoder" in config: + config = config["text_encoder"] + + # We must strip all the non-member attributes before initializing the classes. + text_fields = TextEncoder.__dataclass_fields__ + config = {k: v for k, v in config.items() if k in text_fields} + encoder = TextEncoder(**config) + + # Load from disk + if isinstance(model, (PathLike, str)): + state = torch.load(model) + else: + state = model + if "text_encoder" in state: + state = state["text_encoder"] + encoder.load_state_dict(state) + return encoder + @dataclass(eq=False) -class VisualEncoder(nn.Module): +class ImageEncoder(nn.Module): dim: int patch_size: int image_size: int @@ -314,26 +349,23 @@ def __post_init__(self): self.reg_token = nn.Parameter(torch.zeros(1, self.num_reg_tokens, self.dim)) self.blocks = nn.Sequential( - *[VisualEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)], + *[ImageEncoderBlock(self.dim, self.num_heads) for _ in range(self.num_layers)], ) self.norm = nn.LayerNorm(self.dim, eps=1e-6) self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) self.return_features = False - def forward_features(self, x: Tensor) -> Tensor: + def forward_features(self, x: Union[Tensor, dict]) -> Tensor: x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1) x = x + self.pos_embed - special_tokens = [self.cls_token.expand(x.shape[0], -1, -1)] if self.num_reg_tokens > 0: special_tokens.append(self.reg_token.expand(x.shape[0], -1, -1)) x = torch.cat(special_tokens + [x], dim=1) - x = self.blocks(x) - return self.norm(x) def forward_embedding(self, x: Tensor) -> Tensor: @@ -344,7 +376,14 @@ def forward_embedding(self, x: Tensor) -> Tensor: return self.embedding_projection(x) - def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: + def forward(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: + if isinstance(x, dict): + x = x["images"] + + # If the model is on the GPU and the input matrices are not, shift them there + if _is_on_gpu(self) and not x.is_cuda: + x = x.cuda() + features = self.forward_features(x) embeddings = self.forward_embedding(features) return_features = return_features if return_features is not None else self.return_features @@ -352,154 +391,38 @@ def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: return features, embeddings return embeddings + def encode(self, x: Union[Tensor, dict], return_features: Optional[bool] = None) -> Tensor: + result = self.forward(x, return_features) + if isinstance(result, tuple): + return result[0].detach(), result[1].detach() + else: + return result.detach() -class VLM(nn.Module): - """ - Vision-Language Model for Multimodal embeddings. 
- """ - - def __init__(self, config: Dict, tokenizer_path: PathLike): - """ - :param config: Model config - """ - - super().__init__() - self._embedding_dim = config["text_encoder"]["embedding_dim"] - - self.text_encoder = TextEncoder(**config["text_encoder"]) - self.image_encoder = VisualEncoder(**config["image_encoder"]) - - def encode_image( - self, - images: Tensor, - return_features: bool = False, - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Passes the pre-processed images through `image_encoder` to produce images features (optional) and embeddings. - - :param images: Preprocessed image - :param return_features: Whether to return images features or return only embeddings - """ - - features = self.image_encoder.forward_features(images) - embeddings = self.image_encoder.forward_embedding(features) - - if return_features: - return features, embeddings - - return embeddings - - def encode_text( - self, - texts: Dict[str, Tensor], - return_features: bool = False, - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Passes the pre-processed texts through `text_encoder` to produce texts features (optional) and embeddings. - - :param texts: Dictionary with tokenized texts and attention masks - :param return_features: Whether to return texts features or return only embeddings - """ - - features = self.text_encoder.forward_features( - texts["input_ids"], - texts["attention_mask"], - ) - embeddings = self.text_encoder.forward_embedding( - features, - texts["attention_mask"], - ) - - if return_features: - return features, embeddings - - return embeddings - - def encode_multimodal( - self, - image: Optional[Tensor] = None, - text: Optional[Dict] = None, - image_features: Optional[Tensor] = None, - text_features: Optional[Tensor] = None, - attention_mask: Optional[Tensor] = None, - return_scores: bool = False, - ) -> Union[Tensor, Tuple[Tensor, Tensor]]: - """Passes preprocessed texts (or precomputed texts features) and - preprocessed images (or precomputed images features) through multimodal encoded to produce multimodal joint embeddings. 
- - :param image: Preprocessed images - :param text: Preprocessed texts - :param image_features: Precomputed images features - :param text_features: Precomputed text features - :param attention_mask: Attention masks, not required if pass `text` instead of text_features - """ - - assert image is not None or image_features is not None, "Either `image` or `image_features` should be non None" - assert text is not None or text_features is not None, "Either `text_data` or `text_features` should be non None" - - if text_features is not None: - assert attention_mask is not None, "if `text_features` is not None, then you should pass `attention_mask`" - - if image_features is None: - image_features = self.image_encoder.forward_features(image) - - if text_features is None: - text_features = self.text_encoder.forward_features( - text["input_ids"], - text["attention_mask"], - ) - - embeddings = self.text_encoder.forward_multimodal( - text_features, - attention_mask if attention_mask is not None else text["attention_mask"], - image_features, - ) - - if return_scores: - return self.get_matching_scores(embeddings), embeddings - - return embeddings - - def get_matching_scores(self, embeddings: Tensor) -> Tensor: - """Computes the probability that there is a match between images and texts based on their multimodal embeddings - - :param embeddings: multimodal joint embeddings - """ - - return self.text_encoder.forward_matching(embeddings) + @staticmethod + def from_pretrained( + config: Union[PathLike, str, object], + model: Union[PathLike, str, Mapping[str, Any]], + ) -> ImageEncoder: + """Load the image encoder from the given configuration and model path. - def forward( - self, - images: Tensor, - texts: Dict[str, Tensor], - ) -> Union[Tensor, Tensor]: - """Inference forward method - - :param images: Preprocessed images - :param texts: Preprocessed texts - :return: embeddings for images and texts + :param config: the configuration dictionary or path to the JSON configuration file + :param model: the model state dictionary or path to the `.pt` model file """ - _, image_embeddings = self.image_encoder(images) - _, text_embeddings = self.text_encoder(texts) - return image_embeddings, text_embeddings - - @property - def text_features_dim(self) -> int: - """Dimensionality of the text encoder features.""" - - return self.text_encoder.dim - - @property - def image_features_dim(self) -> int: - """Dimensionality of the image encoder features.""" - - return self.image_encoder.dim - - @property - def embedding_dim(self) -> int: - """Dimensionality of shared space embedding.""" - - return self._embedding_dim - - @property - def multimodal_embedding_dim(self) -> int: - """Dimensionality of multimodal joint embedding.""" - return self.text_encoder.dim + config = read_config(config) + if "image_encoder" in config: + config = config["image_encoder"] + + # We must strip all the non-member attributes before initializing the classes. 
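+ # (the image_encoder config may also carry pre-processing keys, e.g. the normalization constants read by ImageProcessor, which are not ImageEncoder dataclass fields)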
+ image_fields = ImageEncoder.__dataclass_fields__ + config = {k: v for k, v in config.items() if k in image_fields} + encoder = ImageEncoder(**config) + + # Load from disk + if isinstance(model, (PathLike, str)): + state = torch.load(model) + else: + state = model + if "image_encoder" in state: + state = state["image_encoder"] + encoder.load_state_dict(state) + return encoder diff --git a/python/uform/torch_preprocessor.py b/python/uform/torch_processors.py similarity index 57% rename from python/uform/torch_preprocessor.py rename to python/uform/torch_processors.py index 8bdc70b..79c7e87 100644 --- a/python/uform/torch_preprocessor.py +++ b/python/uform/torch_processors.py @@ -1,5 +1,6 @@ from os import PathLike -from typing import Dict, List, Union +from typing import Dict, List, Union, Sequence +import json import torch from PIL.Image import Image @@ -14,43 +15,35 @@ ToTensor, ) +from uform.shared import read_config -# lambda is not pickable + +# lambda is not pickle-able def convert_to_rgb(image): return image.convert("RGB") -class TorchProcessor: - def __init__(self, config: Dict, tokenizer_path: PathLike): +class TextProcessor: + def __init__(self, config_path: PathLike, tokenizer_path: PathLike): """ :param config: model config :param tokenizer_path: path to tokenizer file - :param tensor_type: which tensors to return, either pt (PyTorch) or np (NumPy) """ - self._image_size = config["image_encoder"]["image_size"] - self._max_seq_len = config["text_encoder"]["max_position_embeddings"] + config = read_config(config_path) + if "text_encoder" in config: + config = config["text_encoder"] + + self._max_seq_len = config["max_position_embeddings"] self._tokenizer = Tokenizer.from_file(tokenizer_path) self._tokenizer.no_padding() - self._pad_token_idx = config["text_encoder"]["padding_idx"] - - self._image_transform = Compose( - [ - Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), - convert_to_rgb, - CenterCrop(self._image_size), - ToTensor(), - Normalize( - mean=(0.48145466, 0.4578275, 0.40821073), - std=(0.26862954, 0.26130258, 0.27577711), - ), - ], - ) + self._pad_token_idx = config["padding_idx"] - def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: + def __call__(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: """Transforms one or more strings into dictionary with tokenized strings and attention masks. 
:param texts: text of list of texts to tokenizer + :return: dictionary with tokenized strings and attention masks as values """ if isinstance(texts, str): texts = [texts] @@ -77,13 +70,46 @@ def preprocess_text(self, texts: Union[str, List[str]]) -> Dict[str, Tensor]: return {"input_ids": input_ids, "attention_mask": attention_mask} - def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor: + +class ImageProcessor: + def __init__(self, config_path: PathLike): + """ + :param config: model config + """ + + config = read_config(config_path) + if "image_encoder" in config: + config = config["image_encoder"] + + self._image_size = config["image_size"] + self._normalization_means = config["normalization_means"] + self._normalization_deviations = config["normalization_deviations"] + + assert isinstance(self._image_size, int) and self._image_size > 0 + assert isinstance(self._normalization_means, list) and isinstance(self._normalization_deviations, list) + assert len(self._normalization_means) == len(self._normalization_deviations) == 3 + + self._image_transform = Compose( + [ + Resize(self._image_size, interpolation=InterpolationMode.BICUBIC), + convert_to_rgb, + CenterCrop(self._image_size), + ToTensor(), + Normalize( + mean=tuple(self._normalization_means), + std=tuple(self._normalization_deviations), + ), + ], + ) + + def __call__(self, images: Union[Image, Sequence[Image]]) -> Dict[str, Tensor]: """Transforms one or more Pillow images into Torch Tensors. :param images: image or list of images to preprocess + :return: dictionary with float-represented images in tensors as values """ - if isinstance(images, list): + if isinstance(images, Sequence): batch_images = torch.empty( (len(images), 3, self._image_size, self._image_size), dtype=torch.float32, @@ -95,4 +121,4 @@ def preprocess_image(self, images: Union[Image, List[Image]]) -> Tensor: else: batch_images = self._image_transform(images).unsqueeze(0) - return batch_images + return {"images": batch_images} diff --git a/swift/Embeddings.swift b/swift/Embeddings.swift deleted file mode 100644 index 6d973ac..0000000 --- a/swift/Embeddings.swift +++ /dev/null @@ -1,403 +0,0 @@ -// -// Embeddings.swift -// -// -// Created by Ash Vardanian on 3/27/24. 
-// -import Accelerate -import CoreGraphics -import CoreML -import Foundation -import Hub // `Config` -import Tokenizers // `AutoTokenizer` - -public enum Embedding { - case i32s([Int32]) - case f16s([Float16]) - case f32s([Float32]) - case f64s([Float64]) - - init?(from multiArray: MLMultiArray) { - switch multiArray.dataType { - case .float64: - self = .f64s( - Array( - UnsafeBufferPointer( - start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self), - count: Int(truncating: multiArray.shape[1]) - ) - ) - ) - case .float32: - self = .f32s( - Array( - UnsafeBufferPointer( - start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self), - count: Int(truncating: multiArray.shape[1]) - ) - ) - ) - case .float16: - self = .f16s( - Array( - UnsafeBufferPointer( - start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self), - count: Int(truncating: multiArray.shape[1]) - ) - ) - ) - case .int32: - self = .i32s( - Array( - UnsafeBufferPointer( - start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self), - count: Int(truncating: multiArray.shape[1]) - ) - ) - ) - @unknown default: - return nil // return nil for unsupported data types - } - } - - public func asFloats() -> [Float] { - switch self { - case .f32s(let array): - return array - case .i32s(let array): - return array.map { Float($0) } - case .f16s(let array): - return array.map { Float($0) } - case .f64s(let array): - return array.map { Float($0) } - } - } -} - -// MARK: - Helpers - -func readConfig(fromPath path: String) throws -> [String: Any] { - // If it's not an absolute path, let's assume it's a path relative to the current working directory - let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path - let data = try Data(contentsOf: URL(fileURLWithPath: absPath)) - return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any] -} - -func readModel(fromURL modelURL: URL) throws -> MLModel { - let compiledModelURL = try MLModel.compileModel(at: modelURL) - return try MLModel(contentsOf: compiledModelURL) -} - -func readModel(fromPath path: String) throws -> MLModel { - // If it's not an absolute path, let's assume it's a path relative to the current working directory - let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path - let modelURL = URL(fileURLWithPath: absPath, isDirectory: true) - return try readModel(fromURL: modelURL) -} - -// MARK: - Encoders - -public class TextEncoder { - let model: MLModel - let processor: TextProcessor - - public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws { - let finalConfigPath = configPath ?? modelPath + "/config.json" - let finalTokenizerPath = tokenizerPath ?? 
modelPath + "/tokenizer.json" - self.model = try readModel(fromPath: modelPath) - self.processor = try TextProcessor(configPath: finalConfigPath, tokenizerPath: finalTokenizerPath, model: self.model) - } - - - public init(modelName: String, hubApi: HubApi = .shared) async throws { - let repo = Hub.Repo(id: modelName) - let modelURL = try await hubApi.snapshot(from: repo, matching: ["text.mlpackage/*", "config.json", "tokenizer.json"]) - let configPath = modelURL.appendingPathComponent("config.json").path - let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("text.mlpackage", isDirectory: true)) - self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model) - } - - public func forward(with text: String) throws -> Embedding { - let inputFeatureProvider = try self.processor.preprocess(text) - let prediction = try self.model.prediction(from: inputFeatureProvider) - guard let predictionFeature = prediction.featureValue(for: "embeddings"), - let output = predictionFeature.multiArrayValue, - let embedding = Embedding(from: output) - else { - throw NSError( - domain: "TextEncoder", - code: 0, - userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."] - ) - } - return embedding - } -} - -public class ImageEncoder { - let model: MLModel - let processor: ImageProcessor - - public init(modelPath: String, configPath: String? = nil) throws { - let finalConfigPath = configPath ?? modelPath + "/config.json" - self.model = try readModel(fromPath: modelPath) - self.processor = try ImageProcessor(configPath: finalConfigPath) - } - - public init(modelName: String, hubApi: HubApi = .shared) async throws { - let repo = Hub.Repo(id: modelName) - let modelURL = try await hubApi.snapshot(from: repo, matching: ["image.mlpackage/*", "config.json"]) - let configPath = modelURL.appendingPathComponent("config.json").path - self.model = try readModel(fromURL: modelURL.appendingPathComponent("image.mlpackage", isDirectory: true)) - self.processor = try ImageProcessor(configPath: configPath) - } - - public func forward(with image: CGImage) throws -> Embedding { - let inputFeatureProvider = try self.processor.preprocess(image) - let prediction = try self.model.prediction(from: inputFeatureProvider) - guard let predictionFeature = prediction.featureValue(for: "embeddings"), - let output = predictionFeature.multiArrayValue, - let embedding = Embedding(from: output) - else { - throw NSError( - domain: "ImageEncoder", - code: 0, - userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."] - ) - } - return embedding - } -} - -// MARK: - Processors - -class TextProcessor { - let tokenizer: Tokenizer - let minContextLength: Int - let maxContextLength: Int - - public init(configPath: String, tokenizerPath: String, model: MLModel) throws { - var configDict = try readConfig(fromPath: configPath) - let tokenizerDict = try readConfig(fromPath: tokenizerPath) - - // Check if there's a specific 'text_encoder' configuration within the main configuration - if let textEncoderConfig = configDict["text_encoder"] as? 
[String: Any] { - configDict = textEncoderConfig // Use the specific 'text_encoder' configuration - } - - let config = Config(configDict) - let tokenizerData = Config(tokenizerDict) - self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData) - - let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"] - guard let shapeConstraint = inputDescription?.multiArrayConstraint?.shapeConstraint else { - fatalError("Cannot obtain shape information") - } - - switch shapeConstraint.type { - case .enumerated: - minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue - maxContextLength = minContextLength - case .range: - let range = inputDescription?.multiArrayConstraint?.shapeConstraint.sizeRangeForDimension[1] as? NSRange - minContextLength = range?.location ?? 1 - maxContextLength = range?.length ?? 128 - case .unspecified: - minContextLength = 128 - maxContextLength = 128 - @unknown default: - minContextLength = 128 - maxContextLength = 128 - } - } - - public func preprocess(_ text: String) throws -> MLFeatureProvider { - let inputIDs = self.tokenizer.encode(text: text) - return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength) - } -} - -class ImageProcessor { - let imageSize: Int - let mean: [Float] = [0.485, 0.456, 0.406] // Common mean values for normalization - let std: [Float] = [0.229, 0.224, 0.225] // Common std values for normalization - - init(configPath: String) throws { - var configDict = try readConfig(fromPath: configPath) - // Check if there's a specific 'image_encoder' configuration within the main configuration - if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] { - configDict = imageEncoderConfig - } - - let config = Config(configDict) - self.imageSize = config.imageSize!.intValue! - } - - func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider { - // Populate a tensor of size 3 x `imageSize` x `imageSize`, - // by resizing the image, then performing a center crop. - // Then normalize with the `mean` and `std` and export as a provider. - let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize)! - let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)! - let featureValue = MLFeatureValue(multiArray: normalized) - return try ImageInput(precomputedFeature: featureValue) - } - - private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? { - let originalWidth = CGFloat(image.width) - let originalHeight = CGFloat(image.height) - - // Calculate new size preserving the aspect ratio - let widthRatio = CGFloat(imageSize) / originalWidth - let heightRatio = CGFloat(imageSize) / originalHeight - let scaleFactor = max(widthRatio, heightRatio) - - let scaledWidth = originalWidth * scaleFactor - let scaledHeight = originalHeight * scaleFactor - - // Calculate the crop rectangle - let dx = (scaledWidth - CGFloat(imageSize)) / 2.0 - let dy = (scaledHeight - CGFloat(imageSize)) / 2.0 - guard - let context = CGContext( - data: nil, - width: imageSize, - height: imageSize, - bitsPerComponent: image.bitsPerComponent, - bytesPerRow: 0, - space: image.colorSpace ?? 
CGColorSpaceCreateDeviceRGB(), - bitmapInfo: image.bitmapInfo.rawValue - ) - else { return nil } - - // Draw the scaled and cropped image in the context - context.interpolationQuality = .high - context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight)) - return context.makeImage() - } - - private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? { - let width = image.width - let height = image.height - - // Prepare the bitmap context for drawing the image. - var pixelData = [UInt8](repeating: 0, count: width * height * 4) - let colorSpace = CGColorSpaceCreateDeviceRGB() - let context = CGContext( - data: &pixelData, - width: width, - height: height, - bitsPerComponent: 8, - bytesPerRow: 4 * width, - space: colorSpace, - bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue - ) - context?.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) - - // Normalize the pixel data - var floatPixels = [Float](repeating: 0, count: width * height * 3) - for c in 0 ..< 3 { - for i in 0 ..< (width * height) { - floatPixels[i * 3 + c] = (Float(pixelData[i * 4 + c]) / 255.0 - mean[c]) / std[c] - } - } - - // Create the tensor array - var tensor = [Float](repeating: 0, count: 3 * width * height) - for i in 0 ..< (width * height) { - for c in 0 ..< 3 { - tensor[c * width * height + i] = floatPixels[i * 3 + c] - } - } - - let multiArray = try? MLMultiArray( - shape: [1, 3, NSNumber(value: height), NSNumber(value: width)], - dataType: .float32 - ) - for i in 0 ..< tensor.count { - multiArray?[i] = NSNumber(value: tensor[i]) - } - return multiArray - } - -} - -// MARK: - Feature Providers - -class TextInput: MLFeatureProvider { - var inputIDs: [Int] - var sequenceLength: Int - var paddingID: Int - - init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) { - self.inputIDs = inputIDs - self.sequenceLength = sequenceLength - self.paddingID = paddingID - } - - var featureNames: Set { - return Set(["input_ids", "attention_mask"]) - } - - // The model expects the input IDs to be an array of integers - // of length `sequenceLength`, padded with `paddingID` if necessary - func featureValue(for featureName: String) -> MLFeatureValue? { - switch featureName { - case "input_ids", "attention_mask": - return createFeatureValue(for: featureName) - default: - return nil - } - } - - private func createFeatureValue(for featureName: String) -> MLFeatureValue? { - let count = min(inputIDs.count, sequenceLength) - let totalElements = sequenceLength - guard let multiArray = try? 
MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else { - return nil - } - - if featureName == "input_ids" { - for i in 0 ..< count { - multiArray[i] = NSNumber(value: inputIDs[i]) - } - for i in count ..< totalElements { - multiArray[i] = NSNumber(value: paddingID) - } - } - else if featureName == "attention_mask" { - for i in 0 ..< count { - multiArray[i] = NSNumber(value: 1) - } - for i in count ..< totalElements { - multiArray[i] = NSNumber(value: 0) - } - } - - return MLFeatureValue(multiArray: multiArray) - } -} - -class ImageInput: MLFeatureProvider { - var precomputedFeature: MLFeatureValue - - init(precomputedFeature: MLFeatureValue) throws { - self.precomputedFeature = precomputedFeature - } - - var featureNames: Set { - return Set(["input"]) - } - - // The model expects the input IDs to be an array of integers - // of length `sequenceLength`, padded with `paddingID` if necessary - func featureValue(for featureName: String) -> MLFeatureValue? { - switch featureName { - case "input": - return precomputedFeature - default: - return nil - } - } -} diff --git a/swift/Encoders.swift b/swift/Encoders.swift new file mode 100644 index 0000000..509ad11 --- /dev/null +++ b/swift/Encoders.swift @@ -0,0 +1,505 @@ +// +// Embeddings.swift +// +// +// Created by Ash Vardanian on 3/27/24. +// +import Accelerate +import CoreGraphics +import CoreML +import Foundation +import Hub // `Config` +import Tokenizers // `AutoTokenizer` + +/// Defines custom errors related to the encoder's functionality. +enum EncoderError: Error { + case downloadError(String) + case loadingError(String) + case invalidInput(String) + case modelPredictionFailed(String) + case unknownError(String) +} + +/// Represents different types of embeddings as arrays of different numeric types. +public enum Embedding { + case i32s([Int32]) + case f16s([Float16]) + case f32s([Float32]) + case f64s([Float64]) + + /// Initializes an embedding from a `MLMultiArray`. + /// - Parameter multiArray: The MLMultiArray to convert into an Embedding. + /// - Returns: nil if the data type is unsupported. + init?(from multiArray: MLMultiArray) { + switch multiArray.dataType { + case .float64: + self = .f64s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + case .float32: + self = .f32s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + case .float16: + self = .f16s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + case .int32: + self = .i32s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + @unknown default: + return nil + } + } + + /// Converts the embedding to an array of `Float`. + public func asFloats() -> [Float] { + switch self { + case .f32s(let array): return array + case .i32s(let array): return array.map(Float.init) + case .f16s(let array): return array.map(Float.init) + case .f64s(let array): return array.map(Float.init) + } + } +} + +/// Provides methods for reading and handling configurations and models. +/// - Parameter path: The file path where the configuration file is located. +/// - Returns: A dictionary containing the configuration data. 
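+/// - Throws: An error if the file cannot be read or its contents cannot be parsed as JSON.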
+func readConfig(fromPath path: String) throws -> [String: Any] { + let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path + let data = try Data(contentsOf: URL(fileURLWithPath: absPath)) + return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any] +} + +/// Compiles and loads a machine learning model from a URL. +/// - Parameter modelURL: The URL where the model package is located. +/// - Returns: An instance of `MLModel`. +func readModel(fromURL modelURL: URL) throws -> MLModel { + let compiledModelURL = try MLModel.compileModel(at: modelURL) + return try MLModel(contentsOf: compiledModelURL) +} + +/// Loads a machine learning model from a local file path. +/// - Parameter path: The file path where the model file is located. +/// - Returns: An instance of `MLModel`. +func readModel(fromPath path: String) throws -> MLModel { + let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path + let modelURL = URL(fileURLWithPath: absPath, isDirectory: true) + return try readModel(fromURL: modelURL) +} + +/// Encodes text input into embeddings using a machine learning model. +public class TextEncoder { + let model: MLModel + let processor: TextProcessor + + /// Initializes a `TextEncoder` using paths for the model and configuration. + /// - Parameters: + /// - modelPath: The path to the directory containing the machine learning model. + /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory. + /// - tokenizerPath: Optional. The path to the tokenizer file. Defaults to tokenizer.json in the model directory. + public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws { + let finalConfigPath = configPath ?? modelPath + "/config.json" + let finalTokenizerPath = tokenizerPath ?? modelPath + "/tokenizer.json" + self.model = try readModel(fromPath: modelPath) + self.processor = try TextProcessor( + configPath: finalConfigPath, + tokenizerPath: finalTokenizerPath, + model: self.model + ) + } + + /// Initializes a `TextEncoder` using a model name and an API for fetching models. + /// - Parameters: + /// - modelName: The identifier for the model repository. + /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance. + public init(modelName: String, hubApi: HubApi = .shared) async throws { + let repo = Hub.Repo(id: modelName) + let modelURL = try await hubApi.snapshot( + from: repo, + matching: ["text_encoder.mlpackage/*", "config.json", "tokenizer.json"] + ) + let configPath = modelURL.appendingPathComponent("config.json").path + let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path + self.model = try readModel( + fromURL: modelURL.appendingPathComponent("text_encoder.mlpackage", isDirectory: true) + ) + self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model) + } + + /// Processes text and returns embeddings. Throws an error if processing fails. + /// - Parameter text: The text input to encode. + /// - Returns: An `Embedding` object containing the model output. + public func encode(_ text: String) throws -> Embedding { + let inputFeatureProvider = try self.processor.preprocess(text) + guard let prediction = try? 
self.model.prediction(from: inputFeatureProvider), + let predictionFeature = prediction.featureValue(for: "embeddings"), + let output = predictionFeature.multiArrayValue, + let embedding = Embedding(from: output) + else { + throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.") + } + return embedding + } +} + +/// Encodes image input into embeddings using a machine learning model. +public class ImageEncoder { + let model: MLModel + let processor: ImageProcessor + + /// Initializes an `ImageEncoder` using a path for the model and optionally a configuration file. + /// - Parameters: + /// - modelPath: The path to the directory containing the machine learning model. + /// - configPath: Optional. The path to the configuration file. Defaults to config.json in the model directory. + public init(modelPath: String, configPath: String? = nil) throws { + let finalConfigPath = configPath ?? modelPath + "/config.json" + self.model = try readModel(fromPath: modelPath) + self.processor = try ImageProcessor(configPath: finalConfigPath) + } + + /// Initializes an `ImageEncoder` using a model name and an API for fetching models. + /// - Parameters: + /// - modelName: The identifier for the model repository. + /// - hubApi: The API object to interact with the model hub. Defaults to a shared instance. + public init(modelName: String, hubApi: HubApi = .shared) async throws { + let repo = Hub.Repo(id: modelName) + let modelURL = try await hubApi.snapshot(from: repo, matching: ["image_encoder.mlpackage/*", "config.json"]) + let configPath = modelURL.appendingPathComponent("config.json").path + self.model = try readModel( + fromURL: modelURL.appendingPathComponent("image_encoder.mlpackage", isDirectory: true) + ) + self.processor = try ImageProcessor(configPath: configPath) + } + + /// Processes an image and returns embeddings. Throws an error if processing fails. + /// - Parameter image: The `CGImage` to encode. + /// - Returns: An `Embedding` object containing the model output. + public func encode(_ image: CGImage) throws -> Embedding { + let inputFeatureProvider = try self.processor.preprocess(image) + guard let prediction = try? self.model.prediction(from: inputFeatureProvider), + let predictionFeature = prediction.featureValue(for: "embeddings"), + let output = predictionFeature.multiArrayValue, + let embedding = Embedding(from: output) + else { + throw EncoderError.modelPredictionFailed("Failed to extract embeddings or unsupported data type.") + } + return embedding + } +} + +// MARK: - Processors + +/// Handles the preprocessing of text data to be used by a machine learning model. +class TextProcessor { + let tokenizer: Tokenizer + let minContextLength: Int + let maxContextLength: Int + + /// Initializes a `TextProcessor` with specific configuration. + /// - Parameters: + /// - configPath: The path to the configuration file specifying tokenizer and model configurations. + /// - tokenizerPath: The path to the tokenizer configuration. + /// - model: The machine learning model to be used with this processor. + /// - Throws: An error if the configuration is invalid or missing necessary components. + public init(configPath: String, tokenizerPath: String, model: MLModel) throws { + var configDict = try readConfig(fromPath: configPath) + let tokenizerDict = try readConfig(fromPath: tokenizerPath) + + // Check if there's a specific 'text_encoder' configuration within the main configuration + if let textEncoderConfig = configDict["text_encoder"] as? 
[String: Any] { + configDict = textEncoderConfig // Use the specific 'text_encoder' configuration + } + + // Initialize the tokenizer with its configuration. + let config = Config(configDict) + let tokenizerData = Config(tokenizerDict) + self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData) + + // Extract the model's input shape constraints. + guard let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"], + let multiArrayConstraint = inputDescription.multiArrayConstraint + else { + throw EncoderError.invalidInput("Cannot obtain shape information from the model.") + } + + // Determine the context length constraints based on the model's input shape constraint. + let shapeConstraint = multiArrayConstraint.shapeConstraint + switch shapeConstraint.type { + case .enumerated: + minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue + maxContextLength = minContextLength + case .range: + guard let range = shapeConstraint.sizeRangeForDimension[1] as? NSRange else { + throw EncoderError.unknownError("Model input shape has a range constraint that cannot be interpreted.") + } + minContextLength = range.location + maxContextLength = range.length + case .unspecified: + throw EncoderError.unknownError("Model input shape is unspecified.") + @unknown default: + throw EncoderError.unknownError("Unknown model input shape constraint type.") + } + } + + /// Preprocesses a string of text into a format suitable for model prediction. + /// - Parameter text: The text to preprocess. + /// - Returns: A `MLFeatureProvider` containing the processed text ready for the model. + /// - Throws: An error if the text encoding fails. + public func preprocess(_ text: String) throws -> MLFeatureProvider { + let inputIDs = self.tokenizer.encode(text: text) + return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength) + } +} + +/// Handles the preprocessing of image data to be used by a machine learning model. +class ImageProcessor { + let imageSize: Int + let mean: [Float] + let std: [Float] + + /// Initializes an `ImageProcessor` with specific configuration. + /// - Parameter configPath: The path to the configuration file specifying image size, mean, and std. + init(configPath: String) throws { + var configDict = try readConfig(fromPath: configPath) + if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] { + configDict = imageEncoderConfig + } + + let config = Config(configDict) + guard let imageSize = config.imageSize?.value as? Int else { + throw EncoderError.invalidInput("Invalid or missing image size.") + } + self.imageSize = imageSize + + guard let meanArray = config.normalizationMeans?.value as? [Any], + let stdArray = config.normalizationDeviations?.value as? [Any] + else { + throw EncoderError.invalidInput("Normalization means or deviations are missing.") + } + + self.mean = try meanArray.compactMap({ + guard let doubleValue = $0 as? Double else { + throw EncoderError.invalidInput("Normalization means should be an array of floats.") + } + return Float(doubleValue) + }) + + self.std = try stdArray.compactMap({ + guard let doubleValue = $0 as? 
Double else { + throw EncoderError.invalidInput("Normalization deviations should be an array of floats.") + } + return Float(doubleValue) + }) + + // Check if the arrays have 3 values for the 3 channels + if self.mean.count != 3 || self.std.count != 3 { + throw EncoderError.invalidInput("Normalization means should contain 3 values.") + } + } + + /// Preprocesses a `CGImage` into a format suitable for model prediction. + /// - Parameter cgImage: The image to preprocess. + /// - Returns: An `MLFeatureProvider` containing the preprocessed image data. + func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider { + guard let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize), + let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std) + else { + throw EncoderError.invalidInput("Image preprocessing failed.") + } + let featureValue = MLFeatureValue(multiArray: normalized) + return try ImageInput(precomputedFeature: featureValue) + } + + private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? { + let originalWidth = CGFloat(image.width) + let originalHeight = CGFloat(image.height) + + let widthRatio = CGFloat(imageSize) / originalWidth + let heightRatio = CGFloat(imageSize) / originalHeight + let scaleFactor = max(widthRatio, heightRatio) + + let scaledWidth = originalWidth * scaleFactor + let scaledHeight = originalHeight * scaleFactor + + let dx = (scaledWidth - CGFloat(imageSize)) / 2.0 + let dy = (scaledHeight - CGFloat(imageSize)) / 2.0 + guard + let context = CGContext( + data: nil, + width: imageSize, + height: imageSize, + bitsPerComponent: image.bitsPerComponent, + bytesPerRow: 0, + space: image.colorSpace ?? CGColorSpaceCreateDeviceRGB(), + bitmapInfo: image.bitmapInfo.rawValue + ) + else { return nil } + + // Draw the scaled and cropped image in the context + context.interpolationQuality = .high + context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight)) + return context.makeImage() + } + + private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? { + let width = image.width + let height = image.height + + // Prepare the bitmap context for drawing the image. 
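+ // Note: the bitmap context below renders the image as interleaved 8-bit RGBA (4 bytes per pixel, row-major);
+ // only the three RGB channels are read later, when the tensor is filled.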
+ var pixelData = [UInt8](repeating: 0, count: width * height * 4) + let colorSpace = CGColorSpaceCreateDeviceRGB() + guard + let context = CGContext( + data: &pixelData, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: 4 * width, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) + else { return nil } + context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) + + // While normalizing the pixels, let's also transpose them from HWC to CHW + let channelSize = width * height + var floatPixels = [Float](repeating: 0, count: channelSize * 3) + for i in 0 ..< channelSize { + floatPixels[channelSize * 0 + i] = (Float(pixelData[i * 4 + 0]) / 255.0 - mean[0]) / std[0] + floatPixels[channelSize * 1 + i] = (Float(pixelData[i * 4 + 1]) / 255.0 - mean[1]) / std[1] + floatPixels[channelSize * 2 + i] = (Float(pixelData[i * 4 + 2]) / 255.0 - mean[2]) / std[2] + } + + // We need to wrap the constructor that may fail + do { + let tensor = try MLMultiArray( + shape: [1, 3, NSNumber(value: height), NSNumber(value: width)], + dataType: .float32 + ) + for i in 0 ..< floatPixels.count { + tensor[i] = NSNumber(value: floatPixels[i]) + } + return tensor + } + catch { + return nil + } + } +} + +// MARK: - Feature Providers + +/// Provides features for text input to a machine learning model, handling padding and attention mask generation. +class TextInput: MLFeatureProvider { + var inputIDs: [Int] + var sequenceLength: Int + var paddingID: Int + + /// Initializes a new instance for providing text input features. + /// - Parameters: + /// - inputIDs: Array of integer IDs representing the encoded text. + /// - sequenceLength: The fixed length to which the input sequence should be padded. + /// - paddingID: The integer ID used for padding shorter sequences. Defaults to 0. + init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) { + self.inputIDs = inputIDs + self.sequenceLength = sequenceLength + self.paddingID = paddingID + } + + var featureNames: Set { + return Set(["input_ids", "attention_mask"]) + } + + /// Returns the feature value for the specified feature name. + /// - Parameter featureName: The name of the feature for which the value is requested. + /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature. + func featureValue(for featureName: String) -> MLFeatureValue? { + switch featureName { + case "input_ids", "attention_mask": + return createFeatureValue(for: featureName) + default: + return nil + } + } + + /// Creates the feature value for input IDs or attention mask based on the specified feature name. + /// - Parameter featureName: The name of the feature. + /// - Returns: An `MLFeatureValue` if the array can be created, otherwise nil. + private func createFeatureValue(for featureName: String) -> MLFeatureValue? { + let count = min(inputIDs.count, sequenceLength) + let totalElements = sequenceLength + guard let multiArray = try? 
MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else { + return nil + } + + if featureName == "input_ids" { + for i in 0 ..< count { + multiArray[i] = NSNumber(value: inputIDs[i]) + } + for i in count ..< totalElements { + multiArray[i] = NSNumber(value: paddingID) + } + } + else if featureName == "attention_mask" { + for i in 0 ..< count { + multiArray[i] = NSNumber(value: 1) + } + for i in count ..< totalElements { + multiArray[i] = NSNumber(value: 0) + } + } + + return MLFeatureValue(multiArray: multiArray) + } +} + +/// Provides a precomputed feature for image inputs to a machine learning model. +class ImageInput: MLFeatureProvider { + var precomputedFeature: MLFeatureValue + + /// Initializes a new instance with a precomputed feature. + /// - Parameter precomputedFeature: The `MLFeatureValue` containing the precomputed feature data. + /// - Throws: An error if the precomputed feature is not valid for the model. + init(precomputedFeature: MLFeatureValue) throws { + self.precomputedFeature = precomputedFeature + } + + var featureNames: Set { + return Set(["images"]) + } + + /// Returns the feature value for the specified feature name. + /// - Parameter featureName: The name of the feature for which the value is requested. + /// - Returns: An optional `MLFeatureValue` containing the data for the specified feature. + func featureValue(for featureName: String) -> MLFeatureValue? { + switch featureName { + case "images": + return precomputedFeature + default: + return nil + } + } +} diff --git a/swift/EmbeddingsTests.swift b/swift/EncodersTests.swift similarity index 75% rename from swift/EmbeddingsTests.swift rename to swift/EncodersTests.swift index 5efb87f..645d531 100644 --- a/swift/EmbeddingsTests.swift +++ b/swift/EncodersTests.swift @@ -1,11 +1,26 @@ import CoreGraphics +import Hub import ImageIO import UForm -import Hub import XCTest final class TokenizerTests: XCTestCase { + var hfToken: String? + + override func setUp() { + super.setUp() + // Attempt to load the Hugging Face token from the `.hf_token` file in the current directory + let fileURL = URL(fileURLWithPath: FileManager.default.currentDirectoryPath).appendingPathComponent(".hf_token") + if let token = try? String(contentsOf: fileURL, encoding: .utf8).trimmingCharacters(in: .whitespacesAndNewlines) + { + hfToken = token + } + + hfToken = hfToken ?? ProcessInfo.processInfo.environment["HF_TOKEN"] + hfToken = hfToken ?? 
"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" + } + func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T { guard vectorA.count == vectorB.count else { fatalError("Vectors must be of the same length.") @@ -23,11 +38,11 @@ final class TokenizerTests: XCTestCase { return dotProduct / (magnitudeA * magnitudeB) } - func testTextEmbeddings() async throws { + func testTextEmbeddings(forModel modelName: String) async throws { - let api = HubApi(hfToken: "xxx") + let api = HubApi(hfToken: hfToken) let textModel = try await TextEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: "unum-cloud/uform3-image-text-english-small", hubApi: api ) @@ -40,7 +55,7 @@ final class TokenizerTests: XCTestCase { var textEmbeddings: [[Float32]] = [] for text in texts { - let embedding: [Float32] = try textModel.forward(with: text).asFloats() + let embedding: [Float32] = try textModel.encode(text).asFloats() textEmbeddings.append(embedding) } @@ -60,36 +75,47 @@ final class TokenizerTests: XCTestCase { ) } - func testImageEmbeddings() async throws { + func testTextEmbeddings() async throws { + for model in [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", + ] { + try await testTextEmbeddings(forModel: model) + } + } + + func testImageEmbeddings(forModel modelName: String) async throws { // One option is to use a local model repository. // // let root = "uform/" // let textModel = try TextEncoder( - // modelPath: root + "uform-vl-english-large-text.mlpackage", + // modelPath: root + "uform-vl-english-large-text_encoder.mlpackage", // configPath: root + "uform-vl-english-large-text.json", // tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json" // ) // let imageModel = try ImageEncoder( - // modelPath: root + "uform-vl-english-large-image.mlpackage", + // modelPath: root + "uform-vl-english-large-image_encoder.mlpackage", // configPath: root + "uform-vl-english-large-image.json" // ) // // A better option is to fetch directly from HuggingFace, similar to how users would do that: - let api = HubApi(hfToken: "xxx") + let api = HubApi(hfToken: hfToken) let textModel = try await TextEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: modelName, hubApi: api ) let imageModel = try await ImageEncoder( - modelName: "unum-cloud/uform-vl2-english-small", + modelName: modelName, hubApi: api ) let texts = [ "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", - "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "A little girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, 
surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", ] @@ -115,9 +141,9 @@ final class TokenizerTests: XCTestCase { ) } - let textEmbedding: [Float32] = try textModel.forward(with: text).asFloats() + let textEmbedding: [Float32] = try textModel.encode(text).asFloats() textEmbeddings.append(textEmbedding) - let imageEmbedding: [Float32] = try imageModel.forward(with: cgImage).asFloats() + let imageEmbedding: [Float32] = try imageModel.encode(cgImage).asFloats() imageEmbeddings.append(imageEmbedding) } @@ -143,4 +169,15 @@ final class TokenizerTests: XCTestCase { } } + func testImageEmbeddings() async throws { + for model in [ + "unum-cloud/uform3-image-text-english-small", + "unum-cloud/uform3-image-text-english-base", + "unum-cloud/uform3-image-text-english-large", + "unum-cloud/uform3-image-text-multilingual-base", + ] { + try await testImageEmbeddings(forModel: model) + } + } + } diff --git a/swift/README.md b/swift/README.md new file mode 100644 index 0000000..8fa0eb8 --- /dev/null +++ b/swift/README.md @@ -0,0 +1,73 @@ +# UForm Swift SDK + +UForm offers first-party support for Swift. +To get started, add UForm to your project using Swift Package Manager. + +```bash +swift package init --type executable +swift package add uform +``` + +Then, import UForm in your Swift code: + +```swift +import UForm +``` + +## Embeddings + +### Text Embeddings + +```swift +let textModel = try await TextEncoder(modelName: "unum-cloud/uform3-image-text-english-small") +let text = "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie." +let textEmbedding: Embedding = try textModel.encode(text) +let textVector: [Float32] = textEmbedding.asFloats() +``` + +### Image Embeddings + +```swift +let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform3-image-text-english-small") +let imageURL = "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true" +guard let url = URL(string: imageURL), + let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), + let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) +else { + fatalError("Could not load image from URL: \(imageURL)") +} + +var imageEmbedding: Embedding = try imageModel.encode(cgImage) +var imageVector: [Float32] = imageEmbedding.asFloats() +``` + +### Computing Distances + +Once you have the embeddings, there are several ways to compute distances between them. +Naive Swift code might look like this: + +```swift +func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 { + let dotProduct = zip(a, b).map(*).reduce(0, +) + let normA = sqrt(a.map { $0 * $0 }.reduce(0, +)) + let normB = sqrt(b.map { $0 * $0 }.reduce(0, +)) + return dotProduct / (normA * normB) +} +``` + +A faster way to compute distances is to use the Accelerate framework: + +```swift +import Accelerate + +func cosineSimilarity(_ a: [Float32], _ b: [Float32]) -> Float32 { + var result: Float32 = 0 + var aNorm: Float32 = 0 + var bNorm: Float32 = 0 + vDSP_dotpr(a, 1, b, 1, &result, vDSP_Length(a.count)) + vDSP_svesq(a, 1, &aNorm, vDSP_Length(a.count)) + vDSP_svesq(b, 1, &bNorm, vDSP_Length(b.count)) + return result / sqrt(aNorm * bNorm) +} +``` + +An even faster approach would be to use USearch or SimSIMD, which work not only for `Float32` and `Float64`, but also for `Float16`, `Int8`, and binary embeddings.
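The README snippets above embed each modality separately; a typical retrieval flow embeds the image once, embeds every candidate caption, and ranks the captions by cosine similarity. The sketch below is illustrative only: it reuses the `TextEncoder`, `ImageEncoder`, and `cosineSimilarity` helpers shown above, while the `rankCaptions` function name and the local image path are hypothetical.

```swift
import CoreGraphics
import Foundation
import ImageIO
import UForm

/// Ranks captions against a single image by cosine similarity.
/// Assumes the `cosineSimilarity(_:_:)` helper from the README above is in scope.
func rankCaptions(forImageAt path: String, captions: [String]) async throws -> [(caption: String, score: Float32)] {
    let modelName = "unum-cloud/uform3-image-text-english-small"
    let textModel = try await TextEncoder(modelName: modelName)
    let imageModel = try await ImageEncoder(modelName: modelName)

    // Load the image from disk and embed it once.
    guard let source = CGImageSourceCreateWithURL(URL(fileURLWithPath: path) as CFURL, nil),
        let cgImage = CGImageSourceCreateImageAtIndex(source, 0, nil)
    else { fatalError("Could not load image at \(path)") }
    let imageVector: [Float32] = try imageModel.encode(cgImage).asFloats()

    // Embed every caption and score it against the image embedding.
    var scored: [(caption: String, score: Float32)] = []
    for caption in captions {
        let textVector: [Float32] = try textModel.encode(caption).asFloats()
        scored.append((caption: caption, score: cosineSimilarity(textVector, imageVector)))
    }

    // Highest similarity first.
    return scored.sorted { $0.score > $1.score }
}
```

From an async context, `try await rankCaptions(forImageAt: "bbq-on-beach.jpg", captions: texts)` would then return the captions ordered from best to worst match.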
diff --git a/yarn.lock b/yarn.lock new file mode 100644 index 0000000..5ab5bbe --- /dev/null +++ b/yarn.lock @@ -0,0 +1,594 @@ +# THIS IS AN AUTOGENERATED FILE. DO NOT EDIT THIS FILE DIRECTLY. +# yarn lockfile v1 + + +"@huggingface/hub@^0.14.8": + version "0.14.8" + resolved "https://registry.npmjs.org/@huggingface/hub/-/hub-0.14.8.tgz" + integrity sha512-vdJRham99E5Uzsc4rO0gTz0ykafmx6V78pgPpJ7LGz5X+P2exe/izPFndqczAzy8jVWN55Jjtnuqg+Y0zrjc+Q== + dependencies: + hash-wasm "^4.9.0" + +"@huggingface/jinja@^0.2.2": + version "0.2.2" + resolved "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz" + integrity sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA== + +"@protobufjs/aspromise@^1.1.1", "@protobufjs/aspromise@^1.1.2": + version "1.1.2" + resolved "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz" + integrity sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ== + +"@protobufjs/base64@^1.1.2": + version "1.1.2" + resolved "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz" + integrity sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg== + +"@protobufjs/codegen@^2.0.4": + version "2.0.4" + resolved "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz" + integrity sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg== + +"@protobufjs/eventemitter@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz" + integrity sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q== + +"@protobufjs/fetch@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz" + integrity sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ== + dependencies: + "@protobufjs/aspromise" "^1.1.1" + "@protobufjs/inquire" "^1.1.0" + +"@protobufjs/float@^1.0.2": + version "1.0.2" + resolved "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz" + integrity sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ== + +"@protobufjs/inquire@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz" + integrity sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q== + +"@protobufjs/path@^1.1.2": + version "1.1.2" + resolved "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz" + integrity sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA== + +"@protobufjs/pool@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz" + integrity sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw== + +"@protobufjs/utf8@^1.1.0": + version "1.1.0" + resolved "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz" + integrity sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw== + +"@types/long@^4.0.1": + version "4.0.2" + resolved "https://registry.npmjs.org/@types/long/-/long-4.0.2.tgz" + integrity sha512-MqTGEo5bj5t157U6fA/BiDynNkn0YknVdh48CMPkTSpFTVmvao5UQmm7uEF6xBEo7qIMAlY/JSleYaE6VOdpaA== + +"@types/node@>=13.7.0": + version "20.12.7" + resolved "https://registry.npmjs.org/@types/node/-/node-20.12.7.tgz" + integrity 
sha512-wq0cICSkRLVaf3UGLMGItu/PtdY7oaXaI/RVU+xliKVOtRna3PRY57ZDfztpDL0n11vfymMUnXv8QwYCO7L1wg== + dependencies: + undici-types "~5.26.4" + +"@xenova/transformers@^2.17.0": + version "2.17.0" + resolved "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.0.tgz" + integrity sha512-usmDut7hwnrc4EqP59cboYqE6C8up63SqMy3E9RjG9nCsOhrsLndEU7DMu+bZ9R+HcAI8jRGabTIxH+B6agBVA== + dependencies: + "@huggingface/jinja" "^0.2.2" + onnxruntime-web "1.14.0" + sharp "^0.32.0" + optionalDependencies: + onnxruntime-node "1.14.0" + +b4a@^1.6.4: + version "1.6.6" + resolved "https://registry.npmjs.org/b4a/-/b4a-1.6.6.tgz" + integrity sha512-5Tk1HLk6b6ctmjIkAcU/Ujv/1WqiDl0F0JdRCR80VsOcUlHcu7pWeWRlOqQLHfDEsVx9YH/aif5AG4ehoCtTmg== + +bare-events@^2.0.0, bare-events@^2.2.0: + version "2.2.2" + resolved "https://registry.npmjs.org/bare-events/-/bare-events-2.2.2.tgz" + integrity sha512-h7z00dWdG0PYOQEvChhOSWvOfkIKsdZGkWr083FgN/HyoQuebSew/cgirYqh9SCuy/hRvxc5Vy6Fw8xAmYHLkQ== + +bare-fs@^2.1.1: + version "2.2.3" + resolved "https://registry.npmjs.org/bare-fs/-/bare-fs-2.2.3.tgz" + integrity sha512-amG72llr9pstfXOBOHve1WjiuKKAMnebcmMbPWDZ7BCevAoJLpugjuAPRsDINEyjT0a6tbaVx3DctkXIRbLuJw== + dependencies: + bare-events "^2.0.0" + bare-path "^2.0.0" + streamx "^2.13.0" + +bare-os@^2.1.0: + version "2.2.1" + resolved "https://registry.npmjs.org/bare-os/-/bare-os-2.2.1.tgz" + integrity sha512-OwPyHgBBMkhC29Hl3O4/YfxW9n7mdTr2+SsO29XBWKKJsbgj3mnorDB80r5TiCQgQstgE5ga1qNYrpes6NvX2w== + +bare-path@^2.0.0, bare-path@^2.1.0: + version "2.1.1" + resolved "https://registry.npmjs.org/bare-path/-/bare-path-2.1.1.tgz" + integrity sha512-OHM+iwRDRMDBsSW7kl3dO62JyHdBKO3B25FB9vNQBPcGHMo4+eA8Yj41Lfbk3pS/seDY+siNge0LdRTulAau/A== + dependencies: + bare-os "^2.1.0" + +base64-js@^1.3.1: + version "1.5.1" + resolved "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz" + integrity sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA== + +bl@^4.0.3: + version "4.1.0" + resolved "https://registry.npmjs.org/bl/-/bl-4.1.0.tgz" + integrity sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w== + dependencies: + buffer "^5.5.0" + inherits "^2.0.4" + readable-stream "^3.4.0" + +buffer@^5.5.0: + version "5.7.1" + resolved "https://registry.npmjs.org/buffer/-/buffer-5.7.1.tgz" + integrity sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ== + dependencies: + base64-js "^1.3.1" + ieee754 "^1.1.13" + +chownr@^1.1.1: + version "1.1.4" + resolved "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz" + integrity sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg== + +color-convert@^2.0.1: + version "2.0.1" + resolved "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz" + integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ== + dependencies: + color-name "~1.1.4" + +color-name@^1.0.0, color-name@~1.1.4: + version "1.1.4" + resolved "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz" + integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA== + +color-string@^1.9.0: + version "1.9.1" + resolved "https://registry.npmjs.org/color-string/-/color-string-1.9.1.tgz" + integrity sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg== + dependencies: + color-name "^1.0.0" + simple-swizzle "^0.2.2" + +color@^4.2.3: + 
version "4.2.3" + resolved "https://registry.npmjs.org/color/-/color-4.2.3.tgz" + integrity sha512-1rXeuUUiGGrykh+CeBdu5Ie7OJwinCgQY0bc7GCRxy5xVHy+moaqkpL/jqQq0MtQOeYcrqEz4abc5f0KtU7W4A== + dependencies: + color-convert "^2.0.1" + color-string "^1.9.0" + +decompress-response@^6.0.0: + version "6.0.0" + resolved "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz" + integrity sha512-aW35yZM6Bb/4oJlZncMH2LCoZtJXTRxES17vE3hoRiowU2kWHaJKFkSBDnDR+cm9J+9QhXmREyIfv0pji9ejCQ== + dependencies: + mimic-response "^3.1.0" + +deep-extend@^0.6.0: + version "0.6.0" + resolved "https://registry.npmjs.org/deep-extend/-/deep-extend-0.6.0.tgz" + integrity sha512-LOHxIOaPYdHlJRtCQfDIVZtfw/ufM8+rVj649RIHzcm/vGwQRXFt6OPqIFWsm2XEMrNIEtWR64sY1LEKD2vAOA== + +detect-libc@^2.0.0, detect-libc@^2.0.2: + version "2.0.3" + resolved "https://registry.npmjs.org/detect-libc/-/detect-libc-2.0.3.tgz" + integrity sha512-bwy0MGW55bG41VqxxypOsdSdGqLwXPI/focwgTYCFMbdUiBAxLg9CFzG08sz2aqzknwiX7Hkl0bQENjg8iLByw== + +end-of-stream@^1.1.0, end-of-stream@^1.4.1: + version "1.4.4" + resolved "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.4.tgz" + integrity sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q== + dependencies: + once "^1.4.0" + +expand-template@^2.0.3: + version "2.0.3" + resolved "https://registry.npmjs.org/expand-template/-/expand-template-2.0.3.tgz" + integrity sha512-XYfuKMvj4O35f/pOXLObndIRvyQ+/+6AhODh+OKWj9S9498pHHn/IMszH+gt0fBCRWMNfk1ZSp5x3AifmnI2vg== + +fast-fifo@^1.1.0, fast-fifo@^1.2.0: + version "1.3.2" + resolved "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz" + integrity sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ== + +flatbuffers@^1.12.0: + version "1.12.0" + resolved "https://registry.npmjs.org/flatbuffers/-/flatbuffers-1.12.0.tgz" + integrity sha512-c7CZADjRcl6j0PlvFy0ZqXQ67qSEZfrVPynmnL+2zPc+NtMvrF8Y0QceMo7QqnSPc7+uWjUIAbvCQ5WIKlMVdQ== + +fs-constants@^1.0.0: + version "1.0.0" + resolved "https://registry.npmjs.org/fs-constants/-/fs-constants-1.0.0.tgz" + integrity sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow== + +github-from-package@0.0.0: + version "0.0.0" + resolved "https://registry.npmjs.org/github-from-package/-/github-from-package-0.0.0.tgz" + integrity sha512-SyHy3T1v2NUXn29OsWdxmK6RwHD+vkj3v8en8AOBZ1wBQ/hCAQ5bAQTD02kW4W9tUp/3Qh6J8r9EvntiyCmOOw== + +guid-typescript@^1.0.9: + version "1.0.9" + resolved "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz" + integrity sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ== + +hash-wasm@^4.9.0: + version "4.11.0" + resolved "https://registry.npmjs.org/hash-wasm/-/hash-wasm-4.11.0.tgz" + integrity sha512-HVusNXlVqHe0fzIzdQOGolnFN6mX/fqcrSAOcTBXdvzrXVHwTz11vXeKRmkR5gTuwVpvHZEIyKoePDvuAR+XwQ== + +ieee754@^1.1.13: + version "1.2.1" + resolved "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz" + integrity sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA== + +inherits@^2.0.3, inherits@^2.0.4: + version "2.0.4" + resolved "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz" + integrity sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ== + +ini@~1.3.0: + version "1.3.8" + resolved "https://registry.npmjs.org/ini/-/ini-1.3.8.tgz" + integrity 
sha512-JV/yugV2uzW5iMRSiZAyDtQd+nxtUnjeLt0acNdw98kKLrvuRVyB80tsREOE7yvGVgalhZ6RNXCmEHkUKBKxew== + +is-arrayish@^0.3.1: + version "0.3.2" + resolved "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.3.2.tgz" + integrity sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ== + +long@^4.0.0: + version "4.0.0" + resolved "https://registry.npmjs.org/long/-/long-4.0.0.tgz" + integrity sha512-XsP+KhQif4bjX1kbuSiySJFNAehNxgLb6hPRGJ9QsUr8ajHkuXGdrHmFUTUUXhDwVX2R5bY4JNZEwbUiMhV+MA== + +long@^5.0.0: + version "5.2.3" + resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz" + integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q== + +long@^5.2.3: + version "5.2.3" + resolved "https://registry.npmjs.org/long/-/long-5.2.3.tgz" + integrity sha512-lcHwpNoggQTObv5apGNCTdJrO69eHOZMi4BNC+rTLER8iHAqGrUVeLh/irVIM7zTw2bOXA8T6uNPeujwOLg/2Q== + +lru-cache@^6.0.0: + version "6.0.0" + resolved "https://registry.npmjs.org/lru-cache/-/lru-cache-6.0.0.tgz" + integrity sha512-Jo6dJ04CmSjuznwJSS3pUeWmd/H0ffTlkXXgwZi+eq1UCmqQwCh+eLsYOYCwY991i2Fah4h1BEMCx4qThGbsiA== + dependencies: + yallist "^4.0.0" + +mimic-response@^3.1.0: + version "3.1.0" + resolved "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz" + integrity sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ== + +minimist@^1.2.0, minimist@^1.2.3: + version "1.2.8" + resolved "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz" + integrity sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA== + +mkdirp-classic@^0.5.2, mkdirp-classic@^0.5.3: + version "0.5.3" + resolved "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz" + integrity sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A== + +napi-build-utils@^1.0.1: + version "1.0.2" + resolved "https://registry.npmjs.org/napi-build-utils/-/napi-build-utils-1.0.2.tgz" + integrity sha512-ONmRUqK7zj7DWX0D9ADe03wbwOBZxNAfF20PlGfCWQcD3+/MakShIHrMqx9YwPTfxDdF1zLeL+RGZiR9kGMLdg== + +node-abi@^3.3.0: + version "3.57.0" + resolved "https://registry.npmjs.org/node-abi/-/node-abi-3.57.0.tgz" + integrity sha512-Dp+A9JWxRaKuHP35H77I4kCKesDy5HUDEmScia2FyncMTOXASMyg251F5PhFoDA5uqBrDDffiLpbqnrZmNXW+g== + dependencies: + semver "^7.3.5" + +node-addon-api@^6.1.0: + version "6.1.0" + resolved "https://registry.npmjs.org/node-addon-api/-/node-addon-api-6.1.0.tgz" + integrity sha512-+eawOlIgy680F0kBzPUNFhMZGtJ1YmqM6l4+Crf4IkImjYrO/mqPwRMh352g23uIaQKFItcQ64I7KMaJxHgAVA== + +once@^1.3.1, once@^1.4.0: + version "1.4.0" + resolved "https://registry.npmjs.org/once/-/once-1.4.0.tgz" + integrity sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w== + dependencies: + wrappy "1" + +onnx-proto@^4.0.4: + version "4.0.4" + resolved "https://registry.npmjs.org/onnx-proto/-/onnx-proto-4.0.4.tgz" + integrity sha512-aldMOB3HRoo6q/phyB6QRQxSt895HNNw82BNyZ2CMh4bjeKv7g/c+VpAFtJuEMVfYLMbRx61hbuqnKceLeDcDA== + dependencies: + protobufjs "^6.8.8" + +onnxruntime-common@~1.14.0: + version "1.14.0" + resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.14.0.tgz" + integrity sha512-3LJpegM2iMNRX2wUmtYfeX/ytfOzNwAWKSq1HbRrKc9+uqG/FsEA0bbKZl1btQeZaXhC26l44NWpNUeXPII7Ew== + +onnxruntime-common@1.17.3: + version "1.17.3" + resolved "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.17.3.tgz" + integrity 
sha512-IkbaDelNVX8cBfHFgsNADRIq2TlXMFWW+nG55mwWvQT4i0NZb32Jf35Pf6h9yjrnK78RjcnlNYaI37w394ovMw== + +onnxruntime-node@1.14.0: + version "1.14.0" + resolved "https://registry.npmjs.org/onnxruntime-node/-/onnxruntime-node-1.14.0.tgz" + integrity sha512-5ba7TWomIV/9b6NH/1x/8QEeowsb+jBEvFzU6z0T4mNsFwdPqXeFUM7uxC6QeSRkEbWu3qEB0VMjrvzN/0S9+w== + dependencies: + onnxruntime-common "~1.14.0" + +onnxruntime-web@^1.17.3: + version "1.17.3" + resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.17.3.tgz" + integrity sha512-MSDrNUWgc1biP0YzY488OJ9n/jTMS9EXysgm9Aw4CUj2A836ALbO2J1sgzguWJeVUHTlM6p7tRzo8IGAgaXWKw== + dependencies: + flatbuffers "^1.12.0" + guid-typescript "^1.0.9" + long "^5.2.3" + onnxruntime-common "1.17.3" + platform "^1.3.6" + protobufjs "^7.2.4" + +onnxruntime-web@1.14.0: + version "1.14.0" + resolved "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.14.0.tgz" + integrity sha512-Kcqf43UMfW8mCydVGcX9OMXI2VN17c0p6XvR7IPSZzBf/6lteBzXHvcEVWDPmCKuGombl997HgLqj91F11DzXw== + dependencies: + flatbuffers "^1.12.0" + guid-typescript "^1.0.9" + long "^4.0.0" + onnx-proto "^4.0.4" + onnxruntime-common "~1.14.0" + platform "^1.3.6" + +platform@^1.3.6: + version "1.3.6" + resolved "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz" + integrity sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg== + +prebuild-install@^7.1.1: + version "7.1.2" + resolved "https://registry.npmjs.org/prebuild-install/-/prebuild-install-7.1.2.tgz" + integrity sha512-UnNke3IQb6sgarcZIDU3gbMeTp/9SSU1DAIkil7PrqG1vZlBtY5msYccSKSHDqa3hNg436IXK+SNImReuA1wEQ== + dependencies: + detect-libc "^2.0.0" + expand-template "^2.0.3" + github-from-package "0.0.0" + minimist "^1.2.3" + mkdirp-classic "^0.5.3" + napi-build-utils "^1.0.1" + node-abi "^3.3.0" + pump "^3.0.0" + rc "^1.2.7" + simple-get "^4.0.0" + tar-fs "^2.0.0" + tunnel-agent "^0.6.0" + +protobufjs@^6.8.8: + version "6.11.4" + resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-6.11.4.tgz" + integrity sha512-5kQWPaJHi1WoCpjTGszzQ32PG2F4+wRY6BmAT4Vfw56Q2FZ4YZzK20xUYQH4YkfehY1e6QSICrJquM6xXZNcrw== + dependencies: + "@protobufjs/aspromise" "^1.1.2" + "@protobufjs/base64" "^1.1.2" + "@protobufjs/codegen" "^2.0.4" + "@protobufjs/eventemitter" "^1.1.0" + "@protobufjs/fetch" "^1.1.0" + "@protobufjs/float" "^1.0.2" + "@protobufjs/inquire" "^1.1.0" + "@protobufjs/path" "^1.1.2" + "@protobufjs/pool" "^1.1.0" + "@protobufjs/utf8" "^1.1.0" + "@types/long" "^4.0.1" + "@types/node" ">=13.7.0" + long "^4.0.0" + +protobufjs@^7.2.4: + version "7.2.6" + resolved "https://registry.npmjs.org/protobufjs/-/protobufjs-7.2.6.tgz" + integrity sha512-dgJaEDDL6x8ASUZ1YqWciTRrdOuYNzoOf27oHNfdyvKqHr5i0FV7FSLU+aIeFjyFgVxrpTOtQUi0BLLBymZaBw== + dependencies: + "@protobufjs/aspromise" "^1.1.2" + "@protobufjs/base64" "^1.1.2" + "@protobufjs/codegen" "^2.0.4" + "@protobufjs/eventemitter" "^1.1.0" + "@protobufjs/fetch" "^1.1.0" + "@protobufjs/float" "^1.0.2" + "@protobufjs/inquire" "^1.1.0" + "@protobufjs/path" "^1.1.2" + "@protobufjs/pool" "^1.1.0" + "@protobufjs/utf8" "^1.1.0" + "@types/node" ">=13.7.0" + long "^5.0.0" + +pump@^3.0.0: + version "3.0.0" + resolved "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz" + integrity sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww== + dependencies: + end-of-stream "^1.1.0" + once "^1.3.1" + +queue-tick@^1.0.1: + version "1.0.1" + resolved "https://registry.npmjs.org/queue-tick/-/queue-tick-1.0.1.tgz" + integrity 
sha512-kJt5qhMxoszgU/62PLP1CJytzd2NKetjSRnyuj31fDd3Rlcz3fzlFdFLD1SItunPwyqEOkca6GbV612BWfaBag== + +rc@^1.2.7: + version "1.2.8" + resolved "https://registry.npmjs.org/rc/-/rc-1.2.8.tgz" + integrity sha512-y3bGgqKj3QBdxLbLkomlohkvsA8gdAiUQlSBJnBhfn+BPxg4bc62d8TcBW15wavDfgexCgccckhcZvywyQYPOw== + dependencies: + deep-extend "^0.6.0" + ini "~1.3.0" + minimist "^1.2.0" + strip-json-comments "~2.0.1" + +readable-stream@^3.1.1, readable-stream@^3.4.0: + version "3.6.2" + resolved "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz" + integrity sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA== + dependencies: + inherits "^2.0.3" + string_decoder "^1.1.1" + util-deprecate "^1.0.1" + +safe-buffer@^5.0.1, safe-buffer@~5.2.0: + version "5.2.1" + resolved "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz" + integrity sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ== + +semver@^7.3.5, semver@^7.5.4: + version "7.6.0" + resolved "https://registry.npmjs.org/semver/-/semver-7.6.0.tgz" + integrity sha512-EnwXhrlwXMk9gKu5/flx5sv/an57AkRplG3hTK68W7FRDN+k+OWBj65M7719OkA82XLBxrcX0KSHj+X5COhOVg== + dependencies: + lru-cache "^6.0.0" + +sharp@^0.32.0: + version "0.32.6" + resolved "https://registry.npmjs.org/sharp/-/sharp-0.32.6.tgz" + integrity sha512-KyLTWwgcR9Oe4d9HwCwNM2l7+J0dUQwn/yf7S0EnTtb0eVS4RxO0eUSvxPtzT4F3SY+C4K6fqdv/DO27sJ/v/w== + dependencies: + color "^4.2.3" + detect-libc "^2.0.2" + node-addon-api "^6.1.0" + prebuild-install "^7.1.1" + semver "^7.5.4" + simple-get "^4.0.1" + tar-fs "^3.0.4" + tunnel-agent "^0.6.0" + +simple-concat@^1.0.0: + version "1.0.1" + resolved "https://registry.npmjs.org/simple-concat/-/simple-concat-1.0.1.tgz" + integrity sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q== + +simple-get@^4.0.0, simple-get@^4.0.1: + version "4.0.1" + resolved "https://registry.npmjs.org/simple-get/-/simple-get-4.0.1.tgz" + integrity sha512-brv7p5WgH0jmQJr1ZDDfKDOSeWWg+OVypG99A/5vYGPqJ6pxiaHLy8nxtFjBA7oMa01ebA9gfh1uMCFqOuXxvA== + dependencies: + decompress-response "^6.0.0" + once "^1.3.1" + simple-concat "^1.0.0" + +simple-swizzle@^0.2.2: + version "0.2.2" + resolved "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz" + integrity sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg== + dependencies: + is-arrayish "^0.3.1" + +streamx@^2.13.0, streamx@^2.15.0: + version "2.16.1" + resolved "https://registry.npmjs.org/streamx/-/streamx-2.16.1.tgz" + integrity sha512-m9QYj6WygWyWa3H1YY69amr4nVgy61xfjys7xO7kviL5rfIEc2naf+ewFiOA+aEJD7y0JO3h2GoiUv4TDwEGzQ== + dependencies: + fast-fifo "^1.1.0" + queue-tick "^1.0.1" + optionalDependencies: + bare-events "^2.2.0" + +string_decoder@^1.1.1: + version "1.3.0" + resolved "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz" + integrity sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA== + dependencies: + safe-buffer "~5.2.0" + +strip-json-comments@~2.0.1: + version "2.0.1" + resolved "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-2.0.1.tgz" + integrity sha512-4gB8na07fecVVkOI6Rs4e7T6NOTki5EmL7TUduTs6bu3EdnSycntVJ4re8kgZA+wx9IueI2Y11bfbgwtzuE0KQ== + +tar-fs@^2.0.0: + version "2.1.1" + resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-2.1.1.tgz" + integrity 
sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng== + dependencies: + chownr "^1.1.1" + mkdirp-classic "^0.5.2" + pump "^3.0.0" + tar-stream "^2.1.4" + +tar-fs@^3.0.4: + version "3.0.5" + resolved "https://registry.npmjs.org/tar-fs/-/tar-fs-3.0.5.tgz" + integrity sha512-JOgGAmZyMgbqpLwct7ZV8VzkEB6pxXFBVErLtb+XCOqzc6w1xiWKI9GVd6bwk68EX7eJ4DWmfXVmq8K2ziZTGg== + dependencies: + pump "^3.0.0" + tar-stream "^3.1.5" + optionalDependencies: + bare-fs "^2.1.1" + bare-path "^2.1.0" + +tar-stream@^2.1.4: + version "2.2.0" + resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-2.2.0.tgz" + integrity sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ== + dependencies: + bl "^4.0.3" + end-of-stream "^1.4.1" + fs-constants "^1.0.0" + inherits "^2.0.3" + readable-stream "^3.1.1" + +tar-stream@^3.1.5: + version "3.1.7" + resolved "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz" + integrity sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ== + dependencies: + b4a "^1.6.4" + fast-fifo "^1.2.0" + streamx "^2.15.0" + +tunnel-agent@^0.6.0: + version "0.6.0" + resolved "https://registry.npmjs.org/tunnel-agent/-/tunnel-agent-0.6.0.tgz" + integrity sha512-McnNiV1l8RYeY8tBgEpuodCC1mLUdbSN+CYBL7kJsJNInOP8UjDDEwdk6Mw60vdLLrr5NHKZhMAOSrR2NZuQ+w== + dependencies: + safe-buffer "^5.0.1" + +undici-types@~5.26.4: + version "5.26.5" + resolved "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz" + integrity sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA== + +util-deprecate@^1.0.1: + version "1.0.2" + resolved "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz" + integrity sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw== + +wrappy@1: + version "1.0.2" + resolved "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz" + integrity sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ== + +yallist@^4.0.0: + version "4.0.0" + resolved "https://registry.npmjs.org/yallist/-/yallist-4.0.0.tgz" + integrity sha512-3wdGidZyq5PB084XLES5TpOSRA3wjXAlIWMhum2kRcv/41Sn2emQ0dycQW4uZXLejwKvg6EsvbdlVL+FYEct7A==