diff --git a/.gitignore b/.gitignore index 1732614..af7d4af 100755 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,6 @@ build/ package-lock.json *.egg-info *.onnx -__pycache__ \ No newline at end of file +__pycache__ +.build +.swiftpm \ No newline at end of file diff --git a/.swift-format b/.swift-format new file mode 100644 index 0000000..53bf631 --- /dev/null +++ b/.swift-format @@ -0,0 +1,13 @@ +{ + "version": 1, + "lineLength": 120, + "indentation": { + "spaces": 4 + }, + "maximumBlankLines": 1, + "respectsExistingLineBreaks": true, + "lineBreakBeforeControlFlowKeywords": true, + "lineBreakBeforeEachArgument": true, + "multiElementCollectionTrailingCommas": true, + "spacesAroundRangeFormationOperators": true +} \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..59eb78c --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json index 9b08d04..a6cceb8 100755 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -1,16 +1,26 @@ { "cSpell.words": [ + "arange", + "CFURL", + "coreml", + "cumsum", "dtype", "embs", + "finfo", "huggingface", "keepdim", + "linalg", "logits", "Matryoshka", + "mlmodel", + "mlpackage", + "mlprogram", "multimodal", "ndarray", "numpy", "ONNX", "onnxruntime", + "packbits", "preprocess", "pretrained", "probs", @@ -18,11 +28,14 @@ "rerank", "reranker", "reranking", + "sess", + "SIMD", "softmax", "transfromers", "uform", "unimodal", - "unsqueeze" + "unsqueeze", + "Vardanian" ], "[python]": { "editor.defaultFormatter": "ms-python.black-formatter" diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1548c30..181d9e2 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,6 +1,9 @@ # Contributing to UForm We welcome contributions to UForm! + +## Python + Before submitting any changes, please make sure that the tests pass. ```sh @@ -13,4 +16,17 @@ pip install -e ".[torch,onnx]" # For PyTorch and ONNX Python tests pytest python/scripts/ -s -x -Wd -v pytest python/scripts/ -s -x -Wd -v -k onnx # To run only ONNX tests without loading Torch -``` \ No newline at end of file +``` + +## Swift + +Swift formatting is enforced with `swift-format` default utility from Apple. +To install and run it on all the files in the project, use the following command: + +```bash +brew install swift-format +swift-format . -i -r +``` + +The style is controlled by the `.swift-format` JSON file in the root of the repository. +As there is no standard for Swift formatting, even Apple's own `swift-format` tool and Xcode differ in their formatting rules, and available settings. 
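+
+Building the package and running the Swift tests should also work through SwiftPM. A minimal sketch, assuming the `UForm` library and the `UFormTests` test target declared in `Package.swift`, plus network access to Hugging Face for the checkpoints the tests download:
+
+```bash
+swift build   # compile the UForm library target under swift/
+swift test    # run swift/EmbeddingsTests.swift
+```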
diff --git a/Package.resolved b/Package.resolved new file mode 100644 index 0000000..fe63c94 --- /dev/null +++ b/Package.resolved @@ -0,0 +1,22 @@ +{ + "pins" : [ + { + "identity" : "swift-argument-parser", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-argument-parser.git", + "state" : { + "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", + "version" : "1.3.0" + } + }, + { + "identity" : "swift-transformers", + "kind" : "remoteSourceControl", + "location" : "https://github.com/ashvardanian/swift-transformers", + "state" : { + "revision" : "9ef46a51eca46978b62773f8887926dfe72b0ab4" + } + } + ], + "version" : 2 +} diff --git a/Package.swift b/Package.swift new file mode 100644 index 0000000..6ac8372 --- /dev/null +++ b/Package.swift @@ -0,0 +1,41 @@ +// swift-tools-version:5.9 +import PackageDescription + +let package = Package( + name: "UForm", + platforms: [ + // Linux doesn't have to be explicitly listed + .iOS(.v16), // For iOS, version 13 and later + .tvOS(.v16), // For tvOS, version 13 and later + .macOS(.v13), // For macOS, version 10.15 (Catalina) and later + .watchOS(.v6), // For watchOS, version 6 and later + ], + products: [ + .library( + name: "UForm", + targets: ["UForm"] + ) + ], + dependencies: [ + .package( + url: "https://github.com/ashvardanian/swift-transformers", + revision: "9ef46a51eca46978b62773f8887926dfe72b0ab4" + ) + ], + targets: [ + .target( + name: "UForm", + dependencies: [ + .product(name: "Transformers", package: "swift-transformers") + ], + path: "swift", + exclude: ["EmbeddingsTests.swift"] + ), + .testTarget( + name: "UFormTests", + dependencies: ["UForm"], + path: "swift", + sources: ["EmbeddingsTests.swift"] + ), + ] +) diff --git a/python/scripts/export.ipynb b/python/scripts/export.ipynb new file mode 100644 index 0000000..ce8cf10 --- /dev/null +++ b/python/scripts/export.ipynb @@ -0,0 +1,666 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Scripts for Exporting PyTorch Models to ONNX and CoreML" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install --upgrade \"uform[torch]\" coremltools" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: dlopen(/Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so, 0x0006): Symbol not found: __ZN3c106detail19maybe_wrap_dim_slowExxb\n", + " Referenced from: <0B637046-A38B-3A5C-80C6-E847C27DCCD5> /Users/av/miniconda3/lib/python3.10/site-packages/torchvision/image.so\n", + " Expected in: <3AE92490-D363-3FD7-8532-CB6F5F795BC8> /Users/av/miniconda3/lib/python3.10/site-packages/torch/lib/libc10.dylib\n", + " warn(f\"Failed to load image Python extension: {e}\")\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fadffc0299c04e249fd4f7a5b40ba0af", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Fetching 5 files: 0%| | 0/5 [00:00 MIL Ops: 100%|█████████▉| 453/455 [00:00<00:00, 5638.83 ops/s]\n", + "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 381.07 passes/s]\n", + "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 156.08 passes/s]\n", + "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 699.38 
passes/s]\n" + ] + } + ], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[image_input], outputs=[image_features, image_embeddings],\n", + " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(\"../uform-vl-english-small-image.mlpackage\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "TextEncoder(\n", + " original_name=TextEncoder\n", + " (word_embeddings): Embedding(original_name=Embedding)\n", + " (position_embeddings): Embedding(original_name=Embedding)\n", + " (layer_norm): LayerNorm(original_name=LayerNorm)\n", + " (dropout): Dropout(original_name=Dropout)\n", + " (blocks): ModuleList(\n", + " original_name=ModuleList\n", + " (0): TextEncoderBlock(\n", + " original_name=TextEncoderBlock\n", + " (norm_attn): LayerNorm(original_name=LayerNorm)\n", + " (attention): Attention(\n", + " original_name=Attention\n", + " (query): Linear(original_name=Linear)\n", + " (key): Linear(original_name=Linear)\n", + " (value): Linear(original_name=Linear)\n", + " (out): Linear(original_name=Linear)\n", + " )\n", + " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", + " (mlp): MLP(\n", + " original_name=MLP\n", + " (hidden_layer): Linear(original_name=Linear)\n", + " (output_layer): Linear(original_name=Linear)\n", + " )\n", + " (dropout): Dropout(original_name=Dropout)\n", + " )\n", + " (1): TextEncoderBlock(\n", + " original_name=TextEncoderBlock\n", + " (norm_attn): LayerNorm(original_name=LayerNorm)\n", + " (attention): Attention(\n", + " original_name=Attention\n", + " (query): Linear(original_name=Linear)\n", + " (key): Linear(original_name=Linear)\n", + " (value): Linear(original_name=Linear)\n", + " (out): Linear(original_name=Linear)\n", + " )\n", + " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", + " (mlp): MLP(\n", + " original_name=MLP\n", + " (hidden_layer): Linear(original_name=Linear)\n", + " (output_layer): Linear(original_name=Linear)\n", + " )\n", + " (dropout): Dropout(original_name=Dropout)\n", + " )\n", + " (2): TextEncoderBlock(\n", + " original_name=TextEncoderBlock\n", + " (norm_attn): LayerNorm(original_name=LayerNorm)\n", + " (attention): Attention(\n", + " original_name=Attention\n", + " (query): Linear(original_name=Linear)\n", + " (key): Linear(original_name=Linear)\n", + " (value): Linear(original_name=Linear)\n", + " (out): Linear(original_name=Linear)\n", + " )\n", + " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", + " (crossattn): Attention(\n", + " original_name=Attention\n", + " (query): Linear(original_name=Linear)\n", + " (key): Linear(original_name=Linear)\n", + " (value): Linear(original_name=Linear)\n", + " (out): Linear(original_name=Linear)\n", + " )\n", + " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", + " (mlp): MLP(\n", + " original_name=MLP\n", + " (hidden_layer): Linear(original_name=Linear)\n", + " (output_layer): Linear(original_name=Linear)\n", + " )\n", + " (dropout): Dropout(original_name=Dropout)\n", + " )\n", + " (3): TextEncoderBlock(\n", + " original_name=TextEncoderBlock\n", + " (norm_attn): LayerNorm(original_name=LayerNorm)\n", + " (attention): Attention(\n", + " original_name=Attention\n", + " (query): Linear(original_name=Linear)\n", + " (key): 
Linear(original_name=Linear)\n", + " (value): Linear(original_name=Linear)\n", + " (out): Linear(original_name=Linear)\n", + " )\n", + " (norm_crossattn): LayerNorm(original_name=LayerNorm)\n", + " (crossattn): Attention(\n", + " original_name=Attention\n", + " (query): Linear(original_name=Linear)\n", + " (key): Linear(original_name=Linear)\n", + " (value): Linear(original_name=Linear)\n", + " (out): Linear(original_name=Linear)\n", + " )\n", + " (norm_mlp): LayerNorm(original_name=LayerNorm)\n", + " (mlp): MLP(\n", + " original_name=MLP\n", + " (hidden_layer): Linear(original_name=Linear)\n", + " (output_layer): Linear(original_name=Linear)\n", + " )\n", + " (dropout): Dropout(original_name=Dropout)\n", + " )\n", + " )\n", + " (embedding_projection): Linear(original_name=Linear)\n", + " (matching_head): Linear(original_name=Linear)\n", + " (context_projection): Linear(original_name=Linear)\n", + ")" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "module = model.text_encoder\n", + "module.eval()\n", + "module.return_features = True\n", + "\n", + "traced_script_module = torch.jit.trace(module, example_inputs=[text_data['input_ids'], text_data['attention_mask']])\n", + "traced_script_module" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Tuple detected at graph output. This will be flattened in the converted model.\n", + "Converting PyTorch Frontend ==> MIL Ops: 0%| | 0/157 [00:00 MIL Ops: 99%|█████████▊| 155/157 [00:00<00:00, 6809.29 ops/s]\n", + "Running MIL frontend_pytorch pipeline: 100%|██████████| 5/5 [00:00<00:00, 1947.76 passes/s]\n", + "Running MIL default pipeline: 100%|██████████| 69/69 [00:00<00:00, 816.08 passes/s]\n", + "Running MIL backend_mlprogram pipeline: 100%|██████████| 12/12 [00:00<00:00, 3294.17 passes/s]\n" + ] + } + ], + "source": [ + "coreml_model = ct.convert(\n", + " traced_script_module, source=\"pytorch\",\n", + " inputs=[text_input, text_attention_input], outputs=[text_features, text_embeddings],\n", + " convert_to='mlprogram', compute_precision=ct.precision.FLOAT32)\n", + "\n", + "coreml_model.author = 'Unum Cloud'\n", + "coreml_model.license = 'Apache 2.0'\n", + "coreml_model.short_description = 'Pocket-Sized Multimodal AI for Content Understanding'\n", + "coreml_model.save(\"../uform-vl-english-small-text.mlpackage\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python/scripts/test_embeddings.py b/python/scripts/test_embeddings.py index f831d72..d71bf0b 100644 --- a/python/scripts/test_embeddings.py +++ b/python/scripts/test_embeddings.py @@ -11,7 +11,7 @@ torch_available = True except: torch_available = False - + # ONNX is not a very light dependency either try: import onnx @@ -34,6 +34,7 @@ ("unum-cloud/uform-vl-english-large", "gpu", "fp16"), ] + @pytest.mark.skipif(not torch_available, reason="PyTorch is not installed") @pytest.mark.parametrize("model_name", torch_models) def 
test_torch_one_embedding(model_name: str): @@ -141,3 +142,7 @@ def test_onnx_many_embeddings(model_specs: Tuple[str, str, str], batch_size: int except ExecutionProviderError as e: pytest.skip(f"Execution provider error: {e}") + + +if __name__ == "__main__": + pytest.main(["-s", "-x", __file__]) diff --git a/python/uform/numpy_preprocessor.py b/python/uform/numpy_preprocessor.py index 6cdd54b..a556db4 100644 --- a/python/uform/numpy_preprocessor.py +++ b/python/uform/numpy_preprocessor.py @@ -89,6 +89,9 @@ def _resize_crop_normalize(self, image: Image): bottom = (height + self._image_size) / 2 image = image.convert("RGB").crop((left, top, right, bottom)) + # At this point `image` is a PIL Image with RGB channels. + # If you convert it to `np.ndarray` it will have shape (H, W, C) where C is the number of channels. image = (np.array(image).astype(np.float32) / 255.0 - self.image_mean) / self.image_std + # To make it compatible with PyTorch, we need to transpose the image to (C, H, W). return np.transpose(image, (2, 0, 1)) diff --git a/python/uform/torch_models.py b/python/uform/torch_models.py index d0d810b..ab86622 100644 --- a/python/uform/torch_models.py +++ b/python/uform/torch_models.py @@ -207,6 +207,7 @@ def __post_init__(self): self.context_projection = nn.Linear(self.context_dim, self.dim, bias=False) else: self.context_projection = nn.Identity() + self.return_features = False def forward_features(self, x: Tensor, attn_mask: Tensor) -> Tensor: x = self.embed_text(x) @@ -267,10 +268,27 @@ def embed_text(self, x: Tensor) -> Tensor: x = self.word_embeddings(x) + positional_embedding return self.dropout(self.layer_norm(x)) - def forward(self, x: dict) -> Tensor: - features = self.forward_features(x["input_ids"], x["attention_mask"]) - embeddings = self.forward_embedding(features, x["attention_mask"]) - return features, embeddings + def forward( + self, + x: Union[Tensor, dict], + attention_mask: Optional[Tensor] = None, + return_features: Optional[bool] = None, + ) -> Tensor: + if isinstance(x, dict): + assert attention_mask is None, "If `x` is a dictionary, then `attention_mask` should be None" + attention_mask = x["attention_mask"] + x = x["input_ids"] + elif attention_mask is None: + # If no attention mask is provided - create one with all ones + attention_mask = torch.ones_like(x) + + features = self.forward_features(x, attention_mask) + embeddings = self.forward_embedding(features, attention_mask) + + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings @dataclass(eq=False) @@ -301,6 +319,7 @@ def __post_init__(self): self.norm = nn.LayerNorm(self.dim, eps=1e-6) self.embedding_projection = nn.Linear(self.dim, self.embedding_dim, bias=False) + self.return_features = False def forward_features(self, x: Tensor) -> Tensor: x = self.patch_embed(x).flatten(start_dim=2).transpose(2, 1) @@ -325,10 +344,13 @@ def forward_embedding(self, x: Tensor) -> Tensor: return self.embedding_projection(x) - def forward(self, x: Tensor) -> Tensor: + def forward(self, x: Tensor, return_features: Optional[bool] = None) -> Tensor: features = self.forward_features(x) embeddings = self.forward_embedding(features) - return features, embeddings + return_features = return_features if return_features is not None else self.return_features + if return_features: + return features, embeddings + return embeddings class VLM(nn.Module): @@ -430,7 +452,7 @@ def encode_multimodal( attention_mask if attention_mask 
is not None else text["attention_mask"], image_features, ) - + if return_scores: return self.get_matching_scores(embeddings), embeddings diff --git a/swift/Embeddings.swift b/swift/Embeddings.swift new file mode 100644 index 0000000..6d973ac --- /dev/null +++ b/swift/Embeddings.swift @@ -0,0 +1,403 @@ +// +// Embeddings.swift +// +// +// Created by Ash Vardanian on 3/27/24. +// +import Accelerate +import CoreGraphics +import CoreML +import Foundation +import Hub // `Config` +import Tokenizers // `AutoTokenizer` + +public enum Embedding { + case i32s([Int32]) + case f16s([Float16]) + case f32s([Float32]) + case f64s([Float64]) + + init?(from multiArray: MLMultiArray) { + switch multiArray.dataType { + case .float64: + self = .f64s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Float64.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + case .float32: + self = .f32s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Float32.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + case .float16: + self = .f16s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Float16.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + case .int32: + self = .i32s( + Array( + UnsafeBufferPointer( + start: multiArray.dataPointer.assumingMemoryBound(to: Int32.self), + count: Int(truncating: multiArray.shape[1]) + ) + ) + ) + @unknown default: + return nil // return nil for unsupported data types + } + } + + public func asFloats() -> [Float] { + switch self { + case .f32s(let array): + return array + case .i32s(let array): + return array.map { Float($0) } + case .f16s(let array): + return array.map { Float($0) } + case .f64s(let array): + return array.map { Float($0) } + } + } +} + +// MARK: - Helpers + +func readConfig(fromPath path: String) throws -> [String: Any] { + // If it's not an absolute path, let's assume it's a path relative to the current working directory + let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path + let data = try Data(contentsOf: URL(fileURLWithPath: absPath)) + return try JSONSerialization.jsonObject(with: data, options: []) as! [String: Any] +} + +func readModel(fromURL modelURL: URL) throws -> MLModel { + let compiledModelURL = try MLModel.compileModel(at: modelURL) + return try MLModel(contentsOf: compiledModelURL) +} + +func readModel(fromPath path: String) throws -> MLModel { + // If it's not an absolute path, let's assume it's a path relative to the current working directory + let absPath = path.hasPrefix("/") ? path : FileManager.default.currentDirectoryPath + "/" + path + let modelURL = URL(fileURLWithPath: absPath, isDirectory: true) + return try readModel(fromURL: modelURL) +} + +// MARK: - Encoders + +public class TextEncoder { + let model: MLModel + let processor: TextProcessor + + public init(modelPath: String, configPath: String? = nil, tokenizerPath: String? = nil) throws { + let finalConfigPath = configPath ?? modelPath + "/config.json" + let finalTokenizerPath = tokenizerPath ?? 
modelPath + "/tokenizer.json" + self.model = try readModel(fromPath: modelPath) + self.processor = try TextProcessor(configPath: finalConfigPath, tokenizerPath: finalTokenizerPath, model: self.model) + } + + + public init(modelName: String, hubApi: HubApi = .shared) async throws { + let repo = Hub.Repo(id: modelName) + let modelURL = try await hubApi.snapshot(from: repo, matching: ["text.mlpackage/*", "config.json", "tokenizer.json"]) + let configPath = modelURL.appendingPathComponent("config.json").path + let tokenizerPath = modelURL.appendingPathComponent("tokenizer.json").path + self.model = try readModel(fromURL: modelURL.appendingPathComponent("text.mlpackage", isDirectory: true)) + self.processor = try TextProcessor(configPath: configPath, tokenizerPath: tokenizerPath, model: self.model) + } + + public func forward(with text: String) throws -> Embedding { + let inputFeatureProvider = try self.processor.preprocess(text) + let prediction = try self.model.prediction(from: inputFeatureProvider) + guard let predictionFeature = prediction.featureValue(for: "embeddings"), + let output = predictionFeature.multiArrayValue, + let embedding = Embedding(from: output) + else { + throw NSError( + domain: "TextEncoder", + code: 0, + userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."] + ) + } + return embedding + } +} + +public class ImageEncoder { + let model: MLModel + let processor: ImageProcessor + + public init(modelPath: String, configPath: String? = nil) throws { + let finalConfigPath = configPath ?? modelPath + "/config.json" + self.model = try readModel(fromPath: modelPath) + self.processor = try ImageProcessor(configPath: finalConfigPath) + } + + public init(modelName: String, hubApi: HubApi = .shared) async throws { + let repo = Hub.Repo(id: modelName) + let modelURL = try await hubApi.snapshot(from: repo, matching: ["image.mlpackage/*", "config.json"]) + let configPath = modelURL.appendingPathComponent("config.json").path + self.model = try readModel(fromURL: modelURL.appendingPathComponent("image.mlpackage", isDirectory: true)) + self.processor = try ImageProcessor(configPath: configPath) + } + + public func forward(with image: CGImage) throws -> Embedding { + let inputFeatureProvider = try self.processor.preprocess(image) + let prediction = try self.model.prediction(from: inputFeatureProvider) + guard let predictionFeature = prediction.featureValue(for: "embeddings"), + let output = predictionFeature.multiArrayValue, + let embedding = Embedding(from: output) + else { + throw NSError( + domain: "ImageEncoder", + code: 0, + userInfo: [NSLocalizedDescriptionKey: "Failed to extract embeddings or unsupported data type."] + ) + } + return embedding + } +} + +// MARK: - Processors + +class TextProcessor { + let tokenizer: Tokenizer + let minContextLength: Int + let maxContextLength: Int + + public init(configPath: String, tokenizerPath: String, model: MLModel) throws { + var configDict = try readConfig(fromPath: configPath) + let tokenizerDict = try readConfig(fromPath: tokenizerPath) + + // Check if there's a specific 'text_encoder' configuration within the main configuration + if let textEncoderConfig = configDict["text_encoder"] as? 
[String: Any] { + configDict = textEncoderConfig // Use the specific 'text_encoder' configuration + } + + let config = Config(configDict) + let tokenizerData = Config(tokenizerDict) + self.tokenizer = try AutoTokenizer.from(tokenizerConfig: config, tokenizerData: tokenizerData) + + let inputDescription = model.modelDescription.inputDescriptionsByName["input_ids"] + guard let shapeConstraint = inputDescription?.multiArrayConstraint?.shapeConstraint else { + fatalError("Cannot obtain shape information") + } + + switch shapeConstraint.type { + case .enumerated: + minContextLength = shapeConstraint.enumeratedShapes[0][1].intValue + maxContextLength = minContextLength + case .range: + let range = inputDescription?.multiArrayConstraint?.shapeConstraint.sizeRangeForDimension[1] as? NSRange + minContextLength = range?.location ?? 1 + maxContextLength = range?.length ?? 128 + case .unspecified: + minContextLength = 128 + maxContextLength = 128 + @unknown default: + minContextLength = 128 + maxContextLength = 128 + } + } + + public func preprocess(_ text: String) throws -> MLFeatureProvider { + let inputIDs = self.tokenizer.encode(text: text) + return TextInput(inputIDs: inputIDs, sequenceLength: self.maxContextLength) + } +} + +class ImageProcessor { + let imageSize: Int + let mean: [Float] = [0.485, 0.456, 0.406] // Common mean values for normalization + let std: [Float] = [0.229, 0.224, 0.225] // Common std values for normalization + + init(configPath: String) throws { + var configDict = try readConfig(fromPath: configPath) + // Check if there's a specific 'image_encoder' configuration within the main configuration + if let imageEncoderConfig = configDict["image_encoder"] as? [String: Any] { + configDict = imageEncoderConfig + } + + let config = Config(configDict) + self.imageSize = config.imageSize!.intValue! + } + + func preprocess(_ cgImage: CGImage) throws -> MLFeatureProvider { + // Populate a tensor of size 3 x `imageSize` x `imageSize`, + // by resizing the image, then performing a center crop. + // Then normalize with the `mean` and `std` and export as a provider. + let cropped = resizeAndCrop(image: cgImage, toSideLength: self.imageSize)! + let normalized = exportToTensorAndNormalize(image: cropped, mean: self.mean, std: self.std)! + let featureValue = MLFeatureValue(multiArray: normalized) + return try ImageInput(precomputedFeature: featureValue) + } + + private func resizeAndCrop(image: CGImage, toSideLength imageSize: Int) -> CGImage? { + let originalWidth = CGFloat(image.width) + let originalHeight = CGFloat(image.height) + + // Calculate new size preserving the aspect ratio + let widthRatio = CGFloat(imageSize) / originalWidth + let heightRatio = CGFloat(imageSize) / originalHeight + let scaleFactor = max(widthRatio, heightRatio) + + let scaledWidth = originalWidth * scaleFactor + let scaledHeight = originalHeight * scaleFactor + + // Calculate the crop rectangle + let dx = (scaledWidth - CGFloat(imageSize)) / 2.0 + let dy = (scaledHeight - CGFloat(imageSize)) / 2.0 + guard + let context = CGContext( + data: nil, + width: imageSize, + height: imageSize, + bitsPerComponent: image.bitsPerComponent, + bytesPerRow: 0, + space: image.colorSpace ?? 
CGColorSpaceCreateDeviceRGB(), + bitmapInfo: image.bitmapInfo.rawValue + ) + else { return nil } + + // Draw the scaled and cropped image in the context + context.interpolationQuality = .high + context.draw(image, in: CGRect(x: -dx, y: -dy, width: scaledWidth, height: scaledHeight)) + return context.makeImage() + } + + private func exportToTensorAndNormalize(image: CGImage, mean: [Float], std: [Float]) -> MLMultiArray? { + let width = image.width + let height = image.height + + // Prepare the bitmap context for drawing the image. + var pixelData = [UInt8](repeating: 0, count: width * height * 4) + let colorSpace = CGColorSpaceCreateDeviceRGB() + let context = CGContext( + data: &pixelData, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: 4 * width, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) + context?.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height)) + + // Normalize the pixel data + var floatPixels = [Float](repeating: 0, count: width * height * 3) + for c in 0 ..< 3 { + for i in 0 ..< (width * height) { + floatPixels[i * 3 + c] = (Float(pixelData[i * 4 + c]) / 255.0 - mean[c]) / std[c] + } + } + + // Create the tensor array + var tensor = [Float](repeating: 0, count: 3 * width * height) + for i in 0 ..< (width * height) { + for c in 0 ..< 3 { + tensor[c * width * height + i] = floatPixels[i * 3 + c] + } + } + + let multiArray = try? MLMultiArray( + shape: [1, 3, NSNumber(value: height), NSNumber(value: width)], + dataType: .float32 + ) + for i in 0 ..< tensor.count { + multiArray?[i] = NSNumber(value: tensor[i]) + } + return multiArray + } + +} + +// MARK: - Feature Providers + +class TextInput: MLFeatureProvider { + var inputIDs: [Int] + var sequenceLength: Int + var paddingID: Int + + init(inputIDs: [Int], sequenceLength: Int, paddingID: Int = 0) { + self.inputIDs = inputIDs + self.sequenceLength = sequenceLength + self.paddingID = paddingID + } + + var featureNames: Set { + return Set(["input_ids", "attention_mask"]) + } + + // The model expects the input IDs to be an array of integers + // of length `sequenceLength`, padded with `paddingID` if necessary + func featureValue(for featureName: String) -> MLFeatureValue? { + switch featureName { + case "input_ids", "attention_mask": + return createFeatureValue(for: featureName) + default: + return nil + } + } + + private func createFeatureValue(for featureName: String) -> MLFeatureValue? { + let count = min(inputIDs.count, sequenceLength) + let totalElements = sequenceLength + guard let multiArray = try? 
MLMultiArray(shape: [1, NSNumber(value: totalElements)], dataType: .int32) else { + return nil + } + + if featureName == "input_ids" { + for i in 0 ..< count { + multiArray[i] = NSNumber(value: inputIDs[i]) + } + for i in count ..< totalElements { + multiArray[i] = NSNumber(value: paddingID) + } + } + else if featureName == "attention_mask" { + for i in 0 ..< count { + multiArray[i] = NSNumber(value: 1) + } + for i in count ..< totalElements { + multiArray[i] = NSNumber(value: 0) + } + } + + return MLFeatureValue(multiArray: multiArray) + } +} + +class ImageInput: MLFeatureProvider { + var precomputedFeature: MLFeatureValue + + init(precomputedFeature: MLFeatureValue) throws { + self.precomputedFeature = precomputedFeature + } + + var featureNames: Set { + return Set(["input"]) + } + + // The model expects the input IDs to be an array of integers + // of length `sequenceLength`, padded with `paddingID` if necessary + func featureValue(for featureName: String) -> MLFeatureValue? { + switch featureName { + case "input": + return precomputedFeature + default: + return nil + } + } +} diff --git a/swift/EmbeddingsTests.swift b/swift/EmbeddingsTests.swift new file mode 100644 index 0000000..5efb87f --- /dev/null +++ b/swift/EmbeddingsTests.swift @@ -0,0 +1,146 @@ +import CoreGraphics +import ImageIO +import UForm +import Hub +import XCTest + +final class TokenizerTests: XCTestCase { + + func cosineSimilarity(between vectorA: [T], and vectorB: [T]) -> T { + guard vectorA.count == vectorB.count else { + fatalError("Vectors must be of the same length.") + } + + let dotProduct = zip(vectorA, vectorB).reduce(T.zero) { $0 + ($1.0 * $1.1) } + let magnitudeA = sqrt(vectorA.reduce(T.zero) { $0 + $1 * $1 }) + let magnitudeB = sqrt(vectorB.reduce(T.zero) { $0 + $1 * $1 }) + + // Avoid division by zero + if magnitudeA == T.zero || magnitudeB == T.zero { + return T.zero + } + + return dotProduct / (magnitudeA * magnitudeB) + } + + func testTextEmbeddings() async throws { + + let api = HubApi(hfToken: "xxx") + let textModel = try await TextEncoder( + modelName: "unum-cloud/uform-vl2-english-small", + hubApi: api + ) + + let texts = [ + "sunny beach with clear blue water", + "crowded sandbeach under the bright sun", + "dense forest with tall green trees", + "quiet park in the morning light", + ] + + var textEmbeddings: [[Float32]] = [] + for text in texts { + let embedding: [Float32] = try textModel.forward(with: text).asFloats() + textEmbeddings.append(embedding) + } + + // Now let's compute the cosine similarity between the textEmbeddings + let similarityBeach = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[1]) + let similarityForest = cosineSimilarity(between: textEmbeddings[2], and: textEmbeddings[3]) + let dissimilarityBetweenScenes = cosineSimilarity(between: textEmbeddings[0], and: textEmbeddings[2]) + + // Assert that similar texts have higher similarity scores + XCTAssertTrue( + similarityBeach > dissimilarityBetweenScenes, + "Beach texts should be more similar to each other than to forest texts." + ) + XCTAssertTrue( + similarityForest > dissimilarityBetweenScenes, + "Forest texts should be more similar to each other than to beach texts." + ) + } + + func testImageEmbeddings() async throws { + + // One option is to use a local model repository. 
+ // + // let root = "uform/" + // let textModel = try TextEncoder( + // modelPath: root + "uform-vl-english-large-text.mlpackage", + // configPath: root + "uform-vl-english-large-text.json", + // tokenizerPath: root + "uform-vl-english-large-text.tokenizer.json" + // ) + // let imageModel = try ImageEncoder( + // modelPath: root + "uform-vl-english-large-image.mlpackage", + // configPath: root + "uform-vl-english-large-image.json" + // ) + // + // A better option is to fetch directly from HuggingFace, similar to how users would do that: + let api = HubApi(hfToken: "xxx") + let textModel = try await TextEncoder( + modelName: "unum-cloud/uform-vl2-english-small", + hubApi: api + ) + let imageModel = try await ImageEncoder( + modelName: "unum-cloud/uform-vl2-english-small", + hubApi: api + ) + + let texts = [ + "A group of friends enjoy a barbecue on a sandy beach, with one person grilling over a large black grill, while the other sits nearby, laughing and enjoying the camaraderie.", + "A white and orange cat stands on its hind legs, reaching towards a wicker basket filled with red raspberries on a wooden table in a garden, surrounded by orange flowers and a white teapot, creating a serene and whimsical scene.", + "A young girl in a yellow dress stands in a grassy field, holding an umbrella and looking at the camera, amidst rain.", + "This serene bedroom features a white bed with a black canopy, a gray armchair, a black dresser with a mirror, a vase with a plant, a window with white curtains, a rug, and a wooden floor, creating a tranquil and elegant atmosphere.", + "The image captures the iconic Louvre Museum in Paris, illuminated by warm lights against a dark sky, with the iconic glass pyramid in the center, surrounded by ornate buildings and a large courtyard, showcasing the museum's grandeur and historical significance.", + ] + let imageURLs = [ + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/bbq-on-beach.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/cat-in-garden.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/girl-and-rain.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/light-bedroom-furniture.jpg?raw=true", + "https://github.com/ashvardanian/ashvardanian/blob/master/demos/louvre-at-night.jpg?raw=true", + ] + + var textEmbeddings: [[Float32]] = [] + var imageEmbeddings: [[Float32]] = [] + for (text, imageURL) in zip(texts, imageURLs) { + guard let url = URL(string: imageURL), + let imageSource = CGImageSourceCreateWithURL(url as CFURL, nil), + let cgImage = CGImageSourceCreateImageAtIndex(imageSource, 0, nil) + else { + throw NSError( + domain: "ImageError", + code: 100, + userInfo: [NSLocalizedDescriptionKey: "Could not load image from URL: \(imageURL)"] + ) + } + + let textEmbedding: [Float32] = try textModel.forward(with: text).asFloats() + textEmbeddings.append(textEmbedding) + let imageEmbedding: [Float32] = try imageModel.forward(with: cgImage).asFloats() + imageEmbeddings.append(imageEmbedding) + } + + // Now let's make sure that the cosine distance between image and respective text embeddings is low. + // Make sure that the similarity between image and text at index `i` is higher than with other texts and images. 
+ for i in 0 ..< texts.count { + let pairSimilarity = cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[i]) + let otherTextSimilarities = (0 ..< texts.count).filter { $0 != i }.map { + cosineSimilarity(between: textEmbeddings[$0], and: imageEmbeddings[i]) + } + let otherImageSimilarities = (0 ..< texts.count).filter { $0 != i }.map { + cosineSimilarity(between: textEmbeddings[i], and: imageEmbeddings[$0]) + } + + XCTAssertTrue( + pairSimilarity > otherTextSimilarities.max()!, + "Text should be more similar to its corresponding image than to other images." + ) + XCTAssertTrue( + pairSimilarity > otherImageSimilarities.max()!, + "Text should be more similar to its corresponding image than to other texts." + ) + } + } + +}
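A minimal usage sketch of the Swift API introduced above, pieced together from `Embeddings.swift` and `EmbeddingsTests.swift`. The model name is the one used in the tests; the image file name is hypothetical, and fetching the checkpoint assumes the Hugging Face layout (`text.mlpackage`, `image.mlpackage`, `config.json`, `tokenizer.json`) that the `TextEncoder(modelName:)` and `ImageEncoder(modelName:)` initializers request:

```swift
import CoreGraphics
import Foundation
import ImageIO
import UForm

// Embed a caption and a local image, then compare them with cosine similarity,
// mirroring EmbeddingsTests.swift. Must run in an async throwing context.
func embedExample() async throws -> Float {
    let textModel = try await TextEncoder(modelName: "unum-cloud/uform-vl2-english-small")
    let imageModel = try await ImageEncoder(modelName: "unum-cloud/uform-vl2-english-small")

    let textEmbedding = try textModel.forward(with: "sunny beach with clear blue water").asFloats()

    // Decode an image from disk with ImageIO; the file name is hypothetical.
    let url = URL(fileURLWithPath: "beach.jpg") as CFURL
    guard let source = CGImageSourceCreateWithURL(url, nil),
        let cgImage = CGImageSourceCreateImageAtIndex(source, 0, nil)
    else { fatalError("Could not decode the image") }
    let imageEmbedding = try imageModel.forward(with: cgImage).asFloats()

    // Cosine similarity between the text and image embeddings.
    let dot = zip(textEmbedding, imageEmbedding).reduce(0) { $0 + $1.0 * $1.1 }
    let normText = (textEmbedding.reduce(0) { $0 + $1 * $1 }).squareRoot()
    let normImage = (imageEmbedding.reduce(0) { $0 + $1 * $1 }).squareRoot()
    return dot / (normText * normImage)
}
```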