pytorch · shoumikhin · Oct 14, 2025 · Oct 14, 2025
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+#import <ExecuTorch/ExecuTorch.h>
+
 #import "ExecuTorchLLMConfig.h"
 
 NS_ASSUME_NONNULL_BEGIN
@@ -29,6 +31,16 @@ __attribute__((deprecated("This API is experimental.")))
 __attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMImage : NSObject<NSCopying>
 
+/**
+ Initializes an image container from a tensor.
+
+ @param tensor   A tensor with shape {C, H, W} and dtype Byte or Float.
+ @return An initialized ExecuTorchLLMImage instance.
+*/
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor
+    NS_DESIGNATED_INITIALIZER
+    NS_SWIFT_NAME(init(_:));
+
 /**
  Initializes an image container with the provided data and dimensions.
 
@@ -41,16 +53,21 @@ __attribute__((objc_subclassing_restricted))
 - (instancetype)initWithData:(NSData *)data
                        width:(NSInteger)width
                       height:(NSInteger)height
-                    channels:(NSInteger)channels
-    NS_DESIGNATED_INITIALIZER;
+                    channels:(NSInteger)channels;
+
+/**
+ Initializes an image container with the provided float data and dimensions.
 
+ @param data       Float image buffer, wrapped as a {channels, height, width} tensor.
+ @param width      Image width in pixels.
+ @param height     Image height in pixels.
+ @param channels   Number of channels.
+ @return An initialized ExecuTorchLLMImage instance.
+*/
 - (instancetype)initWithFloatData:(NSData *)data
                             width:(NSInteger)width
                            height:(NSInteger)height
-                         channels:(NSInteger)channels
-    NS_DESIGNATED_INITIALIZER;
-
-@property(nonatomic, readonly) NSData *data;
+                         channels:(NSInteger)channels;
 
 @property(nonatomic, readonly) NSInteger width;
 
@@ -60,6 +77,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) BOOL isFloat;
 
+@property(nonatomic, readonly) ExecuTorchTensor *tensor;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 
@@ -73,6 +92,16 @@ __attribute__((deprecated("This API is experimental.")))
 __attribute__((objc_subclassing_restricted))
 @interface ExecuTorchLLMAudio : NSObject<NSCopying>
 
+/**
+ Initializes an audio features container from a tensor.
+
+ @param tensor   A tensor with shape {batchSize, bins, frames} and dtype Byte or Float.
+ @return An initialized ExecuTorchLLMAudio instance.
+*/
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor
+    NS_DESIGNATED_INITIALIZER
+    NS_SWIFT_NAME(init(_:));
+
 /**
  Initializes an audio features container with the provided data and shape.
 
@@ -85,16 +114,21 @@ __attribute__((objc_subclassing_restricted))
 - (instancetype)initWithData:(NSData *)data
                    batchSize:(NSInteger)batchSize
                         bins:(NSInteger)bins
-                      frames:(NSInteger)frames
-    NS_DESIGNATED_INITIALIZER;
+                      frames:(NSInteger)frames;
+
+/**
+ Initializes an audio features container with the provided float data and shape.
 
+ @param data        Float feature buffer, wrapped as a {batchSize, bins, frames} tensor.
+ @param batchSize   Batch dimension size.
+ @param bins        Number of frequency bins.
+ @param frames      Number of time frames.
+ @return An initialized ExecuTorchLLMAudio instance.
+*/
 - (instancetype)initWithFloatData:(NSData *)data
                         batchSize:(NSInteger)batchSize
                              bins:(NSInteger)bins
-                           frames:(NSInteger)frames
-    NS_DESIGNATED_INITIALIZER;
-
-@property(nonatomic, readonly) NSData *data;
+                           frames:(NSInteger)frames;
 
 @property(nonatomic, readonly) NSInteger batchSize;
 
@@ -104,6 +138,8 @@ __attribute__((objc_subclassing_restricted))
 
 @property(nonatomic, readonly) BOOL isFloat;
 
+@property(nonatomic, readonly) ExecuTorchTensor *tensor;
+
 + (instancetype)new NS_UNAVAILABLE;
 - (instancetype)init NS_UNAVAILABLE;
 

@@ -21,35 +21,60 @@ @interface ExecuTorchLLMConfig ()
 
 @end
 
-@implementation ExecuTorchLLMImage
+@implementation ExecuTorchLLMImage {
+  ExecuTorchTensor *_tensor;
+}
+
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor {
+  ET_CHECK(tensor);
+  self = [super init];
+  if (!self) return nil;  // no instance, so nothing to validate or store
+  ET_CHECK_MSG(tensor.shape.count == 3, "Image tensor must be rank-3 {C,H,W}");
+  ExecuTorchDataType dataType = tensor.dataType;
+  ET_CHECK_MSG(dataType == ExecuTorchDataTypeByte || dataType == ExecuTorchDataTypeFloat,
+               "Image tensor must be Byte or Float");
+  _tensor = tensor;  // kept by reference; no copy is made here
+  return self;
+}
 
 - (instancetype)initWithData:(NSData *)data
                        width:(NSInteger)width
                       height:(NSInteger)height
                     channels:(NSInteger)channels {
-  if (self = [super init]) {
-    _data = [data copy];
-    _width = width;
-    _height = height;
-    _channels = channels;
-    _isFloat = NO;
-  }
-  return self;
+  return [self initWithTensor:
+      [[ExecuTorchTensor alloc] initWithData:[data copy]  // snapshot in case data is mutable
+                                       shape:@[@(channels), @(height), @(width)]
+                                    dataType:ExecuTorchDataTypeByte]];
 }
 
 - (instancetype)initWithFloatData:(NSData *)data
                             width:(NSInteger)width
                            height:(NSInteger)height
                          channels:(NSInteger)channels {
-  self = [super init];
-  if (self) {
-    _data = [data copy];
-    _width = width;
-    _height = height;
-    _channels = channels;
-    _isFloat = YES;
-  }
-  return self;
+  return [self initWithTensor:
+      [[ExecuTorchTensor alloc] initWithData:[data copy]  // snapshot in case data is mutable
+                                       shape:@[@(channels), @(height), @(width)]
+                                    dataType:ExecuTorchDataTypeFloat]];
+}
+
+- (NSInteger)width {
+  return [_tensor.shape[2] integerValue];  // last extent of the rank-3 shape
+}
+
+- (NSInteger)height {
+  return [_tensor.shape[1] integerValue];  // middle extent of the rank-3 shape
+}
+
+- (NSInteger)channels {
+  return [_tensor.shape[0] integerValue];  // leading extent of the rank-3 shape
+}
+
+- (BOOL)isFloat {
+  return ExecuTorchDataTypeFloat == _tensor.dataType;  // NO for any other data type
+}
+
+- (ExecuTorchTensor *)tensor {
+  return self->_tensor;  // the stored instance itself, not a copy
 }
 
 - (id)copyWithZone:(NSZone *)zone {
@@ -58,35 +83,60 @@ - (id)copyWithZone:(NSZone *)zone {
 
 @end
 
-@implementation ExecuTorchLLMAudio
+@implementation ExecuTorchLLMAudio {
+  ExecuTorchTensor *_tensor;
+}
+
+- (instancetype)initWithTensor:(ExecuTorchTensor *)tensor {
+  ET_CHECK(tensor);
+  self = [super init];
+  if (!self) return nil;  // no instance, so nothing to validate or store
+  ET_CHECK_MSG(tensor.shape.count == 3, "Audio tensor must be rank-3 {B,bins,frames}");
+  ExecuTorchDataType dataType = tensor.dataType;
+  ET_CHECK_MSG(dataType == ExecuTorchDataTypeByte || dataType == ExecuTorchDataTypeFloat,
+               "Audio tensor must be Byte or Float");
+  _tensor = tensor;  // kept by reference; no copy is made here
+  return self;
+}
 
 - (instancetype)initWithData:(NSData *)data
                    batchSize:(NSInteger)batchSize
                         bins:(NSInteger)bins
                       frames:(NSInteger)frames {
-  if (self = [super init]) {
-    _data = [data copy];
-    _batchSize = batchSize;
-    _bins = bins;
-    _frames = frames;
-    _isFloat = NO;
-  }
-  return self;
+  return [self initWithTensor:
+      [[ExecuTorchTensor alloc] initWithData:[data copy]  // snapshot in case data is mutable
+                                       shape:@[@(batchSize), @(bins), @(frames)]
+                                    dataType:ExecuTorchDataTypeByte]];
 }
 
 - (instancetype)initWithFloatData:(NSData *)data
                         batchSize:(NSInteger)batchSize
                              bins:(NSInteger)bins
                            frames:(NSInteger)frames {
-  self = [super init];
-  if (self) {
-    _data = [data copy];
-    _batchSize = batchSize;
-    _bins = bins;
-    _frames = frames;
-    _isFloat = YES;
-  }
-  return self;
+  return [self initWithTensor:
+      [[ExecuTorchTensor alloc] initWithData:[data copy]  // snapshot in case data is mutable
+                                       shape:@[@(batchSize), @(bins), @(frames)]
+                                    dataType:ExecuTorchDataTypeFloat]];
+}
+
+- (NSInteger)batchSize {
+  return [_tensor.shape[0] integerValue];  // leading extent of the rank-3 shape
+}
+
+- (NSInteger)bins {
+  return [_tensor.shape[1] integerValue];  // middle extent of the rank-3 shape
+}
+
+- (NSInteger)frames {
+  return [_tensor.shape[2] integerValue];  // last extent of the rank-3 shape
+}
+
+- (BOOL)isFloat {
+  return ExecuTorchDataTypeFloat == _tensor.dataType;  // NO for any other data type
+}
+
+- (ExecuTorchTensor *)tensor {
+  return self->_tensor;  // the stored instance itself, not a copy
 }
 
 - (id)copyWithZone:(NSZone *)zone {
@@ -208,54 +258,16 @@ - (BOOL)generateWithInputs:(NSArray<ExecuTorchLLMMultimodalInput *> *)inputs
       case ExecuTorchLLMMultimodalInputTypeText:
         nativeInputs.emplace_back(llm::MultimodalInput(input.text.UTF8String));
         break;
-      case ExecuTorchLLMMultimodalInputTypeImage: {
-        ExecuTorchLLMImage *image = input.image;
-        if (image.isFloat) {
-          const float *buffer = (const float *)image.data.bytes;
-          size_t elementCount = (size_t)image.data.length / sizeof(float);
-          std::vector<float> data(buffer, buffer + elementCount);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-            std::move(data),
-            (int32_t)image.width,
-            (int32_t)image.height,
-            (int32_t)image.channels
-          )));
-        } else {
-          const uint8_t *buffer = (const uint8_t *)image.data.bytes;
-          std::vector<uint8_t> data(buffer, buffer + image.data.length);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
-            std::move(data),
-            (int32_t)image.width,
-            (int32_t)image.height,
-            (int32_t)image.channels
-          )));
-        }
+      case ExecuTorchLLMMultimodalInputTypeImage:
+        nativeInputs.emplace_back(llm::MultimodalInput(llm::Image(
+          make_tensor_ptr(*reinterpret_cast<TensorPtr *>(input.image.tensor.nativeInstance))
+        )));
         break;
-      }
-      case ExecuTorchLLMMultimodalInputTypeAudio: {
-        ExecuTorchLLMAudio *audio = input.audio;
-        if (audio.isFloat) {
-          const float *buffer = (const float *)audio.data.bytes;
-          size_t elementCount = (size_t)audio.data.length / sizeof(float);
-          std::vector<float> data(buffer, buffer + elementCount);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
-            std::move(data),
-            (int32_t)audio.batchSize,
-            (int32_t)audio.bins,
-            (int32_t)audio.frames
-          )));
-        } else {
-          const uint8_t *buffer = (const uint8_t *)audio.data.bytes;
-          std::vector<uint8_t> data(buffer, buffer + audio.data.length);
-          nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
-            std::move(data),
-            (int32_t)audio.batchSize,
-            (int32_t)audio.bins,
-            (int32_t)audio.frames
-          )));
-        }
+      case ExecuTorchLLMMultimodalInputTypeAudio:
+        nativeInputs.emplace_back(llm::MultimodalInput(llm::Audio(
+          make_tensor_ptr(*reinterpret_cast<TensorPtr *>(input.audio.tensor.nativeInstance))
+        )));
         break;
-      }
       default: {
         if (error) {
           *error = [NSError errorWithDomain:ExecuTorchLLMErrorDomain