elizaOS · kroist · Dec 24, 2024 · Dec 31, 2024 · monilpat · Dec 24, 2024
diff --git a/packages/plugin-node/package.json b/packages/plugin-node/package.json
@@ -12,13 +12,13 @@
     "tsup.config.ts"
   ],
   "dependencies": {
-    "@elizaos/core": "workspace:*",
     "@aws-sdk/client-s3": "^3.705.0",
     "@aws-sdk/s3-request-presigner": "^3.705.0",
     "@cliqz/adblocker-playwright": "1.34.0",
     "@echogarden/espeak-ng-emscripten": "0.3.3",
     "@echogarden/kissfft-wasm": "0.2.0",
     "@echogarden/speex-resampler-wasm": "0.2.1",
+    "@elizaos/core": "workspace:*",
     "@huggingface/transformers": "3.0.2",
     "@opendocsg/pdf2md": "0.1.32",
     "@types/uuid": "10.0.0",
@@ -32,6 +32,7 @@
     "echogarden": "2.0.7",
     "espeak-ng": "1.0.2",
     "ffmpeg-static": "5.2.0",
+    "file-type": "^19.6.0",
     "fluent-ffmpeg": "2.1.3",
     "formdata-node": "6.0.3",
     "fs-extra": "11.2.0",

diff --git a/packages/plugin-node/src/services/image.ts b/packages/plugin-node/src/services/image.ts
@@ -21,6 +21,7 @@ import fs from "fs";
 import gifFrames from "gif-frames";
 import os from "os";
 import path from "path";
+import { resizeImageBuffer } from "./imageUtils";
 
 export class ImageDescriptionService
     extends Service
@@ -97,11 +98,13 @@ export class ImageDescriptionService
 
             if (model === models[ModelProviderName.LLAMALOCAL]) {
                 await this.initializeLocalModel();
+            } else if (model === models[ModelProviderName.ANTHROPIC]) {
+                this.modelId = "claude-3-haiku-20240307";
+                this.device = "cloud";
             } else {
                 this.modelId = "gpt-4o-mini";
                 this.device = "cloud";
             }
-
             this.initialized = true;
         }
 
@@ -111,7 +114,7 @@ export class ImageDescriptionService
                     "Runtime is required for OpenAI image recognition"
                 );
             }
-            return this.recognizeWithOpenAI(imageUrl);
+            return this.recognizeWithCloud(imageUrl);
         }
 
         this.queue.push(imageUrl);
@@ -130,7 +133,7 @@ export class ImageDescriptionService
         });
     }
 
-    private async recognizeWithOpenAI(
+    private async recognizeWithCloud(
         imageUrl: string
     ): Promise<{ title: string; description: string }> {
         const isGif = imageUrl.toLowerCase().endsWith(".gif");
@@ -159,13 +162,16 @@ export class ImageDescriptionService
 
             const prompt =
                 "Describe this image and give it a title. The first line should be the title, and then a line break, then a detailed description of the image. Respond with the format 'title\ndescription'";
-            const text = await this.requestOpenAI(
-                imageUrl,
-                imageData,
-                prompt,
-                isGif,
-                true
-            );
+            const text =
+                this.runtime.imageModelProvider === ModelProviderName.ANTHROPIC
+                    ? await this.requestAnthropic(imageData, prompt)
+                    : await this.requestOpenAI(
+                          imageUrl,
+                          imageData,
+                          prompt,
+                          isGif,
+                          true
+                      );
 
             const [title, ...descriptionParts] = text.split("\n");
             return {
@@ -218,7 +224,7 @@ export class ImageDescriptionService
                         Authorization: `Bearer ${this.runtime.getSetting("OPENAI_API_KEY")}`,
                     },
                     body: JSON.stringify({
-                        model: "gpt-4o-mini",
+                        model: this.modelId,
                         messages: [{ role: "user", content }],
                         max_tokens: shouldUseBase64 ? 500 : 300,
                     }),
@@ -252,6 +258,78 @@ export class ImageDescriptionService
         );
     }
 
+    /**
+     * Sends an image plus a text prompt to the Anthropic Messages API and
+     * returns the text of the first content block in the reply.
+     *
+     * The image is downscaled once (to at most 400x400) before any request is
+     * made; only the HTTP call itself is retried, up to 3 times in total.
+     *
+     * @param imageData - Raw image bytes to describe.
+     * @param prompt - Instruction text sent alongside the image.
+     * @returns The text of the first content block of the response.
+     * @throws The image-processing error if the image cannot be resized, or
+     *         the error from the final attempt if every request fails.
+     */
+    private async requestAnthropic(
+        imageData: Buffer,
+        prompt: string
+    ): Promise<string> {
+        const maxAttempts = 3;
+
+        const endpoint =
+            models[this.runtime.imageModelProvider].endpoint ??
+            "https://api.anthropic.com/v1";
+
+        // Resize image to 400x400 max, keeping the token count ~ 213.
+        // Done once up front: the result is the same for every attempt, and
+        // an image that cannot be decoded will not start working on a retry.
+        const resizedImage = await resizeImageBuffer(imageData, 400, 400);
+
+        // The request payload is identical across attempts, so build (and
+        // base64-encode) it a single time.
+        const requestBody = JSON.stringify({
+            model: this.modelId,
+            max_tokens: 300,
+            messages: [
+                {
+                    role: "user",
+                    content: [
+                        {
+                            type: "image",
+                            source: {
+                                type: "base64",
+                                media_type: resizedImage.mimeType,
+                                data: resizedImage.buffer.toString("base64"),
+                            },
+                        },
+                        {
+                            type: "text",
+                            text: prompt,
+                        },
+                    ],
+                },
+            ],
+        });
+
+        for (let attempt = 0; attempt < maxAttempts; attempt++) {
+            try {
+                const response = await fetch(endpoint + "/messages", {
+                    method: "POST",
+                    headers: {
+                        "Content-Type": "application/json",
+                        "x-api-key": `${this.runtime.getSetting("ANTHROPIC_API_KEY")}`,
+                        "anthropic-version": "2023-06-01",
+                    },
+                    body: requestBody,
+                });
+
+                if (!response.ok) {
+                    // Report the numeric status as well as the body so the
+                    // failure can be classified from the log line alone.
+                    throw new Error(
+                        `HTTP error! status: ${response.status}, body: ${await response.text()}`
+                    );
+                }
+
+                const data = await response.json();
+                const text = data?.content?.[0]?.text;
+                if (typeof text !== "string") {
+                    // Guard against replies without a leading text block
+                    // instead of failing later on an undefined value.
+                    throw new Error(
+                        "Unexpected Anthropic response format: missing text content"
+                    );
+                }
+                return text;
+            } catch (error) {
+                elizaLogger.error(
+                    `Anthropic request failed (attempt ${attempt + 1}):`,
+                    error
+                );
+                if (attempt === maxAttempts - 1) throw error;
+            }
+        }
+        // Not reachable at runtime (the last attempt either returns or
+        // rethrows); kept so every code path visibly ends in return/throw.
+        throw new Error(
+            "Failed to recognize image with Anthropic after 3 attempts"
+        );
+    }
+
     private async processQueue(): Promise<void> {
         if (this.processing || this.queue.length === 0) return;
 

diff --git a/packages/plugin-node/src/services/imageUtils.ts b/packages/plugin-node/src/services/imageUtils.ts
@@ -0,0 +1,79 @@
+import sharp from "sharp";
+import * as FileType from "file-type/core";
+
+/** Width and height of an image. */
+interface ImageDimensions {
+    width: number;
+    height: number;
+}
+
+/** Result of resizeImageBuffer: the output bytes plus descriptive metadata. */
+interface ProcessedImage {
+    /** Image bytes produced by the resize step. */
+    buffer: Buffer;
+    /** MIME type detected from the input buffer; always starts with "image/". */
+    mimeType: string;
+    /** Image dimensions before and after resizing. */
+    dimensions: {
+        original: ImageDimensions;
+        resized: ImageDimensions;
+    };
+}
+
+/**
+ * Downscales an image so that it fits inside `maxWidth` x `maxHeight` while
+ * keeping its aspect ratio. Images already within the bounds are not enlarged.
+ *
+ * @param imageBuffer - Raw bytes of the source image.
+ * @param maxWidth - Maximum output width; must be a positive finite number.
+ * @param maxHeight - Maximum output height; must be a positive finite number.
+ * @returns The resized bytes, the MIME type detected from the input, and the
+ *          original and resulting dimensions.
+ * @throws Error prefixed with "Image processing failed:" when the bounds are
+ *         invalid, the buffer is not a recognised image, its dimensions
+ *         cannot be read, or the resize itself fails.
+ */
+export async function resizeImageBuffer(
+    imageBuffer: Buffer,
+    maxWidth: number,
+    maxHeight: number
+): Promise<ProcessedImage> {
+    try {
+        if (
+            !Number.isFinite(maxWidth) ||
+            !Number.isFinite(maxHeight) ||
+            maxWidth <= 0 ||
+            maxHeight <= 0
+        ) {
+            throw new Error("maxWidth and maxHeight must be positive numbers");
+        }
+
+        // Detect MIME type
+        const fileTypeResult = await FileType.fileTypeFromBuffer(imageBuffer);
+        if (!fileTypeResult || !fileTypeResult.mime.startsWith("image/")) {
+            throw new Error("Invalid image format");
+        }
+
+        // Get original image metadata
+        const metadata = await sharp(imageBuffer).metadata();
+        if (!metadata.width || !metadata.height) {
+            throw new Error("Could not get image dimensions");
+        }
+
+        // Largest uniform scale factor that fits both bounds, capped at 1 so
+        // small images are never enlarged.
+        const scale = Math.min(
+            1,
+            maxWidth / metadata.width,
+            maxHeight / metadata.height
+        );
+
+        // Clamp to at least 1px: for extreme aspect ratios rounding would
+        // otherwise produce 0, which is not a valid resize dimension.
+        const targetWidth = Math.max(1, Math.round(metadata.width * scale));
+        const targetHeight = Math.max(1, Math.round(metadata.height * scale));
+
+        // Process the image; resolveWithObject exposes the dimensions of the
+        // buffer that was actually produced.
+        const { data: resizedBuffer, info } = await sharp(imageBuffer)
+            .resize(targetWidth, targetHeight, {
+                fit: "inside",
+                withoutEnlargement: true,
+            })
+            .toBuffer({ resolveWithObject: true });
+
+        return {
+            buffer: resizedBuffer,
+            // NOTE(review): this is the MIME type detected from the input
+            // buffer; it assumes the resize step keeps the input's encoding
+            // - confirm for every format that must be supported.
+            mimeType: fileTypeResult.mime,
+            dimensions: {
+                original: {
+                    width: metadata.width,
+                    height: metadata.height,
+                },
+                resized: {
+                    width: info.width,
+                    height: info.height,
+                },
+            },
+        };
+    } catch (error) {
+        if (error instanceof Error) {
+            throw new Error(`Image processing failed: ${error.message}`);
+        }
+        throw new Error("Image processing failed with unknown error");
+    }
+}