Skip to content

Commit

Permalink
feat: generic image processing and size target
Browse files Browse the repository at this point in the history
  • Loading branch information
Saghen committed May 13, 2024
1 parent c6e1cfe commit c8814f4
Show file tree
Hide file tree
Showing 7 changed files with 315 additions and 66 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ The following is the default `chatPromptTemplate`, although newlines and indenti

#### Multi modal model

We currently support IDEFICS, hosted on TGI, and Claude 3 as multimodal models. You can enable it by setting `multimodal: true` in your `MODELS` configuration. For IDEFICS, you must have a PRO HF Api token. For Anthropic, see the [Anthropic section](#Anthropic)
We currently support IDEFICS (hosted on TGI), OpenAI, and Claude 3 as multimodal models. You can enable multimodal support by setting `multimodal: true` in your `MODELS` configuration. For IDEFICS, you must have a PRO HF API token. For OpenAI, see the [OpenAI section](#OpenAI). For Anthropic, see the [Anthropic section](#Anthropic).

```env
{
Expand Down
10 changes: 5 additions & 5 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,6 @@
"@google-cloud/vertexai": "^1.1.0",
"aws4fetch": "^1.0.17",
"cohere-ai": "^7.9.0",
"openai": "^4.14.2"
"openai": "^4.44.0"
}
}
45 changes: 28 additions & 17 deletions src/lib/server/endpoints/anthropic/endpointAnthropic.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@ import { env } from "$env/dynamic/private";
import type { TextGenerationStreamOutput } from "@huggingface/inference";
import type { ImageBlockParam, MessageParam } from "@anthropic-ai/sdk/resources";
import type { MessageFile } from "$lib/types/Message";
import { chooseMimeType, convertImage } from "../images";
import sharp from "sharp";
import {
createImageProcessorOptionsValidator,
makeImageProcessor,
type ImageProcessorOptions,
} from "../images";

export const endpointAnthropicParametersSchema = z.object({
weight: z.number().int().positive().default(1),
Expand All @@ -15,14 +18,25 @@ export const endpointAnthropicParametersSchema = z.object({
apiKey: z.string().default(env.ANTHROPIC_API_KEY ?? "sk-"),
defaultHeaders: z.record(z.string()).optional(),
defaultQuery: z.record(z.string()).optional(),
multimodal: z
.object({
image: createImageProcessorOptionsValidator({
supportedMimeTypes: ["image/png", "image/jpeg", "image/webp"],
preferredMimeType: "image/webp",
maxSizeInMB: (5 / 4) * 3,
maxWidth: 4096,
maxHeight: 4096,
}),
})
.default({}),
});

type NonSystemMessage = EndpointMessage & { from: "user" | "assistant" };

export async function endpointAnthropic(
input: z.input<typeof endpointAnthropicParametersSchema>
): Promise<Endpoint> {
const { baseURL, apiKey, model, defaultHeaders, defaultQuery } =
const { baseURL, apiKey, model, defaultHeaders, defaultQuery, multimodal } =
endpointAnthropicParametersSchema.parse(input);
let Anthropic;
try {
Expand Down Expand Up @@ -51,7 +65,9 @@ export async function endpointAnthropic(
return {
role: message.from,
content: [
...(await Promise.all((message.files ?? []).map(fileToImageBlock))),
...(await Promise.all(
(message.files ?? []).map((file) => fileToImageBlock(file, multimodal.image))
)),
{ type: "text", text: message.content },
],
};
Expand Down Expand Up @@ -107,24 +123,19 @@ export async function endpointAnthropic(
};
}

const supportedMimeTypes = ["image/jpeg", "image/gif", "image/webp"] as const;
async function fileToImageBlock(file: MessageFile): Promise<ImageBlockParam> {
let imageBase64 = file.value;

// Convert the image if it's an unsupported format
const chosenMime = chooseMimeType(supportedMimeTypes, "webp", file.mime);
if (chosenMime !== file.mime) {
const buffer = Buffer.from(file.value, "base64");
const convertedBuffer = await convertImage(sharp(buffer), chosenMime).toBuffer();
imageBase64 = convertedBuffer.toString("base64");
}
async function fileToImageBlock(
file: MessageFile,
opts: ImageProcessorOptions<"image/png" | "image/jpeg" | "image/webp">
): Promise<ImageBlockParam> {
const processor = makeImageProcessor(opts);
const { image, mime } = await processor(file);

return {
type: "image",
source: {
type: "base64",
media_type: chosenMime,
data: imageBase64,
media_type: mime,
data: image.toString("base64"),
},
};
}
185 changes: 177 additions & 8 deletions src/lib/server/endpoints/images.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,95 @@
import type { Sharp } from "sharp";
import sharp from "sharp";
import type { MessageFile } from "$lib/types/Message";
import { z } from "zod";

/**
 * Builds a zod schema for image-processing options in which every field is
 * optional and falls back to the matching value from `defaults`.
 *
 * `supportedMimeTypes` and `preferredMimeType` only accept values drawn from
 * `defaults.supportedMimeTypes`. Omitting the options object altogether
 * yields `defaults`.
 *
 * @param defaults - Fallback values; its `supportedMimeTypes` also defines the allowed MIME types.
 * @returns A zod object schema whose own default is `defaults`.
 */
export function createImageProcessorOptionsValidator<MimeType extends string = string>(
	defaults: ImageProcessorOptions<MimeType>
) {
	// z.enum wants a non-empty tuple, so split the list into head and tail.
	const [firstMime, ...remainingMimes] = defaults.supportedMimeTypes;
	const allowedMime = z.enum<string, [MimeType, ...MimeType[]]>([firstMime, ...remainingMimes]);

	const positiveInt = () => z.number().int().positive();

	const optionsSchema = z.object({
		supportedMimeTypes: z.array(allowedMime).default(defaults.supportedMimeTypes),
		preferredMimeType: allowedMime.default(defaults.preferredMimeType),
		// Checked against the decoded (pre-base64) byte size. Base64 inflates the
		// payload by roughly 33%, so the default is expected to already leave that margin.
		maxSizeInMB: z.number().positive().default(defaults.maxSizeInMB),
		maxWidth: positiveInt().default(defaults.maxWidth),
		maxHeight: positiveInt().default(defaults.maxHeight),
	});

	return optionsSchema.default(defaults);
}

/** Limits and format preferences consumed by `makeImageProcessor`. */
export interface ImageProcessorOptions<MimeType extends string = string> {
/** MIME types the processed image is allowed to be emitted as */
supportedMimeTypes: MimeType[];
/** Fallback output MIME type; `chooseMimeType` throws if it is not in `supportedMimeTypes` */
preferredMimeType: MimeType;
/** Byte budget in MiB (multiplied by 1024 * 1024), compared against the decoded input buffer */
maxSizeInMB: number;
/** Largest accepted width; wider images are resized */
maxWidth: number;
/** Largest accepted height; taller images are resized */
maxHeight: number;
}
/**
 * Async function that takes a `MessageFile` and resolves to the processed
 * image bytes together with the MIME type they are encoded as.
 */
export type ImageProcessor<MimeType extends string = string> = (file: MessageFile) => Promise<{
image: Buffer;
mime: MimeType;
}>;
/**
 * Creates an image processor bound to the given limits.
 *
 * The returned async function decodes the base64 payload of a `MessageFile`, then:
 * - picks the output MIME type via `chooseMimeType`, asking for size reduction
 *   when the decoded input exceeds `maxSizeInMB`;
 * - resizes when the image exceeds `maxWidth`/`maxHeight` or the byte budget,
 *   using the dimensions returned by `chooseImageSize`;
 * - re-encodes when the output MIME differs from the input MIME or the input
 *   was over the byte budget.
 *
 * @param options - Supported/preferred MIME types and size limits.
 * @returns A function resolving to the processed image buffer and its MIME type.
 * @throws If image metadata or dimensions cannot be read; errors thrown by the helpers propagate too.
 */
export function makeImageProcessor<MimeType extends string = string>(
	options: ImageProcessorOptions<MimeType>
): ImageProcessor<MimeType> {
	return async (file) => {
		const { supportedMimeTypes, preferredMimeType, maxSizeInMB, maxWidth, maxHeight } = options;
		const { mime: inputMime, value: base64Payload } = file;

		const inputBytes = Buffer.from(base64Payload, "base64");
		let pipeline = sharp(inputBytes);

		const metadata = await pipeline.metadata();
		if (!metadata) throw Error("Failed to read image metadata");
		const width = metadata.width;
		const height = metadata.height;
		if (width === undefined || height === undefined) throw Error("Failed to read image size");

		const exceedsDimensions = width > maxWidth || height > maxHeight;
		const exceedsByteBudget = inputBytes.byteLength > maxSizeInMB * 1024 * 1024;

		const outputMime = chooseMimeType(supportedMimeTypes, preferredMimeType, inputMime, {
			preferSizeReduction: exceedsByteBudget,
		});

		// Shrink the image when either limit is exceeded
		if (exceedsDimensions || exceedsByteBudget) {
			const target = chooseImageSize({
				mime: outputMime,
				width,
				height,
				maxWidth,
				maxHeight,
				maxSizeInMB,
			});
			const sameSize = target.width === width && target.height === height;
			if (!sameSize) {
				pipeline = resizeImage(pipeline, target.width, target.height);
			}
		}

		// Re-encode on a format change. Oversized inputs are always re-encoded so
		// the output uses the encoder settings that chooseImageSize's estimate assumes.
		const mustReencode = outputMime !== inputMime || exceedsByteBudget;
		if (mustReencode) {
			pipeline = convertImage(pipeline, outputMime);
		}

		const image = await pipeline.toBuffer();
		return { image, mime: outputMime };
	};
}

/** Image format names, written without the `image/` MIME prefix. */
type OutputFormat = "png" | "jpeg" | "webp" | "avif" | "tiff" | "gif";
/** Runtime list containing every member of `OutputFormat`. */
const outputFormats: OutputFormat[] = ["png", "jpeg", "webp", "avif", "tiff", "gif"];
Expand All @@ -21,25 +112,103 @@ export function convertImage(sharpInst: Sharp, outputMime: string): Sharp {
// TODO: consider what to do about animated formats: apng, gif, animated webp, ...
const blocklistedMimes = ["image/heic", "image/heif"];

/** Defaults to preferred format or uses existing mime if supported */
export function chooseMimeType<T extends readonly string[]>(
/** Sorted from largest to smallest */
const mimesBySize = [
"image/png",
"image/tiff",
"image/gif",
"image/jpeg",
"image/webp",
"image/avif",
];

/**
* Defaults to preferred format or uses existing mime if supported
* When preferSizeReduction is true, it will choose the smallest format that is supported
**/
function chooseMimeType<T extends readonly string[]>(
supportedMimes: T,
preferredFormat: OutputFormat,
mime: string
preferredMime: string,
mime: string,
{ preferSizeReduction }: { preferSizeReduction: boolean }
): T[number] {
if (!supportedMimes.includes(`image/${preferredFormat}`)) {
if (!supportedMimes.includes(preferredMime)) {
const supportedMimesStr = supportedMimes.join(", ");
throw Error(
`Preferred format "${preferredFormat}" not found in supported mimes: ${supportedMimesStr}`
`Preferred format "${preferredMime}" not found in supported mimes: ${supportedMimesStr}`
);
}

const [type] = mime.split("/");
if (type !== "image") throw Error(`Received non-image mime type: ${mime}`);

if (supportedMimes.includes(mime)) return mime;
if (supportedMimes.includes(mime) && !preferSizeReduction) return mime;

if (blocklistedMimes.includes(mime)) throw Error(`Received blocklisted mime type: ${mime}`);

return `image/${preferredFormat}`;
const smallestMime = mimesBySize.findLast((m) => supportedMimes.includes(m));
return smallestMime ?? preferredMime;
}

/** Inputs for `chooseImageSize`. */
interface ImageSizeOptions {
	/** MIME type the image will be encoded as; selects the compression estimate */
	mime: string;
	/** Current width in pixels */
	width: number;
	/** Current height in pixels */
	height: number;
	/** Upper bound for the returned width (expected to be an integer) */
	maxWidth: number;
	/** Upper bound for the returned height (expected to be an integer) */
	maxHeight: number;
	/** Byte budget in MiB for the estimated encoded size */
	maxSizeInMB: number;
}

/**
 * Chooses output dimensions that fit within `maxWidth` x `maxHeight` and whose
 * estimated encoded size is below `maxSizeInMB`.
 *
 * The image is first scaled uniformly to fit the dimension limits, then both
 * sides are repeatedly divided by 1.1 until the size estimate fits the budget.
 * The size is only a guess (see `estimateImageSizeInBytes`), not a measurement.
 *
 * @returns The selected width and height (unchanged if the input already fits).
 * @throws If `mime` has no known compression ratio, or if no size above one
 *         pixel per side is estimated to fit within the budget.
 */
export function chooseImageSize({
	mime,
	width,
	height,
	maxWidth,
	maxHeight,
	maxSizeInMB,
}: ImageSizeOptions): { width: number; height: number } {
	const byteBudget = maxSizeInMB * 1024 * 1024;
	const biggestDiscrepancy = Math.max(1, width / maxWidth, height / maxHeight);

	// Clamp to the limits: width / (width / maxWidth) can land a hair above
	// maxWidth in floating point, and Math.ceil would then yield maxWidth + 1.
	let selectedWidth = Math.min(maxWidth, Math.ceil(width / biggestDiscrepancy));
	let selectedHeight = Math.min(maxHeight, Math.ceil(height / biggestDiscrepancy));

	do {
		const estimatedSize = estimateImageSizeInBytes(mime, selectedWidth, selectedHeight);
		if (estimatedSize < byteBudget) {
			return { width: selectedWidth, height: selectedHeight };
		}
		// Shrink both sides by ~9% per step and try again
		selectedWidth = Math.floor(selectedWidth / 1.1);
		selectedHeight = Math.floor(selectedHeight / 1.1);
	} while (selectedWidth > 1 && selectedHeight > 1);

	throw Error(`Failed to resize image to fit within ${maxSizeInMB}MB`);
}

/** Rough encoded-size / raw-RGBA-size ratio assumed for each output MIME type. */
const mimeToCompressionRatio: Record<string, number> = {
	"image/png": 1 / 2,
	"image/jpeg": 1 / 10,
	"image/webp": 1 / 4,
	"image/avif": 1 / 5,
	"image/tiff": 1,
	"image/gif": 1 / 5,
};

/**
 * Guesses the encoded size of an image in bytes from its format and dimensions.
 * Intended to be a worst-case guess.
 *
 * @throws If `mime` has no entry in `mimeToCompressionRatio`.
 **/
function estimateImageSizeInBytes(mime: string, width: number, height: number): number {
	const compressionRatio = mimeToCompressionRatio[mime];
	if (!compressionRatio) throw Error(`Unsupported image format: ${mime}`);

	const bitsPerPixel = 32; // Assuming 32-bit color depth for 8-bit R G B A
	const bytesPerPixel = bitsPerPixel / 8;
	const uncompressedSize = width * height * bytesPerPixel;

	return uncompressedSize * compressionRatio;
}

/**
 * Queues a resize on the given sharp instance so the image fits inside a
 * `maxWidth` x `maxHeight` box (sharp's `fit: "inside"` mode).
 *
 * @returns The value returned by `sharpInst.resize`.
 */
export function resizeImage(sharpInst: Sharp, maxWidth: number, maxHeight: number): Sharp {
	const boundingBox = { width: maxWidth, height: maxHeight, fit: "inside" as const };
	return sharpInst.resize(boundingBox);
}
Loading

0 comments on commit c8814f4

Please sign in to comment.