diff --git a/.vscode/settings.json b/.vscode/settings.json index 75cdc9a3b7..53b780fa4a 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,6 +14,7 @@ "arrayify", "astgrep", "astrojs", + "autocrop", "Automatable", "autopad", "azurecontentsafety", @@ -22,6 +23,7 @@ "azuretoken", "bitindex", "blockslist", + "bufferlike", "BYOG", "cancellers", "cctx", @@ -45,6 +47,7 @@ "createfile", "cringy", "cybercrime", + "dall", "DALLE", "dbgc", "dbgp", diff --git a/docs/genaisrc/blog-image.genai.mts b/docs/genaisrc/blog-image.genai.mts index e27fae6dcc..c6b3971815 100644 --- a/docs/genaisrc/blog-image.genai.mts +++ b/docs/genaisrc/blog-image.genai.mts @@ -10,7 +10,7 @@ if (env.vars.force || !(await workspace.stat(target))) { const { text: imagePrompt } = await runPrompt( (_) => { _.def("BLOG_POST", MD.content(file.content)) - _.$`Generate an image prompt for DALLE-3 that illustrates the contents of . + _.$`Generate an image prompt for gpt-image-1 that illustrates the contents of . Include specific description related to the content of . ${style}` }, @@ -24,10 +24,9 @@ Include specific description related to the content of . ${style}`, { mime: "image/png", - size: "1792x1024", - scale: 768 / 1792, + size: "landscape", maxHeight: 762, - style: "vivid", + model: "openai:gpt-image-1", } ) diff --git a/docs/public/blog/gpt-image-1.mp3 b/docs/public/blog/gpt-image-1.mp3 new file mode 100644 index 0000000000..3a0d355d79 Binary files /dev/null and b/docs/public/blog/gpt-image-1.mp3 differ diff --git a/docs/public/blog/gpt-image-1.txt b/docs/public/blog/gpt-image-1.txt new file mode 100644 index 0000000000..9ece4c104e --- /dev/null +++ b/docs/public/blog/gpt-image-1.txt @@ -0,0 +1 @@ +OpenAI just introduced gpt-image-1, the latest AI model designed to generate images from plain text prompts. This update is now available through both OpenAI’s own API and Azure AI Foundry, making it even easier for anyone to experiment with image generation. 
To test its abilities, the team generated pixel art cats with the same prompt across three models—DALL·E 2, DALL·E 3, and gpt-image-1—highlighting how each interprets the same creative task. The results show different visual styles and a unique flair from the new gpt-image-1, which brings fresh options for creators seeking distinctive, high-detail pixel art. With this launch, you can now compare outputs side by side, observing nuanced changes in how these models translate words into images—and discover which one fits your next idea best. \ No newline at end of file diff --git a/docs/src/content/docs/blog/88daddda0cbe49a60fe7b11db44b2f037c0e70f8469884df13e0bbaff8bb66de.png b/docs/src/content/docs/blog/88daddda0cbe49a60fe7b11db44b2f037c0e70f8469884df13e0bbaff8bb66de.png new file mode 100644 index 0000000000..187265e7bd Binary files /dev/null and b/docs/src/content/docs/blog/88daddda0cbe49a60fe7b11db44b2f037c0e70f8469884df13e0bbaff8bb66de.png differ diff --git a/docs/src/content/docs/blog/8ce06ae2b0bd7193701d7914faf3faf9b384ae6d3d8cb1d29113b47900aad66a.png b/docs/src/content/docs/blog/8ce06ae2b0bd7193701d7914faf3faf9b384ae6d3d8cb1d29113b47900aad66a.png new file mode 100644 index 0000000000..df29cafbd1 Binary files /dev/null and b/docs/src/content/docs/blog/8ce06ae2b0bd7193701d7914faf3faf9b384ae6d3d8cb1d29113b47900aad66a.png differ diff --git a/docs/src/content/docs/blog/9c8d4a6bd2b023110b8e716ca48acae431401adf1c8d816c9b986abefa6acafe.png b/docs/src/content/docs/blog/9c8d4a6bd2b023110b8e716ca48acae431401adf1c8d816c9b986abefa6acafe.png new file mode 100644 index 0000000000..a2c911f1a4 Binary files /dev/null and b/docs/src/content/docs/blog/9c8d4a6bd2b023110b8e716ca48acae431401adf1c8d816c9b986abefa6acafe.png differ diff --git a/docs/src/content/docs/blog/gpt-image-1.mdx b/docs/src/content/docs/blog/gpt-image-1.mdx new file mode 100644 index 0000000000..b07ee05019 --- /dev/null +++ b/docs/src/content/docs/blog/gpt-image-1.mdx @@ -0,0 +1,83 @@ +--- +title: GPT-Image-1 
+description: A new model for generating images from text prompts +author: pelikhan +date: 2025-04-25 +cover: + alt: Three side-by-side square frames, each showing a uniquely posed 8-bit style + pixel cat. Each frame visually represents image generation from different AI + models, using five flat corporate colors and minimalist geometric + backgrounds. The cats are simple, highly pixelated, and visually distinct + from one another, with no text or people present, creating a clean, + corporate, and comparative visual suitable for a blog. + image: ./gpt-image-1.png +tags: + - openai gpt-image-1 + - image generation + - dall-e comparison + - azure ai foundry + - pixel art cat +excerpt: Our team just launched support for the new OpenAI gpt-image-1 image + generation model, now available through both OpenAI’s API and Azure AI + Foundry. We compared gpt-image-1 to DALL·E 2 and DALL·E 3 by generating 8-bit + pixel cat images using the same prompt. Each model produces distinct visual + results, and gpt-image-1 brings its own style and interpretation. This update + helps you evaluate how current generative models handle familiar creative + tasks while leveraging advances in image synthesis. Try running the same + workflows you use for existing models to see how output and prompt handling + differ with gpt-image-1. + +--- +import BlogNarration from "../../../components/BlogNarration.astro" + + + +We've added support for the new OpenAI `gpt-image-1` image generation model. +You can try it out through OpenAI's API or Azure AI Foundry. + +```js 'model: "openai:gpt-image-1"' +... = await generateImage("...", { + model: "openai:gpt-image-1", +}) +``` + +To compare the performance of this model, here is a little script that +generates a pixelated cat image on DallE-2/3 and `gpt-image-1`. 
+ +```js title="images.genai.mjs" wrap +const { output } = env +for (const model of [ + "openai:dall-e-2", + "openai:dall-e-3", + "openai:gpt-image-1", +]) { + output.heading(3, `Model: ${model}`) + const { image, revisedPrompt } = await generateImage( + `a cute cat. only one. iconic, high details. 8-bit resolution.`, + { + maxWidth: 400, + mime: "image/png", + model, + size: "square", + } + ) + await env.output.image(image.filename) + output.fence(revisedPrompt) +} +``` + +### Model: openai:dall-e-2 + +![image](./88daddda0cbe49a60fe7b11db44b2f037c0e70f8469884df13e0bbaff8bb66de.png) + +### Model: openai:dall-e-3 + +![image](./8ce06ae2b0bd7193701d7914faf3faf9b384ae6d3d8cb1d29113b47900aad66a.png) + +``` +Visualize an adorable single feline, lavishly detailed, represented in charming 8-bit resolution. This cat is incredibly distinctive and recognizable, with unique features that make it stand out from the norm. Consider adding intricate patterns on its fur or any other unusual characteristics to boost the iconic nature of this cute cat. 
+``` + +### Model: openai:gpt-image-1 + +![image](./9c8d4a6bd2b023110b8e716ca48acae431401adf1c8d816c9b986abefa6acafe.png) diff --git a/docs/src/content/docs/blog/gpt-image-1.png b/docs/src/content/docs/blog/gpt-image-1.png new file mode 100644 index 0000000000..522eb7186e Binary files /dev/null and b/docs/src/content/docs/blog/gpt-image-1.png differ diff --git a/packages/core/src/chat.ts b/packages/core/src/chat.ts index e0cbfb42ac..25dc40dcbc 100644 --- a/packages/core/src/chat.ts +++ b/packages/core/src/chat.ts @@ -167,7 +167,7 @@ export type SpeechFunction = ( export type CreateImageRequest = { model: string prompt: string - quality?: "hd" + quality?: string size?: string style?: string } diff --git a/packages/core/src/image.ts b/packages/core/src/image.ts index d2bcd8ca8e..f3dbf6f871 100644 --- a/packages/core/src/image.ts +++ b/packages/core/src/image.ts @@ -1,6 +1,3 @@ -import debug from "debug" -const dbg = debug("genaiscript:image") - // Import necessary functions and types from other modules import { resolveBufferLike } from "./bufferlike" import { @@ -16,6 +13,8 @@ import pLimit from "p-limit" import { CancellationOptions, checkCancelled } from "./cancellation" import { wrapColor, wrapRgbColor } from "./consolecolor" import { assert } from "console" +import { genaiscriptDebug } from "./debug" +const dbg = genaiscriptDebug("image") async function prepare( url: BufferLike, @@ -75,15 +74,33 @@ async function prepare( // Contain the image within specified max dimensions if provided if (options.maxWidth ?? options.maxHeight) { - dbg( - `containing image within ${options.maxWidth || ""}x${options.maxHeight || ""}` - ) - contain( - img, - img.width > maxWidth ? maxWidth : img.width, - img.height > maxHeight ? 
maxHeight : img.height, - HorizontalAlign.CENTER | VerticalAlign.MIDDLE - ) + if (options.maxWidth && !options.maxHeight) { + if (img.width > options.maxWidth) { + dbg(`resize width to %d`, options.maxWidth) + img.resize({ + w: options.maxWidth, + h: Math.ceil((img.height / img.width) * options.maxWidth), + }) + } + } else if (options.maxHeight && !options.maxWidth) { + if (img.height > options.maxHeight) { + dbg(`resize height to %d`, options.maxHeight) + img.resize({ + h: options.maxHeight, + w: Math.ceil((img.width / img.height) * options.maxHeight), + }) + } + } else { + dbg( + `containing image within ${options.maxWidth || ""}x${options.maxHeight || ""}` + ) + contain( + img, + img.width > maxWidth ? maxWidth : img.width, + img.height > maxHeight ? maxHeight : img.height, + HorizontalAlign.CENTER | VerticalAlign.MIDDLE + ) + } } // Auto-crop the image if required by options diff --git a/packages/core/src/openai.ts b/packages/core/src/openai.ts index 7a10492f6c..daaab5d695 100644 --- a/packages/core/src/openai.ts +++ b/packages/core/src/openai.ts @@ -762,15 +762,56 @@ export async function OpenAIImageGeneration( const { trace } = options || {} let url = `${cfg.base}/images/generations` - const body = { + const isDallE = /^dall-e/i.test(model) + const isDallE2 = /^dall-e-2/i.test(model) + const isDallE3 = /^dall-e-3/i.test(model) + const isGpt = /^gpt-image/i.test(model) + + const body: any = { model, prompt, size, quality, style, - response_format: "b64_json", ...rest, } + + // auto is the default quality, so always delete it + if (body.quality === "auto" || isDallE2) delete body.quality + if (isDallE3) { + if (body.quality === "high") body.quality = "hd" + else delete body.quality + } + if (isGpt && body.quality === "hd") body.quality = "high" + if (!isDallE3) delete body.style + if (isDallE) body.response_format = "b64_json" + + if (isDallE3) { + if (body.size === "portrait") body.size = "1024x1792" + else if (body.size === "landscape") body.size = 
"1792x1024" + else if (body.size === "square") body.size = "1024x1024" + } else if (isDallE2) { + if ( + body.size === "portrait" || + body.size === "landscape" || + body.size === "square" + ) + body.size = "1024x1024" + } else if (isGpt) { + if (body.size === "portrait") body.size = "1024x1536" + else if (body.size === "landscape") body.size = "1536x1024" + else if (body.size === "square") body.size = "1024x1024" + } + + if (body.size === "auto") delete body.size + + dbg("%o", { + quality: body.quality, + style: body.style, + response_format: body.response_format, + size: body.size, + }) + if (cfg.type === "azure") { const version = cfg.version || AZURE_OPENAI_API_VERSION trace?.itemValue(`version`, version) @@ -784,7 +825,9 @@ export async function OpenAIImageGeneration( const fetch = await createFetch(options) try { - logInfo(`generate image with ${cfg.provider}:${cfg.model}`) + logInfo( + `generate image with ${cfg.provider}:${cfg.model} (this may take a while)` + ) const freq = { method: "POST", headers: { @@ -797,6 +840,7 @@ export async function OpenAIImageGeneration( trace?.itemValue(`url`, `[${url}](${url})`) traceFetchPost(trace, url, freq.headers, body) const res = await fetch(url, freq as any) + dbg(`response: %d %s`, res.status, res.statusText) trace?.itemValue(`status`, `${res.status} ${res.statusText}`) if (!res.ok) return { @@ -804,6 +848,7 @@ export async function OpenAIImageGeneration( error: (await res.json())?.error || res.statusText, } const j = await res.json() + dbg(`%O`, j) const revisedPrompt = j.data[0]?.revised_prompt if (revisedPrompt) trace?.details(`📷 revised prompt`, j.data[0].revised_prompt) diff --git a/packages/core/src/types/prompt_template.d.ts b/packages/core/src/types/prompt_template.d.ts index d671393188..a30558adbc 100644 --- a/packages/core/src/types/prompt_template.d.ts +++ b/packages/core/src/types/prompt_template.d.ts @@ -307,7 +307,7 @@ type ModelVisionType = OptionsOrString< > type ModelImageGenerationType = 
OptionsOrString< - "openai:dall-e-2" | "openai:dall-e-3" + "openai:gpt-image-1" | "openai:dall-e-2" | "openai:dall-e-3" > type ModelProviderType = OptionsOrString< @@ -4380,10 +4380,35 @@ type TranscriptionModelType = OptionsOrString< interface ImageGenerationOptions extends ImageTransformOptions { model?: OptionsOrString - quality?: "hd" + /** + * The quality of the image that will be generated. + * auto (default value) will automatically select the best quality for the given model. + * high, medium and low are supported for gpt-image-1. + * high is supported for dall-e-3. + * dall-e-2 ignores this flag + */ + quality?: "auto" | "low" | "medium" | "high" + /** + * Image size. + * For gpt-image-1: 1024x1024, 1536x1024 (landscape), 1024x1536 (portrait), or auto (default value) + * For dall-e: 256x256, 512x512, or 1024x1024 for dall-e-2, and one of 1024x1024, 1792x1024, or 1024x1792 for dall-e-3. + */ size?: OptionsOrString< - "256x256" | "512x512" | "1024x1024" | "1024x1792" | "1792x1024" + | "auto" + | "landscape" + | "portrait" + | "square" + | "1536x1024" + | "1024x1536" + | "256x256" + | "512x512" + | "1024x1024" + | "1024x1792" + | "1792x1024" > + /** + * Only used for DALL-E 3 + */ style?: OptionsOrString<"vivid" | "natural"> } diff --git a/packages/sample/genaisrc/image-gen.genai.mjs b/packages/sample/genaisrc/image-gen.genai.mjs index 262c277684..4c1c7c154a 100644 --- a/packages/sample/genaisrc/image-gen.genai.mjs +++ b/packages/sample/genaisrc/image-gen.genai.mjs @@ -1,6 +1,20 @@ -const { image, revisedPrompt } = await generateImage( - `a cute cat. only one. iconic, high details. 8-bit resolution.`, - { maxWidth: 400, mime: "image/png" } -) -env.output.image(image.filename) -env.output.fence(revisedPrompt) +const { output } = env +for (const model of [ + "openai:dall-e-2", + "openai:dall-e-3", + "openai:gpt-image-1", +]) { + output.heading(3, `Model: ${model}`) + const { image, revisedPrompt } = await generateImage( + `a cute cat. only one. iconic, high details. 
8-bit resolution.`, + { + maxWidth: 400, + autoCrop: true, + mime: "image/png", + model, + size: "square", + } + ) + await env.output.image(image.filename) + output.fence(revisedPrompt) +}