From 5edd6fd8111ad7a6ee25f1261e96005ae0bcd98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 30 Oct 2024 19:35:10 +0100 Subject: [PATCH 01/22] fast latent image preview --- examples/cli/main.cpp | 122 +++++++++++++++++++++++++++++++++++++++++- stable-diffusion.cpp | 30 +++++------ stable-diffusion.h | 5 +- 3 files changed, 138 insertions(+), 19 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index cf8f5b130..82f5fc368 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -787,6 +787,125 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 +const float flux_latent_rgb_proj[16][3] = { + {-0.0346, 0.0244, 0.0681}, + {0.0034, 0.0210, 0.0687}, + {0.0275, -0.0668, -0.0433}, + {-0.0174, 0.0160, 0.0617}, + {0.0859, 0.0721, 0.0329}, + {0.0004, 0.0383, 0.0115}, + {0.0405, 0.0861, 0.0915}, + {-0.0236, -0.0185, -0.0259}, + {-0.0245, 0.0250, 0.1180}, + {0.1008, 0.0755, -0.0421}, + {-0.0515, 0.0201, 0.0011}, + {0.0428, -0.0012, -0.0036}, + {0.0817, 0.0765, 0.0749}, + {-0.1264, -0.0522, -0.1103}, + {-0.0280, -0.0881, -0.0499}, + {-0.1262, -0.0982, -0.0778}}; + +// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 +const float sd3_latent_rgb_proj[16][3] = { + {-0.0645, 0.0177, 0.1052}, + {0.0028, 0.0312, 0.0650}, + {0.1848, 0.0762, 0.0360}, + {0.0944, 0.0360, 0.0889}, + {0.0897, 0.0506, -0.0364}, + {-0.0020, 0.1203, 0.0284}, + {0.0855, 0.0118, 0.0283}, + {-0.0539, 0.0658, 0.1047}, + {-0.0057, 0.0116, 0.0700}, + {-0.0412, 0.0281, -0.0039}, + {0.1106, 0.1171, 0.1220}, + {-0.0248, 0.0682, -0.0481}, + {0.0815, 0.0846, 0.1207}, + {-0.0120, -0.0055, -0.0867}, + {-0.0749, -0.0634, -0.0456}, + {-0.1418, -0.1457, -0.1259}, +}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sdxl_latent_rgb_proj[4][3] = { + 
{0.3651, 0.4232, 0.4341}, + {-0.2533, -0.0042, 0.1068}, + {0.1076, 0.1111, -0.0362}, + {-0.3165, -0.2492, -0.2188}}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sd_latent_rgb_proj[4][3]{ + {0.3512, 0.2297, 0.3227}, + {0.3250, 0.4974, 0.2350}, + {-0.2829, 0.1762, 0.2721}, + {-0.2120, -0.2616, -0.7177}}; + +void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) { + const int channel = 3; + int width = latents->ne[0]; + int height = latents->ne[1]; + int dim = latents->ne[2]; + + const float(*latent_rgb_proj)[channel]; + + if (dim == 16) { + // 16 channels VAE -> Flux or SD3 + + if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B /* || version == VERSION_SD3_5_2B*/) { + latent_rgb_proj = sd3_latent_rgb_proj; + } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) { + latent_rgb_proj = flux_latent_rgb_proj; + } else { + // unknown model + return; + } + + } else if (dim == 4) { + // 4 channels VAE + if (version == VERSION_SDXL) { + latent_rgb_proj = sdxl_latent_rgb_proj; + } else if (version == VERSION_SD1 || version == VERSION_SD2) { + latent_rgb_proj = sd_latent_rgb_proj; + } else { + // unknown model + return; + } + } else { + // unknown latent space + return; + } + uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); + int data_head = 0; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + int latent_id = (i * latents->nb[0] + j * latents->nb[1]); + float r = 0, g = 0, b = 0; + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + + // change range + r = r * .5 + .5; + g = g * .5 + .5; + b = b * .5 + .5; + + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? g <= 1 ? g : 1 : 0; + b = b >= 0 ? 
b <= 1 ? b : 1 : 0; + + data[data_head++] = (uint8_t)(r * 255.); + data[data_head++] = (uint8_t)(g * 255.); + data[data_head++] = (uint8_t)(b * 255.); + } + } + stbi_write_png("latent-preview.png", width, height, channel, data, 0); + free(data); +} + int main(int argc, const char* argv[]) { SDParams params; @@ -967,7 +1086,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + step_callback); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index a2d33bca2..05f858860 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -804,19 +804,8 @@ class StableDiffusionGGML { float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr) { - LOG_DEBUG("Sample"); - struct ggml_init_params params; - size_t data_size = ggml_row_size(init_latent->type, init_latent->ne[0]); - for (int i = 1; i < 4; i++) { - data_size *= init_latent->ne[i]; - } - data_size += 1024; - params.mem_size = data_size * 3; - params.mem_buffer = NULL; - params.no_alloc = false; - ggml_context* tmp_ctx = ggml_init(params); - + ggml_tensor* noise_mask = nullptr, + std::function step_callback = nullptr) { size_t steps = sigmas.size() - 1; // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(noise); @@ -988,6 +977,9 @@ class StableDiffusionGGML { } } + if (step_callback != nullptr) { + step_callback(step, denoised, version); + } return denoised; }; @@ -1213,7 +1205,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL) { + ggml_tensor* masked_image = NULL, + std::function step_callback = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
// However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1470,7 +1463,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask); + noise_mask, + step_callback); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1543,7 +1537,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + step_callback_t step_callback) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1621,7 +1616,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, skip_layers_vec, slg_scale, skip_layer_start, - skip_layer_end); + skip_layer_end, + step_callback); size_t t1 = ggml_time_ms(); diff --git a/stable-diffusion.h b/stable-diffusion.h index 8872bbaac..a3d1f0189 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -151,6 +151,8 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path, SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); +typedef void (*step_callback_t)(int, struct ggml_tensor*, enum SDVersion); + SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt, const char* negative_prompt, @@ -173,7 +175,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + step_callback_t step_callback = NULL); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, From 6bcf352fcd08b69d2dd209c54a55f1d104f225aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Wed, 30 Oct 2024 20:18:38 +0100 Subject: [PATCH 02/22] fix posix compile --- examples/cli/main.cpp | 2 +- stable-diffusion.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp 
index 82f5fc368..075c930c0 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1087,7 +1087,7 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, - step_callback); + (step_callback_t)step_callback); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, diff --git a/stable-diffusion.h b/stable-diffusion.h index a3d1f0189..462b80fb7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -151,7 +151,7 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path, SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -typedef void (*step_callback_t)(int, struct ggml_tensor*, enum SDVersion); +typedef void (*step_callback_t)(int, struct ggml_tensor*, int); SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt, From 155ead4a15c224f96e3fa03d2523dce3dea8edf9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 24 Nov 2024 19:07:52 +0100 Subject: [PATCH 03/22] preview: use new helper functions --- examples/cli/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 075c930c0..4e2361b49 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -851,9 +851,9 @@ void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version if (dim == 16) { // 16 channels VAE -> Flux or SD3 - if (version == VERSION_SD3_2B || version == VERSION_SD3_5_8B /* || version == VERSION_SD3_5_2B*/) { + if (sd_version_is_sd3(version)) { latent_rgb_proj = sd3_latent_rgb_proj; - } else if (version == VERSION_FLUX_DEV || version == VERSION_FLUX_SCHNELL) { + } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; } else { // unknown model From 3cbfa6d53cc77c0163aa99fde538275a77187aa1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 25 Nov 2024 00:28:35 +0100 Subject: [PATCH 04/22] move latent preview code to a separate file --- 
examples/cli/main.cpp | 83 +++---------------------------------------- latent-preview.h | 83 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 79 deletions(-) create mode 100644 latent-preview.h diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 4e2361b49..98a8ab07c 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -10,6 +10,8 @@ #include "flux.hpp" #include "stable-diffusion.h" +#include "latent-preview.h" + #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_STATIC #include "stb_image.h" @@ -787,59 +789,6 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 -const float flux_latent_rgb_proj[16][3] = { - {-0.0346, 0.0244, 0.0681}, - {0.0034, 0.0210, 0.0687}, - {0.0275, -0.0668, -0.0433}, - {-0.0174, 0.0160, 0.0617}, - {0.0859, 0.0721, 0.0329}, - {0.0004, 0.0383, 0.0115}, - {0.0405, 0.0861, 0.0915}, - {-0.0236, -0.0185, -0.0259}, - {-0.0245, 0.0250, 0.1180}, - {0.1008, 0.0755, -0.0421}, - {-0.0515, 0.0201, 0.0011}, - {0.0428, -0.0012, -0.0036}, - {0.0817, 0.0765, 0.0749}, - {-0.1264, -0.0522, -0.1103}, - {-0.0280, -0.0881, -0.0499}, - {-0.1262, -0.0982, -0.0778}}; - -// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 -const float sd3_latent_rgb_proj[16][3] = { - {-0.0645, 0.0177, 0.1052}, - {0.0028, 0.0312, 0.0650}, - {0.1848, 0.0762, 0.0360}, - {0.0944, 0.0360, 0.0889}, - {0.0897, 0.0506, -0.0364}, - {-0.0020, 0.1203, 0.0284}, - {0.0855, 0.0118, 0.0283}, - {-0.0539, 0.0658, 0.1047}, - {-0.0057, 0.0116, 0.0700}, - {-0.0412, 0.0281, -0.0039}, - {0.1106, 0.1171, 0.1220}, - {-0.0248, 0.0682, -0.0481}, - {0.0815, 0.0846, 0.1207}, - {-0.0120, -0.0055, -0.0867}, - {-0.0749, -0.0634, -0.0456}, - {-0.1418, -0.1457, -0.1259}, -}; - -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 -const float sdxl_latent_rgb_proj[4][3] 
= { - {0.3651, 0.4232, 0.4341}, - {-0.2533, -0.0042, 0.1068}, - {0.1076, 0.1111, -0.0362}, - {-0.3165, -0.2492, -0.2188}}; - -// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 -const float sd_latent_rgb_proj[4][3]{ - {0.3512, 0.2297, 0.3227}, - {0.3250, 0.4974, 0.2350}, - {-0.2829, 0.1762, 0.2721}, - {-0.2120, -0.2616, -0.7177}}; - void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) { const int channel = 3; int width = latents->ne[0]; @@ -875,33 +824,9 @@ void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version return; } uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); - int data_head = 0; - for (int j = 0; j < height; j++) { - for (int i = 0; i < width; i++) { - int latent_id = (i * latents->nb[0] + j * latents->nb[1]); - float r = 0, g = 0, b = 0; - for (int d = 0; d < dim; d++) { - float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); - r += value * latent_rgb_proj[d][0]; - g += value * latent_rgb_proj[d][1]; - b += value * latent_rgb_proj[d][2]; - } - - // change range - r = r * .5 + .5; - g = g * .5 + .5; - b = b * .5 + .5; - - // clamp rgb values to [0,1] range - r = r >= 0 ? r <= 1 ? r : 1 : 0; - g = g >= 0 ? g <= 1 ? g : 1 : 0; - b = b >= 0 ? b <= 1 ? 
b : 1 : 0; + + preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); - data[data_head++] = (uint8_t)(r * 255.); - data[data_head++] = (uint8_t)(g * 255.); - data[data_head++] = (uint8_t)(b * 255.); - } - } stbi_write_png("latent-preview.png", width, height, channel, data, 0); free(data); } diff --git a/latent-preview.h b/latent-preview.h new file mode 100644 index 000000000..5457c47ed --- /dev/null +++ b/latent-preview.h @@ -0,0 +1,83 @@ + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 +const float flux_latent_rgb_proj[16][3] = { + {-0.0346, 0.0244, 0.0681}, + {0.0034, 0.0210, 0.0687}, + {0.0275, -0.0668, -0.0433}, + {-0.0174, 0.0160, 0.0617}, + {0.0859, 0.0721, 0.0329}, + {0.0004, 0.0383, 0.0115}, + {0.0405, 0.0861, 0.0915}, + {-0.0236, -0.0185, -0.0259}, + {-0.0245, 0.0250, 0.1180}, + {0.1008, 0.0755, -0.0421}, + {-0.0515, 0.0201, 0.0011}, + {0.0428, -0.0012, -0.0036}, + {0.0817, 0.0765, 0.0749}, + {-0.1264, -0.0522, -0.1103}, + {-0.0280, -0.0881, -0.0499}, + {-0.1262, -0.0982, -0.0778}}; + +// https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 +const float sd3_latent_rgb_proj[16][3] = { + {-0.0645, 0.0177, 0.1052}, + {0.0028, 0.0312, 0.0650}, + {0.1848, 0.0762, 0.0360}, + {0.0944, 0.0360, 0.0889}, + {0.0897, 0.0506, -0.0364}, + {-0.0020, 0.1203, 0.0284}, + {0.0855, 0.0118, 0.0283}, + {-0.0539, 0.0658, 0.1047}, + {-0.0057, 0.0116, 0.0700}, + {-0.0412, 0.0281, -0.0039}, + {0.1106, 0.1171, 0.1220}, + {-0.0248, 0.0682, -0.0481}, + {0.0815, 0.0846, 0.1207}, + {-0.0120, -0.0055, -0.0867}, + {-0.0749, -0.0634, -0.0456}, + {-0.1418, -0.1457, -0.1259}, +}; + +// https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sdxl_latent_rgb_proj[4][3] = { + {0.3651, 0.4232, 0.4341}, + {-0.2533, -0.0042, 0.1068}, + {0.1076, 0.1111, -0.0362}, + {-0.3165, -0.2492, -0.2188}}; + +// 
https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 +const float sd_latent_rgb_proj[4][3]{ + {0.3512, 0.2297, 0.3227}, + {0.3250, 0.4974, 0.2350}, + {-0.2829, 0.1762, 0.2721}, + {-0.2120, -0.2616, -0.7177}}; + +void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) { + size_t buffer_head = 0; + for (int j = 0; j < height; j++) { + for (int i = 0; i < width; i++) { + size_t latent_id = (i * latents->nb[0] + j * latents->nb[1]); + float r = 0, g = 0, b = 0; + for (int d = 0; d < dim; d++) { + float value = *(float*)((char*)latents->data + latent_id + d * latents->nb[2]); + r += value * latent_rgb_proj[d][0]; + g += value * latent_rgb_proj[d][1]; + b += value * latent_rgb_proj[d][2]; + } + + // change range + r = r * .5f + .5f; + g = g * .5f + .5f; + b = b * .5f + .5f; + + // clamp rgb values to [0,1] range + r = r >= 0 ? r <= 1 ? r : 1 : 0; + g = g >= 0 ? g <= 1 ? g : 1 : 0; + b = b >= 0 ? b <= 1 ? 
b : 1 : 0; + + buffer[buffer_head++] = (uint8_t)(r * 255); + buffer[buffer_head++] = (uint8_t)(g * 255); + buffer[buffer_head++] = (uint8_t)(b * 255); + } + } +} \ No newline at end of file From 2e2a6319fafb8af20e7623036edfd771212d0403 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 26 Nov 2024 11:43:26 +0100 Subject: [PATCH 05/22] No defaults in c code --- stable-diffusion.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.h b/stable-diffusion.h index 462b80fb7..6443422b6 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -176,7 +176,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale, float skip_layer_start, float skip_layer_end, - step_callback_t step_callback = NULL); + step_callback_t step_callback); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, From 02ec23c79f10853eff40a9c31332ccce93ae1dcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 26 Nov 2024 11:56:57 +0100 Subject: [PATCH 06/22] Latent preview support for img2img and img2vid --- examples/cli/main.cpp | 6 ++++-- stable-diffusion.cpp | 36 +++++++++++++++++++++--------------- stable-diffusion.h | 6 ++++-- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 98a8ab07c..470e6f1cb 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -1033,7 +1033,8 @@ int main(int argc, const char* argv[]) { params.sample_method, params.sample_steps, params.strength, - params.seed); + params.seed, + (step_callback_t)step_callback); if (results == NULL) { printf("generate failed\n"); free_sd_ctx(sd_ctx); @@ -1081,7 +1082,8 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end); + params.skip_layer_end, + (step_callback_t)step_callback); } } diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 05f858860..35d4846ab 
100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -800,11 +800,11 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr, + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* noise_mask = nullptr, std::function step_callback = nullptr) { size_t steps = sigmas.size() - 1; // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); @@ -1201,11 +1201,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL, + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* masked_image = NULL, std::function step_callback = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. 
@@ -1538,7 +1538,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - step_callback_t step_callback) { + step_callback_t step_callback = NULL) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1651,7 +1651,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count = 0, float slg_scale = 0, float skip_layer_start = 0.01, - float skip_layer_end = 0.2) { + float skip_layer_end = 0.2, + step_callback_t step_callback = NULL) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1798,7 +1799,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image); + masked_image, + step_callback); size_t t2 = ggml_time_ms(); @@ -1820,7 +1822,8 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, enum sample_method_t sample_method, int sample_steps, float strength, - int64_t seed) { + int64_t seed, + step_callback_t step_callback = NULL) { if (sd_ctx == NULL) { return NULL; } @@ -1900,7 +1903,10 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sample_method, sigmas, -1, - SDCondition(NULL, NULL, NULL)); + SDCondition(NULL, NULL, NULL), + {}, + 0, 0, 0, + step_callback); int64_t t2 = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index 6443422b6..ad64717e7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -203,7 +203,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end); + float skip_layer_end, + step_callback_t step_callback); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -218,7 +219,8 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, enum sample_method_t sample_method, int 
sample_steps, float strength, - int64_t seed); + int64_t seed, + step_callback_t step_callback); typedef struct upscaler_ctx_t upscaler_ctx_t; From ba9a4479ae5f6714a32c84940037b802798a1a8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Tue, 26 Nov 2024 12:06:53 +0100 Subject: [PATCH 07/22] add latent-preview to .gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 38fe570df..2e520df2c 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,5 @@ test/ *.gguf output*.png models* -*.log \ No newline at end of file +*.log +latent-preview.png From 828c80d36ce8728ec575a0975e54eb71f05b9fd1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 19:10:51 +0100 Subject: [PATCH 08/22] Refactor latent preview + support tae/vae preview --- .gitignore | 2 +- examples/cli/main.cpp | 98 ++++++++++-------- stable-diffusion.cpp | 234 ++++++++++++++++++++++++++++++++---------- stable-diffusion.h | 20 +++- 4 files changed, 252 insertions(+), 102 deletions(-) diff --git a/.gitignore b/.gitignore index 2e520df2c..552d5673c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,4 @@ test/ output*.png models* *.log -latent-preview.png +preview.png diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 470e6f1cb..0ffe430bf 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -10,8 +10,6 @@ #include "flux.hpp" #include "stable-diffusion.h" -#include "latent-preview.h" - #define STB_IMAGE_IMPLEMENTATION #define STB_IMAGE_STATIC #include "stb_image.h" @@ -62,6 +60,13 @@ const char* modes_str[] = { "convert", }; +const char* previews_str[] = { + "none", + "proj", + "tae", + "vae", +}; + enum SDMode { TXT2IMG, IMG2IMG, @@ -131,6 +136,11 @@ struct SDParams { float slg_scale = 0.; float skip_layer_start = 0.01; float skip_layer_end = 0.2; + + sd_preview_policy_t preview_method = SD_PREVIEW_NONE; + int preview_interval = 1; + std::string 
preview_path = "preview.png"; + bool taesd_preview = false; }; void print_params(SDParams params) { @@ -509,6 +519,8 @@ void parse_args(int argc, const char** argv, SDParams& params) { params.diffusion_flash_attn = true; // can reduce MEM significantly } else if (arg == "--canny") { params.canny_preprocess = true; + } else if (arg == "--taesd-preview-only") { + params.taesd_preview = true; } else if (arg == "-b" || arg == "--batch-count") { if (++i >= argc) { invalid_arg = true; @@ -631,6 +643,35 @@ void parse_args(int argc, const char** argv, SDParams& params) { break; } params.skip_layer_end = std::stof(argv[i]); + } else if (arg == "--preview") { + if (++i >= argc) { + invalid_arg = true; + break; + } + const char* preview = argv[i]; + int preview_method = -1; + for (int m = 0; m < N_PREVIEWS; m++) { + if (!strcmp(preview, previews_str[m])) { + preview_method = m; + } + } + if (preview_method == -1) { + invalid_arg = true; + break; + } + params.preview_method = (sd_preview_policy_t)preview_method; + } else if (arg == "--preview-interval") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.preview_interval = std::stoi(argv[i]); + } else if (arg == "--preview-path") { + if (++i >= argc) { + invalid_arg = true; + break; + } + params.preview_path = argv[i]; } else { fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); print_usage(argc, argv); @@ -789,52 +830,17 @@ void sd_log_cb(enum sd_log_level_t level, const char* log, void* data) { fflush(out_stream); } -void step_callback(int step, struct ggml_tensor* latents, enum SDVersion version) { - const int channel = 3; - int width = latents->ne[0]; - int height = latents->ne[1]; - int dim = latents->ne[2]; - - const float(*latent_rgb_proj)[channel]; - - if (dim == 16) { - // 16 channels VAE -> Flux or SD3 - - if (sd_version_is_sd3(version)) { - latent_rgb_proj = sd3_latent_rgb_proj; - } else if (sd_version_is_flux(version)) { - latent_rgb_proj = flux_latent_rgb_proj; - } else { - // unknown 
model - return; - } +const char* preview_path; - } else if (dim == 4) { - // 4 channels VAE - if (version == VERSION_SDXL) { - latent_rgb_proj = sdxl_latent_rgb_proj; - } else if (version == VERSION_SD1 || version == VERSION_SD2) { - latent_rgb_proj = sd_latent_rgb_proj; - } else { - // unknown model - return; - } - } else { - // unknown latent space - return; - } - uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); - - preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); - - stbi_write_png("latent-preview.png", width, height, channel, data, 0); - free(data); +void step_callback(int step, sd_image_t image) { + stbi_write_png(preview_path, image.width, image.height, image.channel, image.data, 0); } int main(int argc, const char* argv[]) { SDParams params; parse_args(argc, argv, params); + preview_path = params.preview_path.c_str(); sd_set_log_callback(sd_log_cb, (void*)&params); @@ -944,7 +950,8 @@ int main(int argc, const char* argv[]) { params.clip_on_cpu, params.control_net_cpu, params.vae_on_cpu, - params.diffusion_flash_attn); + params.diffusion_flash_attn, + params.taesd_preview); if (sd_ctx == NULL) { printf("new_sd_ctx_t failed\n"); @@ -1012,6 +1019,8 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, + params.preview_method, + params.preview_interval, (step_callback_t)step_callback); } else { sd_image_t input_image = {(uint32_t)params.width, @@ -1033,8 +1042,7 @@ int main(int argc, const char* argv[]) { params.sample_method, params.sample_steps, params.strength, - params.seed, - (step_callback_t)step_callback); + params.seed); if (results == NULL) { printf("generate failed\n"); free_sd_ctx(sd_ctx); @@ -1083,6 +1091,8 @@ int main(int argc, const char* argv[]) { params.slg_scale, params.skip_layer_start, params.skip_layer_end, + params.preview_method, + params.preview_interval, (step_callback_t)step_callback); } } diff --git a/stable-diffusion.cpp 
b/stable-diffusion.cpp index 35d4846ab..5c4b36bb3 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -20,6 +20,8 @@ #define STB_IMAGE_STATIC #include "stb_image.h" +#include "latent-preview.h" + // #define STB_IMAGE_WRITE_IMPLEMENTATION // #define STB_IMAGE_WRITE_STATIC // #include "stb_image_write.h" @@ -48,8 +50,7 @@ const char* sampling_methods_str[] = { "iPNDM_v", "LCM", "DDIM \"trailing\"", - "TCD" -}; + "TCD"}; /*================================================== Helper Functions ================================================*/ @@ -159,7 +160,8 @@ class StableDiffusionGGML { bool clip_on_cpu, bool control_net_cpu, bool vae_on_cpu, - bool diffusion_flash_attn) { + bool diffusion_flash_attn, + bool tae_preview_only) { use_tiny_autoencoder = taesd_path.size() > 0; #ifdef SD_USE_CUDA LOG_DEBUG("Using CUDA backend"); @@ -351,7 +353,7 @@ class StableDiffusionGGML { diffusion_model->alloc_params_buffer(); diffusion_model->get_param_tensors(tensors); - if (!use_tiny_autoencoder) { + if (!use_tiny_autoencoder || tae_preview_only) { if (vae_on_cpu && !ggml_backend_is_cpu(backend)) { LOG_INFO("VAE Autoencoder: Using CPU backend"); vae_backend = ggml_backend_cpu_init(); @@ -361,7 +363,8 @@ class StableDiffusionGGML { first_stage_model = std::make_shared(vae_backend, model_loader.tensor_storages_types, "first_stage_model", vae_decode_only, false, version); first_stage_model->alloc_params_buffer(); first_stage_model->get_param_tensors(tensors, "first_stage_model"); - } else { + } + if (use_tiny_autoencoder) { tae_first_stage = std::make_shared(backend, model_loader.tensor_storages_types, "decoder.layers", vae_decode_only, version); } // first_stage_model->get_param_tensors(tensors, "first_stage_model."); @@ -453,9 +456,10 @@ class StableDiffusionGGML { size_t clip_params_mem_size = cond_stage_model->get_params_buffer_size(); size_t unet_params_mem_size = diffusion_model->get_params_buffer_size(); size_t vae_params_mem_size = 0; - if (!use_tiny_autoencoder) { 
+ if (!use_tiny_autoencoder || tae_preview_only) { vae_params_mem_size = first_stage_model->get_params_buffer_size(); - } else { + } + if (use_tiny_autoencoder) { if (!tae_first_stage->load_from_file(taesd_path)) { return false; } @@ -599,6 +603,7 @@ class StableDiffusionGGML { LOG_DEBUG("finished loaded file"); ggml_free(ctx); + use_tiny_autoencoder = use_tiny_autoencoder && !tae_preview_only; return true; } @@ -682,7 +687,7 @@ class StableDiffusionGGML { float curr_multiplier = kv.second; lora_state_diff[lora_name] -= curr_multiplier; } - + size_t rm = lora_state_diff.size() - lora_state.size(); if (rm != 0) { LOG_INFO("Attempting to apply %lu LoRAs (removing %lu applied LoRAs)", lora_state.size(), rm); @@ -785,27 +790,125 @@ class StableDiffusionGGML { return {c_crossattn, y, c_concat}; } - ggml_tensor* sample(ggml_context* work_ctx, - ggml_tensor* init_latent, - ggml_tensor* noise, - SDCondition cond, - SDCondition uncond, - ggml_tensor* control_hint, - float control_strength, - float min_cfg, - float cfg_scale, - float guidance, - float eta, - sample_method_t method, - const std::vector& sigmas, - int start_merge_step, - SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr, - std::function step_callback = nullptr) { + void preview_image(ggml_context* work_ctx, + int step, + struct ggml_tensor* latents, + enum SDVersion version, + sd_preview_policy_t preview_mode, + ggml_tensor* result, + std::function step_callback) { + const size_t channel = 3; + size_t width = latents->ne[0]; + size_t height = latents->ne[1]; + size_t dim = latents->ne[2]; + if (preview_mode == SD_PREVIEW_PROJ) { + const float(*latent_rgb_proj)[channel]; + + if (dim == 16) { + // 16 channels VAE -> Flux or SD3 + + if (sd_version_is_sd3(version)) { + latent_rgb_proj = sd3_latent_rgb_proj; + } else if (sd_version_is_flux(version)) { + latent_rgb_proj = flux_latent_rgb_proj; 
+ } else { + // unknown model + return; + } + + } else if (dim == 4) { + // 4 channels VAE + if (version == VERSION_SDXL) { + latent_rgb_proj = sdxl_latent_rgb_proj; + } else if (version == VERSION_SD1 || version == VERSION_SD2) { + latent_rgb_proj = sd_latent_rgb_proj; + } else { + // unknown model + return; + } + } else { + // unknown latent space + return; + } + uint8_t* data = (uint8_t*)malloc(width * height * channel * sizeof(uint8_t)); + + preview_latent_image(data, latents, latent_rgb_proj, width, height, dim); + sd_image_t image = { + width, + height, + channel, + data}; + step_callback(step, image); + free(image.data); + } else { + if (preview_mode == SD_PREVIEW_VAE) { + ggml_tensor_scale(latents, 1.0f / scale_factor); + if (vae_tiling) { + // split latent in 32x32 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + first_stage_model->compute(n_threads, in, true, &out); + }; + sd_tiling(latents, result, 8, 32, 0.5f, on_tiling); + } else { + first_stage_model->compute(n_threads, latents, true, &result); + } + first_stage_model->free_compute_buffer(); + + ggml_tensor_scale_output(result); + } else if (preview_mode == SD_PREVIEW_TAE) { + if (tae_first_stage == nullptr) { + LOG_WARN("TAE not found for preview"); + return; + } + if (vae_tiling) { + // split latent in 64x64 tiles and compute in several steps + auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { + tae_first_stage->compute(n_threads, in, true, &out); + }; + sd_tiling(latents, result, 8, 64, 0.5f, on_tiling); + } else { + tae_first_stage->compute(n_threads, latents, true, &result); + } + tae_first_stage->free_compute_buffer(); + } else { + return; + } + ggml_tensor_clamp(result, 0.0f, 1.0f); + sd_image_t image = { + width * 8, + height * 8, + channel, + sd_tensor_to_image(result)}; + ggml_tensor_scale(result, 0); + step_callback(step, image); + free(image.data); + } + } + + ggml_tensor* + sample(ggml_context* work_ctx, + 
ggml_tensor* init_latent, + ggml_tensor* noise, + SDCondition cond, + SDCondition uncond, + ggml_tensor* control_hint, + float control_strength, + float min_cfg, + float cfg_scale, + float guidance, + float eta, + sample_method_t method, + const std::vector& sigmas, + int start_merge_step, + SDCondition id_cond, + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* noise_mask = nullptr, + sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + int preview_interval = 1, + std::function step_callback = nullptr) { size_t steps = sigmas.size() - 1; // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(noise); @@ -836,6 +939,15 @@ class StableDiffusionGGML { } struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); + struct ggml_tensor* preview_tensor = NULL; + if (preview_mode != SD_PREVIEW_PROJ) { + preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, + (denoised->ne[0] * 8), + (denoised->ne[1] * 8), + 3, + denoised->ne[3]); + } + auto denoise = [&](ggml_tensor* input, float sigma, int step) -> ggml_tensor* { if (step == 1) { pretty_progress(0, (int)steps, 0); @@ -978,7 +1090,9 @@ class StableDiffusionGGML { } if (step_callback != nullptr) { - step_callback(step, denoised, version); + if (step % preview_interval == 0) { + preview_image(work_ctx, step, denoised, version, preview_mode, preview_tensor, step_callback); + } } return denoised; }; @@ -1122,7 +1236,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str, bool keep_clip_on_cpu, bool keep_control_net_cpu, bool keep_vae_on_cpu, - bool diffusion_flash_attn) { + bool diffusion_flash_attn, + bool tae_preview_only) { sd_ctx_t* sd_ctx = (sd_ctx_t*)malloc(sizeof(sd_ctx_t)); if (sd_ctx == NULL) { return NULL; @@ -1164,7 +1279,8 @@ sd_ctx_t* new_sd_ctx(const char* model_path_c_str, keep_clip_on_cpu, keep_control_net_cpu, keep_vae_on_cpu, - diffusion_flash_attn)) { + diffusion_flash_attn, + 
tae_preview_only)) { delete sd_ctx->sd; sd_ctx->sd = NULL; free(sd_ctx); @@ -1201,12 +1317,14 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL, - std::function step_callback = nullptr) { + std::vector skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* masked_image = NULL, + sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + int preview_interval = 1, + std::function step_callback = nullptr) { if (seed < 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1464,6 +1582,8 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, skip_layer_start, skip_layer_end, noise_mask, + preview_mode, + preview_interval, step_callback); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); @@ -1533,12 +1653,14 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - step_callback_t step_callback = NULL) { + int* skip_layers = NULL, + size_t skip_layers_count = 0, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + int preview_interval = 1, + step_callback_t step_callback = NULL) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1617,6 +1739,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, + preview_mode, + preview_interval, 
step_callback); size_t t1 = ggml_time_ms(); @@ -1647,12 +1771,14 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - step_callback_t step_callback = NULL) { + int* skip_layers = NULL, + size_t skip_layers_count = 0, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + int preview_interval = 1, + step_callback_t step_callback = NULL) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1800,6 +1926,8 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, skip_layer_start, skip_layer_end, masked_image, + preview_mode, + preview_interval, step_callback); size_t t2 = ggml_time_ms(); @@ -1822,8 +1950,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, enum sample_method_t sample_method, int sample_steps, float strength, - int64_t seed, - step_callback_t step_callback = NULL) { + int64_t seed) { if (sd_ctx == NULL) { return NULL; } @@ -1906,7 +2033,8 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, SDCondition(NULL, NULL, NULL), {}, 0, 0, 0, - step_callback); + (sd_preview_policy_t)0, 1, + NULL); int64_t t2 = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index ad64717e7..9b1f188f7 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -109,6 +109,14 @@ enum sd_log_level_t { SD_LOG_ERROR }; +enum sd_preview_policy_t { + SD_PREVIEW_NONE, + SD_PREVIEW_PROJ, + SD_PREVIEW_TAE, + SD_PREVIEW_VAE, + N_PREVIEWS +}; + typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); @@ -147,11 +155,12 @@ SD_API 
sd_ctx_t* new_sd_ctx(const char* model_path, bool keep_clip_on_cpu, bool keep_control_net_cpu, bool keep_vae_on_cpu, - bool diffusion_flash_attn); + bool diffusion_flash_attn, + bool tae_preview_only); SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -typedef void (*step_callback_t)(int, struct ggml_tensor*, int); +typedef void (*step_callback_t)(int, sd_image_t); SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt, @@ -176,6 +185,8 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale, float skip_layer_start, float skip_layer_end, + sd_preview_policy_t preview_mode, + int preview_interval, step_callback_t step_callback); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, @@ -204,6 +215,8 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, float slg_scale, float skip_layer_start, float skip_layer_end, + sd_preview_policy_t preview_mode, + int preview_interval, step_callback_t step_callback); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, @@ -219,8 +232,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, enum sample_method_t sample_method, int sample_steps, float strength, - int64_t seed, - step_callback_t step_callback); + int64_t seed); typedef struct upscaler_ctx_t upscaler_ctx_t; From 42af6058a6e6f08621275ac6a4e2d7fa8074ee28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 19:11:10 +0100 Subject: [PATCH 09/22] update usage --- examples/cli/main.cpp | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 0ffe430bf..b35f2dccb 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -190,6 +190,8 @@ void print_params(SDParams params) { printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? 
"true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); + printf(" preview_mode: %d\n", previews_str[params.preview_method]); + printf(" preview_interval: %d\n", params.preview_interval); } void print_usage(int argc, const char* argv[]) { @@ -197,16 +199,17 @@ void print_usage(int argc, const char* argv[]) { printf("\n"); printf("arguments:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" -M, --mode [MODEL] run mode (txt2img or img2img or convert, default: txt2img)\n"); + printf(" -M, --mode [MODE] run mode (txt2img or img2img or convert, default: txt2img)\n"); printf(" -t, --threads N number of threads to use during computation (default: -1)\n"); printf(" If threads <= 0, then threads will be set to the number of CPU physical cores\n"); printf(" -m, --model [MODEL] path to full model\n"); - printf(" --diffusion-model path to the standalone diffusion model\n"); - printf(" --clip_l path to the clip-l text encoder\n"); - printf(" --clip_g path to the clip-g text encoder\n"); - printf(" --t5xxl path to the the t5xxl text encoder\n"); + printf(" --diffusion-model [MODEL] path to the standalone diffusion model\n"); + printf(" --clip_l [ENCODER] path to the clip-l text encoder\n"); + printf(" --clip_g [ENCODER] path to the clip-g text encoder\n"); + printf(" --t5xxl [ENCODER] path to the the t5xxl text encoder\n"); printf(" --vae [VAE] path to vae\n"); - printf(" --taesd [TAESD_PATH] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); + printf(" --taesd [TAESD] path to taesd. Using Tiny AutoEncoder for fast decoding (low quality)\n"); + printf(" --taesd-preview-only prevents usage of taesd for decoding the final image. 
(for use with --preview %s)\n", previews_str[SD_PREVIEW_TAE]); printf(" --control-net [CONTROL_PATH] path to control net model\n"); printf(" --embd-dir [EMBEDDING_PATH] path to embeddings\n"); printf(" --stacked-id-embd-dir [DIR] path to PHOTOMAKER stacked id embeddings\n"); @@ -255,6 +258,10 @@ void print_usage(int argc, const char* argv[]) { printf(" This might crash if it is not supported by the backend.\n"); printf(" --control-net-cpu keep controlnet in cpu (for low vram)\n"); printf(" --canny apply canny preprocessor (edge detection)\n"); + printf(" --preview {%s,%s,%s,%s} preview method. (default is %s(disabled))\n", previews_str[0], previews_str[1], previews_str[2], previews_str[3], previews_str[SD_PREVIEW_NONE]); + printf(" %s is the fastest\n", previews_str[SD_PREVIEW_PROJ]); + printf(" --preview-interval [N] How often to save the image preview"); + printf(" --preview-path [PATH} path to write preview image to (default: ./preview.png)\n"); printf(" --color Colors the logging tags according to level\n"); printf(" -v, --verbose print extra info\n"); } From 570703f1fb8cfe52ba73a18f087a66c891b4e630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 19:22:24 +0100 Subject: [PATCH 10/22] Fix build + add warning --- stable-diffusion.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 5c4b36bb3..26f9ba471 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -797,10 +797,10 @@ class StableDiffusionGGML { sd_preview_policy_t preview_mode, ggml_tensor* result, std::function step_callback) { - const size_t channel = 3; - size_t width = latents->ne[0]; - size_t height = latents->ne[1]; - size_t dim = latents->ne[2]; + const uint32_t channel = 3; + uint32_t width = latents->ne[0]; + uint32_t height = latents->ne[1]; + uint32_t dim = latents->ne[2]; if (preview_mode == SD_PREVIEW_PROJ) { const float(*latent_rgb_proj)[channel]; @@ -812,6 
+812,7 @@ class StableDiffusionGGML { } else if (sd_version_is_flux(version)) { latent_rgb_proj = flux_latent_rgb_proj; } else { + LOG_WARN("No latent to RGB projection known for this model"); // unknown model return; } @@ -824,9 +825,11 @@ class StableDiffusionGGML { latent_rgb_proj = sd_latent_rgb_proj; } else { // unknown model + LOG_WARN("No latent to RGB projection known for this model"); return; } } else { + LOG_WARN("No latent to RGB projection known for this model"); // unknown latent space return; } From 93bfdbac8f4df155e4a7096f832fd633e47eed66 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 19:27:04 +0100 Subject: [PATCH 11/22] Disable preview by default in sdcpp too --- stable-diffusion.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 26f9ba471..265165347 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -909,7 +909,7 @@ class StableDiffusionGGML { float skip_layer_start = 0.01, float skip_layer_end = 0.2, ggml_tensor* noise_mask = nullptr, - sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, int preview_interval = 1, std::function step_callback = nullptr) { size_t steps = sigmas.size() - 1; @@ -1325,7 +1325,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float skip_layer_start = 0.01, float skip_layer_end = 0.2, ggml_tensor* masked_image = NULL, - sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, int preview_interval = 1, std::function step_callback = nullptr) { if (seed < 0) { @@ -1661,7 +1661,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, int preview_interval = 1, step_callback_t step_callback = NULL) { std::vector 
skip_layers_vec(skip_layers, skip_layers + skip_layers_count); @@ -1779,7 +1779,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float slg_scale = 0, float skip_layer_start = 0.01, float skip_layer_end = 0.2, - sd_preview_policy_t preview_mode = SD_PREVIEW_PROJ, + sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, int preview_interval = 1, step_callback_t step_callback = NULL) { std::vector skip_layers_vec(skip_layers, skip_layers + skip_layers_count); From 7390cf3a8fc9681373fc012f401413b3942dbf77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 19:29:40 +0100 Subject: [PATCH 12/22] Done not preload preview tensor when preview is disabled. --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 265165347..9b07b7ac5 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -943,7 +943,7 @@ class StableDiffusionGGML { struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* preview_tensor = NULL; - if (preview_mode != SD_PREVIEW_PROJ) { + if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, (denoised->ne[0] * 8), (denoised->ne[1] * 8), From d92ed6923406fb7b302ac491767793d738f1e963 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Fri, 13 Dec 2024 19:43:28 +0100 Subject: [PATCH 13/22] Fix VAE preview darkening --- stable-diffusion.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 9b07b7ac5..0028d3cf7 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -856,6 +856,7 @@ class StableDiffusionGGML { first_stage_model->compute(n_threads, latents, true, &result); } first_stage_model->free_compute_buffer(); + ggml_tensor_scale(latents, scale_factor); ggml_tensor_scale_output(result); } else if (preview_mode == SD_PREVIEW_TAE) { From 
747401cb48d1487d24b863a341bd777c9450ac4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 14 Dec 2024 00:12:09 +0100 Subject: [PATCH 14/22] Increase context memory when loading multiple auto encoders --- stable-diffusion.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 0028d3cf7..4a72a81f1 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1682,6 +1682,9 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } + if (sd_ctx->sd->first_stage_model != nullptr && sd_ctx->sd->tae_first_stage != nullptr) { + params.mem_size *= 2; + } params.mem_size += width * height * 3 * sizeof(float); params.mem_size *= batch_count; params.mem_buffer = NULL; From cff8dea401ad9dedc2623d416159c24a6da8cbe7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 14 Dec 2024 02:13:16 +0100 Subject: [PATCH 15/22] Increase context memory when previewing with auto encoder instead --- stable-diffusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 4a72a81f1..3c9549572 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1682,7 +1682,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - if (sd_ctx->sd->first_stage_model != nullptr && sd_ctx->sd->tae_first_stage != nullptr) { + if (preview_mode!=SD_PREVIEW_NONE && preview_mode!=SD_PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float); From 7ed5b7744d8f391c9bdecfafc18f0caf43d0200a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sat, 21 Dec 2024 00:55:27 +0100 Subject: [PATCH 16/22] latent-preview: fix compile warnings --- latent-preview.h | 80 ++++++++++++++++++++++++------------------------ 1 file changed, 40 
insertions(+), 40 deletions(-) diff --git a/latent-preview.h b/latent-preview.h index 5457c47ed..ca4d132f3 100644 --- a/latent-preview.h +++ b/latent-preview.h @@ -1,56 +1,56 @@ // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L152-L169 const float flux_latent_rgb_proj[16][3] = { - {-0.0346, 0.0244, 0.0681}, - {0.0034, 0.0210, 0.0687}, - {0.0275, -0.0668, -0.0433}, - {-0.0174, 0.0160, 0.0617}, - {0.0859, 0.0721, 0.0329}, - {0.0004, 0.0383, 0.0115}, - {0.0405, 0.0861, 0.0915}, - {-0.0236, -0.0185, -0.0259}, - {-0.0245, 0.0250, 0.1180}, - {0.1008, 0.0755, -0.0421}, - {-0.0515, 0.0201, 0.0011}, - {0.0428, -0.0012, -0.0036}, - {0.0817, 0.0765, 0.0749}, - {-0.1264, -0.0522, -0.1103}, - {-0.0280, -0.0881, -0.0499}, - {-0.1262, -0.0982, -0.0778}}; + {-0.0346f, 0.0244f, 0.0681f}, + {0.0034f, 0.0210f, 0.0687f}, + {0.0275f, -0.0668f, -0.0433f}, + {-0.0174f, 0.0160f, 0.0617f}, + {0.0859f, 0.0721f, 0.0329f}, + {0.0004f, 0.0383f, 0.0115f}, + {0.0405f, 0.0861f, 0.0915f}, + {-0.0236f, -0.0185f, -0.0259f}, + {-0.0245f, 0.0250f, 0.1180f}, + {0.1008f, 0.0755f, -0.0421f}, + {-0.0515f, 0.0201f, 0.0011f}, + {0.0428f, -0.0012f, -0.0036f}, + {0.0817f, 0.0765f, 0.0749f}, + {-0.1264f, -0.0522f, -0.1103f}, + {-0.0280f, -0.0881f, -0.0499f}, + {-0.1262f, -0.0982f, -0.0778f}}; // https://github.com/Stability-AI/sd3.5/blob/main/sd3_impls.py#L228-L246 const float sd3_latent_rgb_proj[16][3] = { - {-0.0645, 0.0177, 0.1052}, - {0.0028, 0.0312, 0.0650}, - {0.1848, 0.0762, 0.0360}, - {0.0944, 0.0360, 0.0889}, - {0.0897, 0.0506, -0.0364}, - {-0.0020, 0.1203, 0.0284}, - {0.0855, 0.0118, 0.0283}, - {-0.0539, 0.0658, 0.1047}, - {-0.0057, 0.0116, 0.0700}, - {-0.0412, 0.0281, -0.0039}, - {0.1106, 0.1171, 0.1220}, - {-0.0248, 0.0682, -0.0481}, - {0.0815, 0.0846, 0.1207}, - {-0.0120, -0.0055, -0.0867}, - {-0.0749, -0.0634, -0.0456}, - {-0.1418, -0.1457, -0.1259}, + {-0.0645f, 0.0177f, 0.1052f}, + {0.0028f, 0.0312f, 0.0650f}, + {0.1848f, 0.0762f, 0.0360f}, + {0.0944f, 0.0360f, 
0.0889f}, + {0.0897f, 0.0506f, -0.0364f}, + {-0.0020f, 0.1203f, 0.0284f}, + {0.0855f, 0.0118f, 0.0283f}, + {-0.0539f, 0.0658f, 0.1047f}, + {-0.0057f, 0.0116f, 0.0700f}, + {-0.0412f, 0.0281f, -0.0039f}, + {0.1106f, 0.1171f, 0.1220f}, + {-0.0248f, 0.0682f, -0.0481f}, + {0.0815f, 0.0846f, 0.1207f}, + {-0.0120f, -0.0055f, -0.0867f}, + {-0.0749f, -0.0634f, -0.0456f}, + {-0.1418f, -0.1457f, -0.1259f}, }; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 const float sdxl_latent_rgb_proj[4][3] = { - {0.3651, 0.4232, 0.4341}, - {-0.2533, -0.0042, 0.1068}, - {0.1076, 0.1111, -0.0362}, - {-0.3165, -0.2492, -0.2188}}; + {0.3651f, 0.4232f, 0.4341f}, + {-0.2533f, -0.0042f, 0.1068f}, + {0.1076f, 0.1111f, -0.0362f}, + {-0.3165f, -0.2492f, -0.2188f}}; // https://github.com/comfyanonymous/ComfyUI/blob/master/comfy/latent_formats.py#L32-L38 const float sd_latent_rgb_proj[4][3]{ - {0.3512, 0.2297, 0.3227}, - {0.3250, 0.4974, 0.2350}, - {-0.2829, 0.1762, 0.2721}, - {-0.2120, -0.2616, -0.7177}}; + {0.3512f, 0.2297f, 0.3227f}, + {0.3250f, 0.4974f, 0.2350f}, + {-0.2829f, 0.1762f, 0.2721f}, + {-0.2120f, -0.2616f, -0.7177f}}; void preview_latent_image(uint8_t* buffer, struct ggml_tensor* latents, const float (*latent_rgb_proj)[3], int width, int height, int dim) { size_t buffer_head = 0; From 133e7273498856fcb2fd4db16494c3d4ff1d5090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 29 Dec 2024 22:01:36 +0100 Subject: [PATCH 17/22] Fix compile --- stable-diffusion.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 3c9549572..f5448d496 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -1746,6 +1746,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, + NULL, preview_mode, preview_interval, step_callback); @@ -2039,7 +2040,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, -1, SDCondition(NULL, NULL, NULL), 
{}, - 0, 0, 0, + 0, 0, 0, NULL, (sd_preview_policy_t)0, 1, NULL); From fbef799f965fb3238ca8328611b15a9980451106 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 29 Dec 2024 22:01:01 +0100 Subject: [PATCH 18/22] fix print-params --- examples/cli/main.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index b35f2dccb..97e74bf2c 100644 --- a/examples/cli/main.cpp +++ b/examples/cli/main.cpp @@ -186,11 +186,11 @@ void print_params(SDParams params) { printf(" sample_steps: %d\n", params.sample_steps); printf(" strength(img2img): %.2f\n", params.strength); printf(" rng: %s\n", rng_type_to_str[params.rng_type]); - printf(" seed: %ld\n", params.seed); + printf(" seed: %lld\n", params.seed); printf(" batch_count: %d\n", params.batch_count); printf(" vae_tiling: %s\n", params.vae_tiling ? "true" : "false"); printf(" upscale_repeats: %d\n", params.upscale_repeats); - printf(" preview_mode: %d\n", previews_str[params.preview_method]); + printf(" preview_mode: %s\n", previews_str[params.preview_method]); printf(" preview_interval: %d\n", params.preview_interval); } From c4384bfeb937d64facee2273cf7992497c7a9982 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Mon, 30 Dec 2024 01:40:38 +0100 Subject: [PATCH 19/22] fix preview with unet inpaint models --- stable-diffusion.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index f5448d496..73d16ee56 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -819,9 +819,9 @@ class StableDiffusionGGML { } else if (dim == 4) { // 4 channels VAE - if (version == VERSION_SDXL) { + if (sd_version_is_sdxl(version)) { latent_rgb_proj = sdxl_latent_rgb_proj; - } else if (version == VERSION_SD1 || version == VERSION_SD2) { + } else if (sd_version_is_sd1(version) || sd_version_is_sd2(version)) { latent_rgb_proj = sd_latent_rgb_proj; } else { // 
unknown model @@ -1682,7 +1682,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast(10 * 1024 * 1024); // 10 MB } - if (preview_mode!=SD_PREVIEW_NONE && preview_mode!=SD_PREVIEW_PROJ) { + if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float); From 55b7377dbb4cde3fdea7a281e47bb9a1532005bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 5 Jan 2025 02:00:34 +0100 Subject: [PATCH 20/22] preview: do not spam pretty progress when using tiled vae/tae as preview --- stable-diffusion.cpp | 29 +++++++++++++++++++++++------ stable-diffusion.h | 2 ++ util.cpp | 6 ++++++ 3 files changed, 31 insertions(+), 6 deletions(-) diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index 73d16ee56..eb78a4b41 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -69,6 +69,14 @@ void calculate_alphas_cumprod(float* alphas_cumprod, } } +void suppress_pp(int step, int steps, float time, void* data) { + (void)step; + (void)steps; + (void)time; + (void)data; + return; +} + /*=============================================== StableDiffusionGGML ================================================*/ class StableDiffusionGGML { @@ -790,6 +798,14 @@ class StableDiffusionGGML { return {c_crossattn, y, c_concat}; } + void silent_tiling(ggml_tensor* input, ggml_tensor* output, const int scale, const int tile_size, const float tile_overlap_factor, on_tile_process on_processing) { + sd_progress_cb_t cb = sd_get_progress_callback(); + void* cbd = sd_get_progress_callback_data(); + sd_set_progress_callback((sd_progress_cb_t)suppress_pp, NULL); + sd_tiling(input, output, scale, tile_size, tile_overlap_factor, on_processing); + sd_set_progress_callback(cb, cbd); + } + void preview_image(ggml_context* work_ctx, int step, struct ggml_tensor* latents, @@ -851,7 +867,8 @@ class StableDiffusionGGML { auto on_tiling = 
[&](ggml_tensor* in, ggml_tensor* out, bool init) { first_stage_model->compute(n_threads, in, true, &out); }; - sd_tiling(latents, result, 8, 32, 0.5f, on_tiling); + silent_tiling(latents, result, 8, 32, 0.5f, on_tiling); + } else { first_stage_model->compute(n_threads, latents, true, &result); } @@ -869,7 +886,7 @@ class StableDiffusionGGML { auto on_tiling = [&](ggml_tensor* in, ggml_tensor* out, bool init) { tae_first_stage->compute(n_threads, in, true, &out); }; - sd_tiling(latents, result, 8, 64, 0.5f, on_tiling); + silent_tiling(latents, result, 8, 64, 0.5f, on_tiling); } else { tae_first_stage->compute(n_threads, latents, true, &result); } @@ -1076,10 +1093,6 @@ class StableDiffusionGGML { vec_denoised[i] = latent_result * c_out + vec_input[i] * c_skip; } int64_t t1 = ggml_time_us(); - if (step > 0) { - pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); - // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); - } if (noise_mask != nullptr) { for (int64_t x = 0; x < denoised->ne[0]; x++) { for (int64_t y = 0; y < denoised->ne[1]; y++) { @@ -1092,6 +1105,10 @@ class StableDiffusionGGML { } } } + if (step > 0) { + pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); + // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); + } if (step_callback != nullptr) { if (step % preview_interval == 0) { diff --git a/stable-diffusion.h b/stable-diffusion.h index 9b1f188f7..516290d59 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -122,6 +122,8 @@ typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); +SD_API sd_progress_cb_t sd_get_progress_callback(); +SD_API void* sd_get_progress_callback_data(); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); diff --git a/util.cpp b/util.cpp index 
da11a14d6..176ec4d9d 100644 --- a/util.cpp +++ b/util.cpp @@ -420,6 +420,12 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } +sd_progress_cb_t sd_get_progress_callback(){ + return sd_progress_cb; +} +void* sd_get_progress_callback_data(){ + return sd_progress_cb_data; +} const char* sd_get_system_info() { static char buffer[1024]; std::stringstream ss; From 53903e699036e2119fdb4bd4b0a445d7817c9636 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Sun, 5 Jan 2025 02:03:26 +0100 Subject: [PATCH 21/22] change log level of "processing %i tiles" --- ggml_extend.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml_extend.hpp b/ggml_extend.hpp index c5913be4d..8404a9977 100644 --- a/ggml_extend.hpp +++ b/ggml_extend.hpp @@ -627,7 +627,7 @@ __STATIC_INLINE__ void sd_tiling(ggml_tensor* input, ggml_tensor* output, const ggml_tensor* output_tile = ggml_new_tensor_4d(tiles_ctx, GGML_TYPE_F32, tile_size * scale, tile_size * scale, output->ne[2], 1); on_processing(input_tile, NULL, true); int num_tiles = ceil((float)input_width / non_tile_overlap) * ceil((float)input_height / non_tile_overlap); - LOG_INFO("processing %i tiles", num_tiles); + LOG_DEBUG("processing %i tiles", num_tiles); pretty_progress(1, num_tiles, 0.0f); int tile_count = 1; bool last_y = false, last_x = false; From efc6db855e16451450f3fe347e431dd59920e00e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20du=20Hamel?= Date: Thu, 6 Feb 2025 16:47:15 +0100 Subject: [PATCH 22/22] Refactor preview to match the other callbacks --- examples/cli/main.cpp | 31 +++++++-------- stable-diffusion.cpp | 88 +++++++++++++++++-------------------------- stable-diffusion.h | 33 +++++++--------- util.cpp | 25 +++++++++++- util.h | 7 ++++ 5 files changed, 90 insertions(+), 94 deletions(-) diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp index 97e74bf2c..cc1c14d36 100644 --- a/examples/cli/main.cpp 
+++ b/examples/cli/main.cpp @@ -137,10 +137,10 @@ struct SDParams { float skip_layer_start = 0.01; float skip_layer_end = 0.2; - sd_preview_policy_t preview_method = SD_PREVIEW_NONE; - int preview_interval = 1; - std::string preview_path = "preview.png"; - bool taesd_preview = false; + sd_preview_t preview_method = SD_PREVIEW_NONE; + int preview_interval = 1; + std::string preview_path = "preview.png"; + bool taesd_preview = false; }; void print_params(SDParams params) { @@ -666,7 +666,7 @@ void parse_args(int argc, const char** argv, SDParams& params) { invalid_arg = true; break; } - params.preview_method = (sd_preview_policy_t)preview_method; + params.preview_method = (sd_preview_t)preview_method; } else if (arg == "--preview-interval") { if (++i >= argc) { invalid_arg = true; @@ -850,6 +850,7 @@ int main(int argc, const char* argv[]) { preview_path = params.preview_path.c_str(); sd_set_log_callback(sd_log_cb, (void*)¶ms); + sd_set_preview_callback((sd_preview_cb_t)step_callback, params.preview_method, params.preview_interval); if (params.verbose) { print_params(params); @@ -1025,10 +1026,7 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end, - params.preview_method, - params.preview_interval, - (step_callback_t)step_callback); + params.skip_layer_end); } else { sd_image_t input_image = {(uint32_t)params.width, (uint32_t)params.height, @@ -1097,10 +1095,7 @@ int main(int argc, const char* argv[]) { params.skip_layers.size(), params.slg_scale, params.skip_layer_start, - params.skip_layer_end, - params.preview_method, - params.preview_interval, - (step_callback_t)step_callback); + params.skip_layer_end); } } @@ -1139,11 +1134,11 @@ int main(int argc, const char* argv[]) { std::string dummy_name, ext, lc_ext; bool is_jpg; - size_t last = params.output_path.find_last_of("."); + size_t last = params.output_path.find_last_of("."); size_t last_path = 
std::min(params.output_path.find_last_of("/"), params.output_path.find_last_of("\\")); - if (last != std::string::npos // filename has extension - && (last_path == std::string::npos || last > last_path)) { + if (last != std::string::npos // filename has extension + && (last_path == std::string::npos || last > last_path)) { dummy_name = params.output_path.substr(0, last); ext = lc_ext = params.output_path.substr(last); std::transform(ext.begin(), ext.end(), lc_ext.begin(), ::tolower); @@ -1151,7 +1146,7 @@ int main(int argc, const char* argv[]) { } else { dummy_name = params.output_path; ext = lc_ext = ""; - is_jpg = false; + is_jpg = false; } // appending ".png" to absent or unknown extension if (!is_jpg && lc_ext != ".png") { @@ -1163,7 +1158,7 @@ int main(int argc, const char* argv[]) { continue; } std::string final_image_path = i > 0 ? dummy_name + "_" + std::to_string(i + 1) + ext : dummy_name + ext; - if(is_jpg) { + if (is_jpg) { stbi_write_jpg(final_image_path.c_str(), results[i].width, results[i].height, results[i].channel, results[i].data, 90, get_image_params(params, params.seed + i).c_str()); printf("save result JPEG image to '%s'\n", final_image_path.c_str()); diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp index eb78a4b41..7d3253215 100644 --- a/stable-diffusion.cpp +++ b/stable-diffusion.cpp @@ -810,7 +810,7 @@ class StableDiffusionGGML { int step, struct ggml_tensor* latents, enum SDVersion version, - sd_preview_policy_t preview_mode, + sd_preview_t preview_mode, ggml_tensor* result, std::function step_callback) { const uint32_t channel = 3; @@ -922,14 +922,11 @@ class StableDiffusionGGML { const std::vector& sigmas, int start_merge_step, SDCondition id_cond, - std::vector skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* noise_mask = nullptr, - sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, - int preview_interval = 1, - std::function step_callback = nullptr) { + 
std::vector<int> skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* noise_mask = nullptr) { size_t steps = sigmas.size() - 1; // noise = load_tensor_from_file(work_ctx, "./rand0.bin"); // print_ggml_tensor(noise); @@ -961,7 +958,8 @@ class StableDiffusionGGML { struct ggml_tensor* denoised = ggml_dup_tensor(work_ctx, x); struct ggml_tensor* preview_tensor = NULL; - if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { preview_tensor = ggml_new_tensor_4d(work_ctx, GGML_TYPE_F32, (denoised->ne[0] * 8), (denoised->ne[1] * 8), @@ -1109,10 +1107,11 @@ class StableDiffusionGGML { pretty_progress(step, (int)steps, (t1 - t0) / 1000000.f); // LOG_INFO("step %d sampling completed taking %.2fs", step, (t1 - t0) * 1.0f / 1000000); } - - if (step_callback != nullptr) { - if (step % preview_interval == 0) { - preview_image(work_ctx, step, denoised, version, preview_mode, preview_tensor, step_callback); + auto sd_preview_cb = sd_get_preview_callback(); + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_cb != NULL) { + if (step % sd_get_preview_interval() == 0) { + preview_image(work_ctx, step, denoised, version, sd_preview_mode, preview_tensor, sd_preview_cb); } } return denoised; @@ -1338,14 +1337,11 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, std::string input_id_images_path, - std::vector<int> skip_layers = {}, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - ggml_tensor* masked_image = NULL, - sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, - int preview_interval = 1, - std::function<void(int, sd_image_t)> step_callback = nullptr) { + std::vector<int> skip_layers = {}, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2, + ggml_tensor* masked_image = NULL) { if (seed 
< 0) { // Generally, when using the provided command line, the seed is always >0. // However, to prevent potential issues if 'stable-diffusion.cpp' is invoked as a library @@ -1602,10 +1598,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - noise_mask, - preview_mode, - preview_interval, - step_callback); + noise_mask); // struct ggml_tensor* x_0 = load_tensor_from_file(ctx, "samples_ddim.bin"); // print_ggml_tensor(x_0); @@ -1674,14 +1667,11 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, - int preview_interval = 1, - step_callback_t step_callback = NULL) { + int* skip_layers = NULL, + size_t skip_layers_count = 0, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2) { std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("txt2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1699,7 +1689,8 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, if (sd_ctx->sd->stacked_id) { params.mem_size += static_cast<size_t>(10 * 1024 * 1024); // 10 MB } - if (preview_mode != SD_PREVIEW_NONE && preview_mode != SD_PREVIEW_PROJ) { + auto sd_preview_mode = sd_get_preview_mode(); + if (sd_preview_mode != SD_PREVIEW_NONE && sd_preview_mode != SD_PREVIEW_PROJ) { params.mem_size *= 2; } params.mem_size += width * height * 3 * sizeof(float); @@ -1763,10 +1754,7 @@ sd_image_t* txt2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - NULL, - preview_mode, - preview_interval, - step_callback); + NULL); size_t t1 = ggml_time_ms(); @@ -1796,14 +1784,11 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, float style_ratio, bool normalize_input, const char* input_id_images_path_c_str, - int* skip_layers = NULL, - size_t 
skip_layers_count = 0, - float slg_scale = 0, - float skip_layer_start = 0.01, - float skip_layer_end = 0.2, - sd_preview_policy_t preview_mode = SD_PREVIEW_NONE, - int preview_interval = 1, - step_callback_t step_callback = NULL) { + int* skip_layers = NULL, + size_t skip_layers_count = 0, + float slg_scale = 0, + float skip_layer_start = 0.01, + float skip_layer_end = 0.2) { std::vector<int> skip_layers_vec(skip_layers, skip_layers + skip_layers_count); LOG_DEBUG("img2img %dx%d", width, height); if (sd_ctx == NULL) { @@ -1950,10 +1935,7 @@ sd_image_t* img2img(sd_ctx_t* sd_ctx, slg_scale, skip_layer_start, skip_layer_end, - masked_image, - preview_mode, - preview_interval, - step_callback); + masked_image); size_t t2 = ggml_time_ms(); @@ -2057,9 +2039,7 @@ SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, -1, SDCondition(NULL, NULL, NULL), {}, - 0, 0, 0, NULL, - (sd_preview_policy_t)0, 1, - NULL); + 0, 0, 0, NULL); int64_t t2 = ggml_time_ms(); LOG_INFO("sampling completed, taking %.2fs", (t2 - t1) * 1.0f / 1000); diff --git a/stable-diffusion.h b/stable-diffusion.h index 516290d59..d422cea6d 100644 --- a/stable-diffusion.h +++ b/stable-diffusion.h @@ -109,7 +109,7 @@ enum sd_log_level_t { SD_LOG_ERROR }; -enum sd_preview_policy_t { +enum sd_preview_t { SD_PREVIEW_NONE, SD_PREVIEW_PROJ, SD_PREVIEW_TAE, @@ -117,23 +117,24 @@ enum sd_preview_policy_t { N_PREVIEWS }; +typedef struct { + uint32_t width; + uint32_t height; + uint32_t channel; + uint8_t* data; +} sd_image_t; + typedef void (*sd_log_cb_t)(enum sd_log_level_t level, const char* text, void* data); typedef void (*sd_progress_cb_t)(int step, int steps, float time, void* data); +typedef void (*sd_preview_cb_t)(int, sd_image_t); + SD_API void sd_set_log_callback(sd_log_cb_t sd_log_cb, void* data); SD_API void sd_set_progress_callback(sd_progress_cb_t cb, void* data); -SD_API sd_progress_cb_t sd_get_progress_callback(); -SD_API void* sd_get_progress_callback_data(); +SD_API void sd_set_preview_callback(sd_preview_cb_t cb, 
sd_preview_t mode, int interval); SD_API int32_t get_num_physical_cores(); SD_API const char* sd_get_system_info(); -typedef struct { - uint32_t width; - uint32_t height; - uint32_t channel; - uint8_t* data; -} sd_image_t; - typedef struct sd_ctx_t sd_ctx_t; SD_API sd_ctx_t* new_sd_ctx(const char* model_path, @@ -162,8 +163,6 @@ SD_API sd_ctx_t* new_sd_ctx(const char* model_path, SD_API void free_sd_ctx(sd_ctx_t* sd_ctx); -typedef void (*step_callback_t)(int, sd_image_t); - SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, const char* prompt, const char* negative_prompt, @@ -186,10 +185,7 @@ SD_API sd_image_t* txt2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end, - sd_preview_policy_t preview_mode, - int preview_interval, - step_callback_t step_callback); + float skip_layer_end); SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, sd_image_t init_image, @@ -216,10 +212,7 @@ SD_API sd_image_t* img2img(sd_ctx_t* sd_ctx, size_t skip_layers_count, float slg_scale, float skip_layer_start, - float skip_layer_end, - sd_preview_policy_t preview_mode, - int preview_interval, - step_callback_t step_callback); + float skip_layer_end); SD_API sd_image_t* img2vid(sd_ctx_t* sd_ctx, sd_image_t init_image, diff --git a/util.cpp b/util.cpp index 176ec4d9d..bf7178cab 100644 --- a/util.cpp +++ b/util.cpp @@ -247,6 +247,10 @@ int32_t get_num_physical_cores() { static sd_progress_cb_t sd_progress_cb = NULL; void* sd_progress_cb_data = NULL; +static sd_preview_cb_t sd_preview_cb = NULL; +sd_preview_t sd_preview_mode = SD_PREVIEW_NONE; +int sd_preview_interval = 1; + std::u32string utf8_to_utf32(const std::string& utf8_str) { std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> converter; return converter.from_bytes(utf8_str); @@ -420,10 +424,27 @@ void sd_set_progress_callback(sd_progress_cb_t cb, void* data) { sd_progress_cb = cb; sd_progress_cb_data = data; } -sd_progress_cb_t sd_get_progress_callback(){ +void sd_set_preview_callback(sd_preview_cb_t cb, 
sd_preview_t mode = SD_PREVIEW_PROJ, int interval = 1) { + sd_preview_cb = cb; + sd_preview_mode = mode; + sd_preview_interval = interval; +} + +sd_preview_cb_t sd_get_preview_callback() { + return sd_preview_cb; +} + +sd_preview_t sd_get_preview_mode() { + return sd_preview_mode; +} +int sd_get_preview_interval() { + return sd_preview_interval; +} + +sd_progress_cb_t sd_get_progress_callback() { return sd_progress_cb; } -void* sd_get_progress_callback_data(){ +void* sd_get_progress_callback_data() { return sd_progress_cb_data; } const char* sd_get_system_info() { diff --git a/util.h b/util.h index 14fa812e5..36a2e18af 100644 --- a/util.h +++ b/util.h @@ -54,6 +54,13 @@ std::string trim(const std::string& s); std::vector<std::pair<std::string, float>> parse_prompt_attention(const std::string& text); +sd_progress_cb_t sd_get_progress_callback(); +void* sd_get_progress_callback_data(); + +sd_preview_cb_t sd_get_preview_callback(); +sd_preview_t sd_get_preview_mode(); +int sd_get_preview_interval(); + #define LOG_DEBUG(format, ...) log_printf(SD_LOG_DEBUG, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_INFO(format, ...) log_printf(SD_LOG_INFO, __FILE__, __LINE__, format, ##__VA_ARGS__) #define LOG_WARN(format, ...) log_printf(SD_LOG_WARN, __FILE__, __LINE__, format, ##__VA_ARGS__)