Skip to content

Commit

Permalink
[Image generation] Fixed SD3 accuracy issues (openvinotoolkit#1131)
Browse files Browse the repository at this point in the history
- Fixed VAE part for SD3
- `scaling_factor` was applied twice: once in the pipeline itself and once as part
of the VAE decoder preprocessing.
- Fixed float / double arithmetic mismatch in
`FlowMatchEulerDiscreteScheduler`

CVS-156384
  • Loading branch information
ilya-lavrenov authored Nov 4, 2024
1 parent 6165c47 commit 0b4848a
Show file tree
Hide file tree
Showing 10 changed files with 30 additions and 41 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@ class OPENVINO_GENAI_EXPORTS AutoencoderKL {
size_t in_channels = 3;
size_t latent_channels = 4;
size_t out_channels = 3;
float scaling_factor = 0.18215f;
float shift_factor = 0.0609f;
float scaling_factor = 1.0f;
float shift_factor = 0.0f;
std::vector<size_t> block_out_channels = { 64 };

explicit Config(const std::filesystem::path& config_path);
Expand Down
6 changes: 4 additions & 2 deletions src/cpp/src/image_generation/models/autoencoder_kl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,8 +186,10 @@ void AutoencoderKL::merge_vae_image_pre_processing() const {
void AutoencoderKL::merge_vae_image_post_processing() const {
ov::preprocess::PrePostProcessor ppp(m_decoder_model);

// scale input before VAE decoder
ppp.input().preprocess().scale(m_config.scaling_factor);
// scale and shift input before VAE decoder
ppp.input().preprocess()
.scale(m_config.scaling_factor)
.mean(-m_config.shift_factor);

// apply VaeImageProcessor normalization steps
// https://github.com/huggingface/diffusers/blob/v0.30.1/src/diffusers/image_processor.py#L159
Expand Down
2 changes: 1 addition & 1 deletion src/cpp/src/image_generation/models/clip_text_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ CLIPTextModel& CLIPTextModel::compile(const std::string& device, const ov::AnyMa
}

void CLIPTextModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
if(adapters) {
if (adapters) {
m_adapter_controller.apply(m_request, *adapters);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::str
ov::Core core = utils::singleton_core();
ov::CompiledModel compiled_model;
std::optional<AdapterConfig> adapters;
if(auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) {
if (auto filtered_properties = extract_adapters_from_properties(properties, &adapters)) {
adapters->set_tensor_name_prefix(adapters->get_tensor_name_prefix().value_or("lora_te"));
m_adapter_controller = AdapterController(m_model, *adapters, device);
compiled_model = core.compile_model(m_model, device, *filtered_properties);
Expand All @@ -77,7 +77,7 @@ CLIPTextModelWithProjection& CLIPTextModelWithProjection::compile(const std::str
}

void CLIPTextModelWithProjection::set_adapters(const std::optional<AdapterConfig>& adapters) {
if(adapters) {
if (adapters) {
m_adapter_controller.apply(m_request, *adapters);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ SD3Transformer2DModel& SD3Transformer2DModel::reshape(int batch_size,
std::string input_name = input.get_any_name();
name_to_shape[input_name] = input.get_partial_shape();
if (input_name == "timestep") {
name_to_shape[input_name][0] = batch_size;
name_to_shape[input_name][0] = 1;
} else if (input_name == "hidden_states") {
name_to_shape[input_name] = {batch_size, name_to_shape[input_name][1], height, width};
} else if (input_name == "encoder_hidden_states") {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ void UNet2DConditionModel::set_hidden_states(const std::string& tensor_name, ov:
}

void UNet2DConditionModel::set_adapters(const std::optional<AdapterConfig>& adapters) {
if(adapters) {
if (adapters) {
m_adapter_controller.apply(m_request, *adapters);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,8 @@ FlowMatchEulerDiscreteScheduler::FlowMatchEulerDiscreteScheduler(const Config& s
int32_t num_train_timesteps = m_config.num_train_timesteps;
float shift = m_config.shift;

auto linspaced = linspace<float>(1.0f, static_cast<float>(num_train_timesteps), num_train_timesteps, true);
for (auto it = linspaced.rbegin(); it != linspaced.rend(); ++it) {
m_timesteps.push_back(*it);
}
m_timesteps = linspace<float>(1.0f, static_cast<float>(num_train_timesteps), num_train_timesteps, true);
std::reverse(m_timesteps.begin(), m_timesteps.end());

std::transform(m_timesteps.begin(),
m_timesteps.end(),
Expand All @@ -66,7 +64,7 @@ FlowMatchEulerDiscreteScheduler::FlowMatchEulerDiscreteScheduler(const Config& s
m_sigma_max = m_sigmas[0], m_sigma_min = m_sigmas.back();
}

float FlowMatchEulerDiscreteScheduler::sigma_to_t(float sigma) {
double FlowMatchEulerDiscreteScheduler::sigma_to_t(double sigma) {
return sigma * m_config.num_train_timesteps;
}

Expand All @@ -79,20 +77,24 @@ void FlowMatchEulerDiscreteScheduler::set_timesteps(size_t num_inference_steps,
float shift = m_config.shift;

using numpy_utils::linspace;
m_timesteps = linspace<float>(sigma_to_t(m_sigma_max), sigma_to_t(m_sigma_min), m_num_inference_steps, true);
std::vector<double> timesteps = linspace<double>(sigma_to_t(m_sigma_max), sigma_to_t(m_sigma_min), m_num_inference_steps, true);

for (const float& i : m_timesteps) {
m_sigmas.push_back(i / num_train_timesteps);
std::vector<double> sigmas(timesteps.size());
for (size_t i = 0; i < sigmas.size(); ++i) {
sigmas[i] = timesteps[i] / num_train_timesteps;
}

OPENVINO_ASSERT(!m_config.use_dynamic_shifting,
"Parameter 'use_dynamic_shifting' is not supported. Please, add support.");

for (size_t i = 0; i < m_sigmas.size(); ++i) {
m_sigmas[i] = shift * m_sigmas[i] / (1 + (shift - 1) * m_sigmas[i]);
m_sigmas.resize(sigmas.size());
m_timesteps.resize(sigmas.size());

for (size_t i = 0; i < sigmas.size(); ++i) {
m_sigmas[i] = shift * sigmas[i] / (1.0 + (shift - 1.0) * sigmas[i]);
m_timesteps[i] = m_sigmas[i] * num_train_timesteps;
}
m_sigmas.push_back(0);
m_sigmas.push_back(0.0f);

m_step_index = -1, m_begin_index = -1;
}
Expand All @@ -102,8 +104,8 @@ std::map<std::string, ov::Tensor> FlowMatchEulerDiscreteScheduler::step(ov::Tens
// latents - sample
// inference_step

float* model_output_data = noise_pred.data<float>();
float* sample_data = latents.data<float>();
const float* model_output_data = noise_pred.data<const float>();
const float* sample_data = latents.data<const float>();

if (m_step_index == -1)
init_step_index();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class FlowMatchEulerDiscreteScheduler : public IScheduler {
size_t m_num_inference_steps;

void init_step_index();
float sigma_to_t(float simga);
double sigma_to_t(double simga);
};

} // namespace genai
Expand Down
19 changes: 2 additions & 17 deletions src/cpp/src/image_generation/stable_diffusion_3_pipeline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -557,27 +557,18 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {

// 6. Denoising loop
ov::Tensor noisy_residual_tensor(ov::element::f32, {});
ov::Tensor timestep;

for (size_t inference_step = 0; inference_step < generation_config.num_inference_steps; ++inference_step) {
// concat the same latent twice along a batch dimension in case of CFG
if (batch_size_multiplier > 1) {
batch_copy(latent, latent_cfg, 0, 0, generation_config.num_images_per_prompt);
batch_copy(latent,
latent_cfg,
0,
generation_config.num_images_per_prompt,
generation_config.num_images_per_prompt);

size_t timestep_size = generation_config.num_images_per_prompt * batch_size_multiplier;
timestep = ov::Tensor(ov::element::f32, {timestep_size});
std::fill_n(timestep.data<float>(), timestep.get_size(), timesteps[inference_step]);
batch_copy(latent, latent_cfg, 0, generation_config.num_images_per_prompt, generation_config.num_images_per_prompt);
} else {
// just assign to save memory copy
latent_cfg = latent;
timestep = ov::Tensor(ov::element::f32, {1}, &timesteps[inference_step]);
}

ov::Tensor timestep(ov::element::f32, {1}, &timesteps[inference_step]);
ov::Tensor noise_pred_tensor = m_transformer->infer(latent_cfg, timestep);

ov::Shape noise_pred_shape = noise_pred_tensor.get_shape();
Expand All @@ -603,12 +594,6 @@ class StableDiffusion3Pipeline : public DiffusionPipeline {
latent = scheduler_step_result["latent"];
}

float* latent_data = latent.data<float>();
for (size_t i = 0; i < latent.get_size(); ++i) {
latent_data[i] = (latent_data[i] / m_vae->get_config().scaling_factor) +
m_vae->get_config().shift_factor;
}

return m_vae->decode(latent);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -492,7 +492,7 @@ class StableDiffusionXLPipeline : public DiffusionPipeline {

ov::AnyMap properties_for_text_encoder(ov::AnyMap properties, const std::string& tensor_name_prefix) {
std::optional<AdapterConfig> adapters;
if(update_adapters_from_properties(properties, adapters) && !adapters->get_tensor_name_prefix()) {
if (update_adapters_from_properties(properties, adapters) && !adapters->get_tensor_name_prefix()) {
adapters->set_tensor_name_prefix(tensor_name_prefix);
properties[ov::genai::adapters.name()] = *adapters;
}
Expand Down

0 comments on commit 0b4848a

Please sign in to comment.